"""File utilities.""" import os import re import hashlib import mimetypes from pathlib import Path from typing import Optional, List, Dict, Any try: from typing import TypeAlias except ImportError: from typing_extensions import TypeAlias FileInfo: TypeAlias = Dict[str, Any] ExtensionList: TypeAlias = List[str] class FileUtils: """Utility class for file operations. This class provides static methods for common file operations like validation, metadata extraction, hashing, and more. """ # Allowed file extensions for document upload ALLOWED_EXTENSIONS: set[str] = { '.txt', '.md', '.csv', # Text files '.pdf', # PDF files '.docx', '.doc', # Word documents '.xlsx', '.xls', # Excel files '.pptx', '.ppt', # PowerPoint files '.rtf', # Rich text format '.odt', '.ods', '.odp' # OpenDocument formats } # MIME type mappings MIME_TYPE_MAPPING: dict[str, str] = { '.txt': 'text/plain', '.md': 'text/markdown', '.csv': 'text/csv', '.pdf': 'application/pdf', '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', '.doc': 'application/msword', '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', '.xls': 'application/vnd.ms-excel', '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', '.ppt': 'application/vnd.ms-powerpoint', '.rtf': 'application/rtf', '.odt': 'application/vnd.oasis.opendocument.text', '.ods': 'application/vnd.oasis.opendocument.spreadsheet', '.odp': 'application/vnd.oasis.opendocument.presentation' } @staticmethod def sanitize_filename(filename: str) -> str: """Sanitize filename to remove dangerous characters. Args: filename: The filename to sanitize. Returns: A sanitized filename that is safe to use. """ # Remove or replace dangerous characters sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename) # Remove leading/trailing spaces and dots sanitized = sanitized.strip(' .') # Ensure filename is not empty if not sanitized: sanitized = 'unnamed_file' # Limit filename length if len(sanitized) > 255: name, ext = os.path.splitext(sanitized) sanitized = name[:255 - len(ext)] + ext return sanitized @staticmethod def get_file_hash(file_path: str, algorithm: str = 'md5') -> str: """Calculate file hash. Args: file_path: The path to the file. algorithm: The hash algorithm to use (default: 'md5'). Returns: The hexadecimal representation of the file hash. Raises: FileNotFoundError: If the file does not exist. PermissionError: If the file cannot be read. ValueError: If the specified algorithm is not supported. """ try: hash_func = hashlib.new(algorithm) with open(file_path, 'rb') as f: for chunk in iter(lambda: f.read(4096), b""): hash_func.update(chunk) return hash_func.hexdigest() except FileNotFoundError: raise FileNotFoundError(f"File not found: {file_path}") except PermissionError: raise PermissionError(f"Permission denied when reading file: {file_path}") except ValueError: raise ValueError(f"Unsupported hash algorithm: {algorithm}") @staticmethod def get_file_info(file_path: str) -> FileInfo: """Get comprehensive file information. Args: file_path: The path to the file. Returns: A dictionary containing detailed file information. Raises: FileNotFoundError: If the file does not exist. """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"File not found: {file_path}") stat = path.stat() # Get MIME type mime_type, encoding = mimetypes.guess_type(str(path)) return { 'filename': path.name, 'extension': path.suffix.lower(), 'size_bytes': stat.st_size, 'size_mb': round(stat.st_size / (1024 * 1024), 2), 'mime_type': mime_type, 'encoding': encoding, 'created_at': stat.st_ctime, 'modified_at': stat.st_mtime, 'is_file': path.is_file(), 'is_readable': os.access(file_path, os.R_OK) } @staticmethod def validate_file_extension(filename: str, allowed_extensions: Optional[ExtensionList] = None) -> bool: """Validate file extension. Args: filename: The filename to validate. allowed_extensions: List of allowed extensions (default: ALLOWED_EXTENSIONS). Returns: True if the file extension is allowed, False otherwise. """ if allowed_extensions is None: allowed_extensions = list(FileUtils.ALLOWED_EXTENSIONS) extension = Path(filename).suffix.lower() return extension in allowed_extensions @staticmethod def validate_file_size(file_size: int, max_size: int) -> bool: """Validate file size.""" return file_size <= max_size @staticmethod def create_directory(directory_path: str) -> bool: """Create directory if it doesn't exist. Args: directory_path: The path to the directory to create. Returns: True if the directory was created or already exists, False otherwise. """ try: Path(directory_path).mkdir(parents=True, exist_ok=True) return True except PermissionError: return False except FileExistsError: return True # Directory already exists except Exception: return False @staticmethod def delete_file(file_path: str) -> bool: """Safely delete a file. Args: file_path: The path to the file to delete. Returns: True if the file was deleted, False otherwise. """ try: path = Path(file_path) if path.exists() and path.is_file(): path.unlink() return True return False except PermissionError: return False except FileNotFoundError: return False # File doesn't exist except Exception: return False @staticmethod def get_mime_type(filename: str) -> Optional[str]: """Get MIME type for filename.""" extension = Path(filename).suffix.lower() return FileUtils.MIME_TYPE_MAPPING.get(extension) @staticmethod def format_file_size(size_bytes: int) -> str: """Format file size in human readable format.""" if size_bytes == 0: return "0 B" size_names = ["B", "KB", "MB", "GB", "TB"] i = 0 size = float(size_bytes) while size >= 1024.0 and i < len(size_names) - 1: size /= 1024.0 i += 1 return f"{size:.1f} {size_names[i]}" @staticmethod def is_text_file(filename: str) -> bool: """Check if file is a text file.""" extension = Path(filename).suffix.lower() return extension in {'.txt', '.md', '.csv', '.rtf'} @staticmethod def is_pdf_file(filename: str) -> bool: """Check if file is a PDF.""" extension = Path(filename).suffix.lower() return extension == '.pdf' @staticmethod def is_office_file(filename: str) -> bool: """Check if file is an Office document.""" extension = Path(filename).suffix.lower() return extension in {'.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt', '.odt', '.ods', '.odp'} @staticmethod def get_file_category(filename: str) -> str: """Get file category based on extension.""" extension = Path(filename).suffix.lower() if extension in {'.txt', '.md', '.csv', '.rtf'}: return 'text' elif extension == '.pdf': return 'pdf' elif extension in {'.docx', '.doc', '.odt'}: return 'document' elif extension in {'.xlsx', '.xls', '.ods'}: return 'spreadsheet' elif extension in {'.pptx', '.ppt', '.odp'}: return 'presentation' else: return 'unknown'