# Source: hyf-backend/utils/util_file.py
# (270 lines, 8.8 KiB, Python; revision dated 2026-01-21 13:45:39 +08:00)
"""File utilities."""
import os
import re
import hashlib
import mimetypes
from pathlib import Path
from typing import Optional, List, Dict, Any
# TypeAlias lives in ``typing`` on newer interpreters; fall back to the
# ``typing_extensions`` backport when that import is unavailable.
try:
    from typing import TypeAlias
except ImportError:
    from typing_extensions import TypeAlias

# Shape of the metadata dictionary returned by FileUtils.get_file_info().
FileInfo: TypeAlias = Dict[str, Any]
# A caller-supplied list of file extensions such as ['.pdf', '.txt'].
ExtensionList: TypeAlias = List[str]
class FileUtils:
    """Utility class for file operations.

    This class provides static methods for common file operations like
    validation, metadata extraction, hashing, and more. It keeps no instance
    state: every public member is either a class-level constant or a
    ``@staticmethod``.
    """

    # Allowed file extensions for document upload (lower-case, leading dot).
    ALLOWED_EXTENSIONS: set[str] = {
        '.txt', '.md', '.csv',    # Text files
        '.pdf',                   # PDF files
        '.docx', '.doc',          # Word documents
        '.xlsx', '.xls',          # Excel files
        '.pptx', '.ppt',          # PowerPoint files
        '.rtf',                   # Rich text format
        '.odt', '.ods', '.odp',   # OpenDocument formats
    }

    # MIME type mappings, keyed by lower-case extension.
    MIME_TYPE_MAPPING: dict[str, str] = {
        '.txt': 'text/plain',
        '.md': 'text/markdown',
        '.csv': 'text/csv',
        '.pdf': 'application/pdf',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.doc': 'application/msword',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.xls': 'application/vnd.ms-excel',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.rtf': 'application/rtf',
        '.odt': 'application/vnd.oasis.opendocument.text',
        '.ods': 'application/vnd.oasis.opendocument.spreadsheet',
        '.odp': 'application/vnd.oasis.opendocument.presentation',
    }

    # Extension groups shared by the is_*_file() predicates and
    # get_file_category(), so the two can never drift apart.
    _TEXT_EXTENSIONS = frozenset({'.txt', '.md', '.csv', '.rtf'})
    _PDF_EXTENSIONS = frozenset({'.pdf'})
    _DOCUMENT_EXTENSIONS = frozenset({'.docx', '.doc', '.odt'})
    _SPREADSHEET_EXTENSIONS = frozenset({'.xlsx', '.xls', '.ods'})
    _PRESENTATION_EXTENSIONS = frozenset({'.pptx', '.ppt', '.odp'})
    _OFFICE_EXTENSIONS = (
        _DOCUMENT_EXTENSIONS | _SPREADSHEET_EXTENSIONS | _PRESENTATION_EXTENSIONS
    )

    # (category name, extensions) pairs, checked in order by get_file_category().
    _CATEGORY_GROUPS = (
        ('text', _TEXT_EXTENSIONS),
        ('pdf', _PDF_EXTENSIONS),
        ('document', _DOCUMENT_EXTENSIONS),
        ('spreadsheet', _SPREADSHEET_EXTENSIONS),
        ('presentation', _PRESENTATION_EXTENSIONS),
    )

    # Characters replaced by '_' in sanitize_filename(): the reserved
    # punctuation plus NUL and the other ASCII control characters.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
    # Longest filename (in characters) that sanitize_filename() will return.
    _MAX_FILENAME_LENGTH = 255
    # Read size used when streaming a file through a hash object.
    _HASH_CHUNK_SIZE = 64 * 1024
    # Units used by format_file_size(), each 1024 times the previous one.
    _SIZE_UNITS = ("B", "KB", "MB", "GB", "TB")

    @staticmethod
    def _extension(filename: str) -> str:
        """Return the lower-cased suffix of *filename* ('' when it has none)."""
        return Path(filename).suffix.lower()

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Lower-case *extension* and give it a leading dot unless it is empty."""
        cleaned = extension.strip().lower()
        if cleaned and not cleaned.startswith('.'):
            cleaned = '.' + cleaned
        return cleaned

    @staticmethod
    def sanitize_filename(filename: str) -> str:
        """Sanitize filename to remove dangerous characters.

        Reserved punctuation (``<>:"/\\|?*``) and ASCII control characters
        are replaced with ``_``, surrounding spaces and dots are stripped,
        and the result is capped at 255 characters while keeping the
        extension whenever it fits.

        Args:
            filename: The filename to sanitize.

        Returns:
            A sanitized filename that is safe to use; ``'unnamed_file'`` when
            nothing is left after sanitizing.
        """
        sanitized = FileUtils._UNSAFE_FILENAME_CHARS.sub('_', filename)
        sanitized = sanitized.strip(' .')
        if not sanitized:
            return 'unnamed_file'

        limit = FileUtils._MAX_FILENAME_LENGTH
        if len(sanitized) > limit:
            name, ext = os.path.splitext(sanitized)
            if len(ext) >= limit:
                # The "extension" alone fills the budget, so subtracting its
                # length would give a non-positive slice bound and an
                # over-long result. Fall back to a plain cut.
                sanitized = sanitized[:limit].rstrip(' .')
            else:
                sanitized = name[:limit - len(ext)] + ext
        return sanitized

    @staticmethod
    def get_file_hash(file_path: str, algorithm: str = 'md5') -> str:
        """Calculate file hash.

        The file is streamed in fixed-size chunks, so it is never loaded into
        memory in one piece.

        Args:
            file_path: The path to the file.
            algorithm: The hash algorithm name passed to ``hashlib.new``
                (default: 'md5').

        Returns:
            The hexadecimal representation of the file hash.

        Raises:
            FileNotFoundError: If the file does not exist.
            PermissionError: If the file cannot be read.
            ValueError: If the specified algorithm is not supported.
        """
        # Only the algorithm lookup is guarded by ValueError, so that a
        # ValueError raised while opening the path is not mislabelled as an
        # unsupported algorithm.
        try:
            hasher = hashlib.new(algorithm)
        except ValueError as err:
            raise ValueError(f"Unsupported hash algorithm: {algorithm}") from err

        try:
            with open(file_path, 'rb') as stream:
                while chunk := stream.read(FileUtils._HASH_CHUNK_SIZE):
                    hasher.update(chunk)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"File not found: {file_path}") from err
        except PermissionError as err:
            raise PermissionError(
                f"Permission denied when reading file: {file_path}"
            ) from err
        return hasher.hexdigest()

    @staticmethod
    def get_file_info(file_path: str) -> "FileInfo":
        """Get comprehensive file information.

        Args:
            file_path: The path to the file.

        Returns:
            A dictionary with the keys ``filename``, ``extension``
            (lower-cased), ``size_bytes``, ``size_mb`` (rounded to two
            decimals), ``mime_type`` and ``encoding`` (both as guessed by
            ``mimetypes`` from the name, possibly ``None``), ``created_at``,
            ``modified_at``, ``is_file`` and ``is_readable``.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        stat = path.stat()
        # Guessed from the file name only; the content is not inspected.
        mime_type, encoding = mimetypes.guess_type(str(path))

        return {
            'filename': path.name,
            'extension': path.suffix.lower(),
            'size_bytes': stat.st_size,
            'size_mb': round(stat.st_size / (1024 * 1024), 2),
            'mime_type': mime_type,
            'encoding': encoding,
            # NOTE: st_ctime is the creation time on Windows but the time of
            # the last metadata change on Unix.
            'created_at': stat.st_ctime,
            'modified_at': stat.st_mtime,
            'is_file': path.is_file(),
            'is_readable': os.access(file_path, os.R_OK),
        }

    @staticmethod
    def validate_file_extension(
        filename: str,
        allowed_extensions: Optional["ExtensionList"] = None,
    ) -> bool:
        """Validate file extension.

        The comparison is case-insensitive on both sides, and entries in
        *allowed_extensions* may be written with or without the leading dot.

        Args:
            filename: The filename to validate.
            allowed_extensions: List of allowed extensions
                (default: ALLOWED_EXTENSIONS).

        Returns:
            True if the file extension is allowed, False otherwise.
        """
        extension = FileUtils._extension(filename)
        if allowed_extensions is None:
            return extension in FileUtils.ALLOWED_EXTENSIONS
        # The filename suffix is lower-cased, so the caller's entries must be
        # normalized the same way or '.PDF' could never match.
        allowed = {FileUtils._normalize_extension(ext) for ext in allowed_extensions}
        return extension in allowed

    @staticmethod
    def validate_file_size(file_size: int, max_size: int) -> bool:
        """Validate file size.

        Args:
            file_size: The size to check, in the same unit as *max_size*.
            max_size: The largest size that is accepted (inclusive).

        Returns:
            True if *file_size* is between 0 and *max_size* inclusive.
        """
        return 0 <= file_size <= max_size

    @staticmethod
    def create_directory(directory_path: str) -> bool:
        """Create directory if it doesn't exist.

        Missing parent directories are created as well.

        Args:
            directory_path: The path to the directory to create.

        Returns:
            True if the directory was created or already exists, False
            otherwise (including when the path exists but is not a directory).
        """
        try:
            Path(directory_path).mkdir(parents=True, exist_ok=True)
        except (OSError, ValueError, TypeError):
            # With exist_ok=True a FileExistsError means the path is occupied
            # by something that is not a directory, so that is a failure too.
            return False
        return True

    @staticmethod
    def delete_file(file_path: str) -> bool:
        """Safely delete a file.

        Only regular files are removed; directories and missing paths are
        left alone and reported as False.

        Args:
            file_path: The path to the file to delete.

        Returns:
            True if the file was deleted, False otherwise.
        """
        try:
            path = Path(file_path)
            # is_file() is already False for a missing path.
            if not path.is_file():
                return False
            path.unlink()
        except (OSError, ValueError, TypeError):
            return False
        return True

    @staticmethod
    def get_mime_type(filename: str) -> Optional[str]:
        """Get MIME type for filename.

        Args:
            filename: The filename whose extension is looked up.

        Returns:
            The MIME type from MIME_TYPE_MAPPING, or None when the extension
            is not in the mapping.
        """
        return FileUtils.MIME_TYPE_MAPPING.get(FileUtils._extension(filename))

    @staticmethod
    def format_file_size(size_bytes: int) -> str:
        """Format file size in human readable format.

        Args:
            size_bytes: The size in bytes.

        Returns:
            ``"0 B"`` for zero, otherwise the value with one decimal and the
            largest fitting unit from B to TB (1024-based), e.g. ``"1.5 KB"``.
            Negative values keep their sign and are scaled by magnitude.
        """
        if size_bytes == 0:
            return "0 B"

        units = FileUtils._SIZE_UNITS
        sign = "-" if size_bytes < 0 else ""
        size = float(abs(size_bytes))
        index = 0
        # Stop at the last unit so very large values stay in TB.
        while size >= 1024.0 and index < len(units) - 1:
            size /= 1024.0
            index += 1
        return f"{sign}{size:.1f} {units[index]}"

    @staticmethod
    def is_text_file(filename: str) -> bool:
        """Check if file is a text file (.txt, .md, .csv or .rtf)."""
        return FileUtils._extension(filename) in FileUtils._TEXT_EXTENSIONS

    @staticmethod
    def is_pdf_file(filename: str) -> bool:
        """Check if file is a PDF."""
        return FileUtils._extension(filename) in FileUtils._PDF_EXTENSIONS

    @staticmethod
    def is_office_file(filename: str) -> bool:
        """Check if file is an Office document.

        Covers the Word, Excel, PowerPoint and OpenDocument extensions.
        """
        return FileUtils._extension(filename) in FileUtils._OFFICE_EXTENSIONS

    @staticmethod
    def get_file_category(filename: str) -> str:
        """Get file category based on extension.

        Args:
            filename: The filename to classify.

        Returns:
            One of 'text', 'pdf', 'document', 'spreadsheet', 'presentation',
            or 'unknown' when the extension belongs to none of them.
        """
        extension = FileUtils._extension(filename)
        for category, extensions in FileUtils._CATEGORY_GROUPS:
            if extension in extensions:
                return category
        return 'unknown'