hxf/backend/th_agenter/models/knowledge_base.py

91 lines
3.9 KiB
Python
Raw Normal View History

2025-12-04 14:48:38 +08:00
"""Knowledge base models."""
2025-12-16 13:55:16 +08:00
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import String, Integer, Text, Boolean, JSON
2025-12-04 14:48:38 +08:00
from ..db.base import BaseModel
class KnowledgeBase(BaseModel):
    """ORM model describing a knowledge base and its indexing settings.

    Rows are stored in the ``knowledge_bases`` table. No ORM relationship
    or foreign key to ``Document`` is declared on this class; documents
    refer back to a knowledge base through a plain integer column.
    """

    __tablename__ = "knowledge_bases"

    # Descriptive fields. ``name`` is indexed but deliberately not unique.
    name: Mapped[str] = mapped_column(
        String(100), unique=False, index=True, nullable=False
    )
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Embedding / chunking configuration.
    # NOTE(review): the unit of chunk_size / chunk_overlap (characters vs.
    # tokens) is not visible from this file -- confirm against the chunker.
    embedding_model: Mapped[str] = mapped_column(
        String(100),
        nullable=False,
        default="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunk_size: Mapped[int] = mapped_column(Integer, nullable=False, default=1000)
    chunk_overlap: Mapped[int] = mapped_column(Integer, nullable=False, default=200)
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    # Vector database settings
    vector_db_type: Mapped[str] = mapped_column(
        String(50), nullable=False, default="chroma"
    )
    # For vector DB collection
    collection_name: Mapped[str | None] = mapped_column(String(100), nullable=True)

    # Relationships removed to eliminate foreign key constraints

    def __repr__(self) -> str:
        """Return a short debug representation containing the id and name."""
        return "<KnowledgeBase(id={}, name='{}')>".format(self.id, self.name)

    # NOTE: the properties below relied on a ``documents`` relationship that
    # was removed together with the foreign key constraints. They are kept
    # disabled here and must be rewritten before being re-enabled.
    #
    # @property
    # def document_count(self):
    #     """Get the number of documents in this knowledge base."""
    #     return len(self.documents)
    #
    # @property
    # def active_document_count(self):
    #     """Get the number of active documents in this knowledge base."""
    #     return len([doc for doc in self.documents if doc.is_processed])
2025-12-04 14:48:38 +08:00
class Document(BaseModel):
    """ORM model for a file uploaded into a knowledge base.

    Rows are stored in the ``documents`` table. The owning knowledge base
    is referenced by ``knowledge_base_id``, a plain integer column with no
    foreign key constraint.
    """

    __tablename__ = "documents"

    # Removed ForeignKey("knowledge_bases.id")
    knowledge_base_id: Mapped[int] = mapped_column(Integer, nullable=False)

    # File identity and location.
    filename: Mapped[str] = mapped_column(String(255), nullable=False)
    original_filename: Mapped[str] = mapped_column(String(255), nullable=False)
    file_path: Mapped[str] = mapped_column(String(500), nullable=False)
    # in bytes
    file_size: Mapped[int] = mapped_column(Integer, nullable=False)
    # .pdf, .txt, .docx, etc.
    file_type: Mapped[str] = mapped_column(String(50), nullable=False)
    mime_type: Mapped[str | None] = mapped_column(String(100), nullable=True)

    # Processing status
    is_processed: Mapped[bool] = mapped_column(
        Boolean, default=False, nullable=False
    )
    processing_error: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Content and metadata
    # Extracted text content
    content: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Additional metadata
    doc_metadata: Mapped[dict | None] = mapped_column(JSON, nullable=True)

    # Chunking information
    chunk_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    # Embedding information
    embedding_model: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Store vector database IDs for chunks
    vector_ids: Mapped[list | None] = mapped_column(JSON, nullable=True)

    # Relationships removed to eliminate foreign key constraints

    def __repr__(self) -> str:
        """Return a short debug representation with id, filename and KB id."""
        return "<Document(id={}, filename='{}', kb_id={})>".format(
            self.id, self.filename, self.knowledge_base_id
        )

    @property
    def file_size_mb(self) -> float:
        """File size in MB (1 MB = 1024 * 1024 bytes), rounded to 2 decimals."""
        bytes_per_mb = 1024 * 1024
        return round(self.file_size / bytes_per_mb, 2)

    @property
    def is_text_file(self) -> bool:
        """Whether ``file_type`` is a plain-text extension (case-insensitive)."""
        extension = self.file_type.lower()
        return extension in ('.txt', '.md', '.csv')

    @property
    def is_pdf_file(self) -> bool:
        """Whether ``file_type`` is the PDF extension (case-insensitive)."""
        extension = self.file_type.lower()
        return extension == '.pdf'

    @property
    def is_office_file(self) -> bool:
        """Whether ``file_type`` is an Office extension (case-insensitive)."""
        extension = self.file_type.lower()
        return extension in ('.docx', '.xlsx', '.pptx')