93 lines
3.9 KiB
Python
93 lines
3.9 KiB
Python
|
|
"""Knowledge base models."""
|
||
|
|
|
||
|
|
from typing import Optional
|
||
|
|
from sqlalchemy.orm import Mapped, mapped_column
|
||
|
|
from sqlalchemy import String, Integer, Text, Boolean, JSON
|
||
|
|
|
||
|
|
from ..db.base import BaseModel
|
||
|
|
|
||
|
|
class KnowledgeBase(BaseModel):
    """ORM model for one knowledge base and its indexing configuration.

    Rows are stored in the ``knowledge_bases`` table. Besides the
    descriptive fields, each row records the embedding model, the
    chunking parameters and the vector-database settings.

    No SQLAlchemy relationships are declared on this model; they were
    removed to eliminate foreign key constraints.
    """

    __tablename__ = "knowledge_bases"

    # Descriptive fields. ``name`` is indexed but not unique.
    name: Mapped[str] = mapped_column(
        String(100),
        unique=False,
        index=True,
        nullable=False,
    )
    description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    # Embedding / chunking configuration with client-side defaults.
    embedding_model: Mapped[str] = mapped_column(
        String(100),
        nullable=False,
        default="sentence-transformers/all-MiniLM-L6-v2",
    )
    chunk_size: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=1000,
    )
    chunk_overlap: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=200,
    )
    is_active: Mapped[bool] = mapped_column(
        Boolean,
        default=True,
        nullable=False,
    )

    # Vector database settings.
    vector_db_type: Mapped[str] = mapped_column(
        String(50),
        nullable=False,
        default="chroma",
    )
    # Name of the collection inside the vector DB; optional.
    collection_name: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )

    def __repr__(self):
        """Return a short debug representation showing id and name."""
        kb_id, kb_name = self.id, self.name
        return f"<KnowledgeBase(id={kb_id}, name='{kb_name}')>"

    # NOTE: the two properties below relied on a ``documents``
    # relationship, which was removed together with the foreign key
    # constraints. They stay disabled until they are rewritten to work
    # without that relationship.
    #
    # @property
    # def document_count(self):
    #     """Get the number of documents in this knowledge base."""
    #     return len(self.documents)
    #
    # @property
    # def active_document_count(self):
    #     """Get the number of active documents in this knowledge base."""
    #     return len([doc for doc in self.documents if doc.is_processed])
|
||
|
|
|
||
|
|
|
||
|
|
class Document(BaseModel):
    """ORM model for a file stored in a knowledge base.

    Rows are stored in the ``documents`` table. A row records where the
    file lives, its processing status, the extracted text, and the
    chunking / embedding bookkeeping.

    No SQLAlchemy relationships are declared on this model; they were
    removed to eliminate foreign key constraints.
    """

    __tablename__ = "documents"

    # Id of the owning knowledge base. Plain integer column: the
    # ForeignKey("knowledge_bases.id") constraint was removed.
    knowledge_base_id: Mapped[int] = mapped_column(Integer, nullable=False)

    # File identity and location.
    # NOTE(review): presumably ``filename`` is the stored name and
    # ``original_filename`` the name as uploaded - confirm with callers.
    filename: Mapped[str] = mapped_column(String(255), nullable=False)
    original_filename: Mapped[str] = mapped_column(
        String(255),
        nullable=False,
    )
    file_path: Mapped[str] = mapped_column(String(500), nullable=False)
    # Size in bytes.
    file_size: Mapped[int] = mapped_column(Integer, nullable=False)
    # File extension including the leading dot: .pdf, .txt, .docx, etc.
    file_type: Mapped[str] = mapped_column(String(50), nullable=False)
    mime_type: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )

    # Processing status.
    is_processed: Mapped[bool] = mapped_column(
        Boolean,
        default=False,
        nullable=False,
    )
    processing_error: Mapped[Optional[str]] = mapped_column(
        Text,
        nullable=True,
    )

    # Content and metadata.
    # Extracted text content.
    content: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    # Additional metadata, stored as JSON.
    doc_metadata: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True)

    # Chunking information.
    chunk_count: Mapped[int] = mapped_column(
        Integer,
        default=0,
        nullable=False,
    )

    # Embedding information.
    embedding_model: Mapped[Optional[str]] = mapped_column(
        String(100),
        nullable=True,
    )
    # Vector database IDs for this document's chunks, stored as JSON.
    vector_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)

    def __repr__(self):
        """Return a short debug representation of the document."""
        doc_id, stored_name, kb_id = (
            self.id,
            self.filename,
            self.knowledge_base_id,
        )
        return f"<Document(id={doc_id}, filename='{stored_name}', kb_id={kb_id})>"

    @property
    def file_size_mb(self):
        """Return ``file_size`` divided by 1024 * 1024, rounded to 2 decimals."""
        bytes_per_mb = 1024 * 1024
        size_in_mb = self.file_size / bytes_per_mb
        return round(size_in_mb, 2)

    @property
    def is_text_file(self):
        """Return True when ``file_type`` is .txt, .md or .csv (any case)."""
        extension = self.file_type.lower()
        return extension in ('.txt', '.md', '.csv')

    @property
    def is_pdf_file(self):
        """Return True when ``file_type`` is .pdf (any case)."""
        extension = self.file_type.lower()
        return extension == '.pdf'

    @property
    def is_office_file(self):
        """Return True when ``file_type`` is .docx, .xlsx or .pptx (any case)."""
        extension = self.file_type.lower()
        return extension in ('.docx', '.xlsx', '.pptx')
|