"""
Metadata models for RAG optimization.

Implements structured metadata for knowledge chunks.
Reference: rag-optimization/spec.md Section 3.2
"""

from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel


class RetrievalStrategy(str, Enum):
    """Retrieval strategy options."""

    VECTOR_ONLY = "vector"
    BM25_ONLY = "bm25"
    HYBRID = "hybrid"
    TWO_STAGE = "two_stage"


class ChunkMetadataModel(BaseModel):
    """Pydantic model for API serialization.

    Carries the same fields as ``ChunkMetadata``, except that the validity
    dates are plain strings (or ``None``) instead of ``date`` objects.
    """

    category: str = ""
    subcategory: str = ""
    # pydantic copies mutable field defaults per instance, so these list
    # literals are not shared between model instances.
    target_audience: list[str] = []
    source_doc: str = ""
    source_url: str = ""
    department: str = ""
    valid_from: str | None = None
    valid_until: str | None = None
    priority: int = 5
    keywords: list[str] = []


def _parse_iso_date(value: Any) -> date | None:
    """Coerce a stored validity value to a ``date``.

    Args:
        value: ``None``/empty, an ISO-8601 date string, or an already
            parsed ``date``/``datetime``.

    Returns:
        ``None`` for falsy input, otherwise the corresponding ``date``.

    Raises:
        ValueError: If a string is not a valid ISO date.
        TypeError: If the value is of an unsupported type.
    """
    if not value:
        return None
    # datetime is a subclass of date, so it must be checked first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    return date.fromisoformat(value)


@dataclass
class ChunkMetadata:
    """
    Metadata for knowledge chunks.

    Reference: rag-optimization/spec.md Section 3.2.2
    """

    category: str = ""
    subcategory: str = ""
    target_audience: list[str] = field(default_factory=list)
    source_doc: str = ""
    source_url: str = ""
    department: str = ""
    valid_from: date | None = None
    valid_until: date | None = None
    priority: int = 5
    keywords: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for storage.

        Dates are serialized with ``date.isoformat()``; unset dates are
        stored as ``None``.
        """
        return {
            "category": self.category,
            "subcategory": self.subcategory,
            "target_audience": self.target_audience,
            "source_doc": self.source_doc,
            "source_url": self.source_url,
            "department": self.department,
            "valid_from": self.valid_from.isoformat() if self.valid_from else None,
            "valid_until": self.valid_until.isoformat() if self.valid_until else None,
            "priority": self.priority,
            "keywords": self.keywords,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ChunkMetadata":
        """Create from dictionary.

        Missing keys fall back to the field defaults. List values are copied
        so the new instance never shares list objects with ``data``, and an
        explicit ``None`` for a list field is treated as an empty list.

        Args:
            data: Mapping in the shape produced by ``to_dict``.

        Returns:
            A new ``ChunkMetadata`` instance.

        Raises:
            ValueError: If ``valid_from``/``valid_until`` is a string that is
                not a valid ISO date.
        """
        return cls(
            category=data.get("category", ""),
            subcategory=data.get("subcategory", ""),
            target_audience=list(data.get("target_audience") or []),
            source_doc=data.get("source_doc", ""),
            source_url=data.get("source_url", ""),
            department=data.get("department", ""),
            valid_from=_parse_iso_date(data.get("valid_from")),
            valid_until=_parse_iso_date(data.get("valid_until")),
            priority=data.get("priority", 5),
            keywords=list(data.get("keywords") or []),
        )


@dataclass
class MetadataFilter:
    """
    Filter conditions for metadata-based retrieval.

    Reference: rag-optimization/spec.md Section 4.1
    """

    categories: list[str] | None = None
    target_audiences: list[str] | None = None
    departments: list[str] | None = None
    valid_only: bool = True
    min_priority: int | None = None
    # NOTE(review): `keywords` is declared but not consulted by
    # to_qdrant_filter(); confirm whether it is applied elsewhere.
    keywords: list[str] | None = None

    def to_qdrant_filter(self) -> dict[str, Any] | None:
        """Convert to Qdrant filter format.

        Returns:
            ``{"must": [...]}`` with one condition per active criterion, or
            ``None`` when no criterion is active. Because ``valid_only``
            defaults to ``True``, a default-constructed filter still yields
            the validity condition.
        """
        conditions: list[dict[str, Any]] = []

        if self.categories:
            conditions.append({
                "key": "metadata.category",
                "match": {"any": self.categories},
            })

        if self.departments:
            conditions.append({
                "key": "metadata.department",
                "match": {"any": self.departments},
            })

        if self.target_audiences:
            conditions.append({
                "key": "metadata.target_audience",
                "match": {"any": self.target_audiences},
            })

        if self.valid_only:
            # A chunk counts as valid when it has no expiry date or the
            # expiry date is today or later. Only valid_until is checked;
            # valid_from is not part of this condition.
            # NOTE(review): Qdrant documents a dedicated `is_null` condition
            # for null payload values; confirm the consumer of this dict
            # accepts `match: {"value": None}` before relying on it.
            today = date.today().isoformat()
            conditions.append({
                "should": [
                    {"key": "metadata.valid_until", "match": {"value": None}},
                    {"key": "metadata.valid_until", "range": {"gte": today}},
                ]
            })

        if self.min_priority is not None:
            # NOTE(review): the field is named min_priority but the emitted
            # bound is `lte`; presumably lower numbers mean higher priority
            # -- confirm against the spec.
            conditions.append({
                "key": "metadata.priority",
                "range": {"lte": self.min_priority},
            })

        if not conditions:
            return None

        return {"must": conditions}


@dataclass
class KnowledgeChunk:
    """
    Knowledge chunk with multi-dimensional embeddings.

    Reference: rag-optimization/spec.md Section 3.2.1
    """

    chunk_id: str
    document_id: str
    content: str
    embedding_full: list[float] = field(default_factory=list)
    embedding_256: list[float] = field(default_factory=list)
    embedding_512: list[float] = field(default_factory=list)
    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)
    # NOTE(review): datetime.utcnow() returns a naive datetime and is
    # deprecated since Python 3.12. It is kept here so the serialized
    # timestamp format in stored payloads does not change; consider
    # migrating to timezone-aware datetimes together with the consumers.
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)

    def to_qdrant_point(self, point_id: int | str) -> dict[str, Any]:
        """Convert to Qdrant point format.

        Args:
            point_id: Identifier to store under the point's ``id`` key.

        Returns:
            A dict with ``id``, the three named vectors (``full``,
            ``dim_256``, ``dim_512``) and a ``payload`` holding the chunk
            text, ids, serialized metadata and ISO-formatted timestamps.
        """
        return {
            "id": point_id,
            "vector": {
                "full": self.embedding_full,
                "dim_256": self.embedding_256,
                "dim_512": self.embedding_512,
            },
            "payload": {
                "chunk_id": self.chunk_id,
                "document_id": self.document_id,
                "text": self.content,
                "metadata": self.metadata.to_dict(),
                "created_at": self.created_at.isoformat(),
                "updated_at": self.updated_at.isoformat(),
            },
        }


@dataclass
class RetrieveRequest:
    """
    Request for knowledge retrieval.

    Reference: rag-optimization/spec.md Section 4.1
    """

    query: str
    query_with_prefix: str = ""
    top_k: int = 10
    filters: MetadataFilter | None = None
    strategy: RetrievalStrategy = RetrievalStrategy.HYBRID

    def __post_init__(self) -> None:
        """Derive ``query_with_prefix`` from ``query`` when it was not given."""
        if not self.query_with_prefix:
            self.query_with_prefix = f"search_query:{self.query}"


@dataclass
class RetrieveResult:
    """
    Result from knowledge retrieval.

    Reference: rag-optimization/spec.md Section 4.1
    """

    chunk_id: str
    content: str
    score: float
    vector_score: float = 0.0
    bm25_score: float = 0.0
    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)
    rank: int = 0