From c1d76093aa6d62d4a4d7a72740a219b9e760db1d Mon Sep 17 00:00:00 2001 From: MerCry Date: Tue, 24 Feb 2026 23:08:08 +0800 Subject: [PATCH] =?UTF-8?q?feat(AISVC-T7):=20=E5=B5=8C=E5=85=A5=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=8F=AF=E6=8F=92=E6=8B=94=E8=AE=BE=E8=AE=A1=E4=B8=8E?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E6=94=AF=E6=8C=81=20[AC-AI?= =?UTF-8?q?SVC-29,=20AC-AISVC-30,=20AC-AISVC-31,=20AC-AISVC-32,=20AC-AISVC?= =?UTF-8?q?-33,=20AC-AISVC-34,=20AC-AISVC-35,=20AC-AISVC-36,=20AC-AISVC-37?= =?UTF-8?q?,=20AC-AISVC-38,=20AC-AISVC-39,=20AC-AISVC-40,=20AC-AISVC-41]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 EmbeddingProvider 抽象基类和工厂模式 [AC-AISVC-29, AC-AISVC-30] - 实现 OllamaEmbeddingProvider 和 OpenAIEmbeddingProvider [AC-AISVC-29, AC-AISVC-30] - 新增 EmbeddingConfigManager 支持配置热更新 [AC-AISVC-31, AC-AISVC-32] - 新增 DocumentParser 抽象接口和工厂类 [AC-AISVC-33] - 实现 PDF/Word/Excel/Text 文档解析器 [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] - 新增嵌入管理 API 端点 [AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41] - 更新文档上传流程支持多格式文档解析 [AC-AISVC-36, AC-AISVC-37] - 更新 OpenAPI 契约添加嵌入管理接口 - 添加数据库初始化脚本 - 更新规范文档标记 Phase 7 完成 --- .gitignore | 6 +- ai-service/README.md | 52 ++- ai-service/app/api/admin/__init__.py | 3 +- ai-service/app/api/admin/embedding.py | 132 ++++++++ ai-service/app/api/admin/kb.py | 78 ++++- ai-service/app/main.py | 3 +- ai-service/app/services/document/__init__.py | 36 +++ ai-service/app/services/document/base.py | 106 ++++++ .../app/services/document/excel_parser.py | 239 ++++++++++++++ ai-service/app/services/document/factory.py | 215 +++++++++++++ .../app/services/document/pdf_parser.py | 221 +++++++++++++ .../app/services/document/text_parser.py | 101 ++++++ .../app/services/document/word_parser.py | 145 +++++++++ ai-service/app/services/embedding/__init__.py | 32 ++ ai-service/app/services/embedding/base.py | 130 ++++++++ ai-service/app/services/embedding/factory.py | 301 ++++++++++++++++++ .../app/services/embedding/ollama_provider.py | 157 +++++++++ .../app/services/embedding/openai_provider.py | 193 +++++++++++ .../services/retrieval/vector_retriever.py | 8 +- ai-service/scripts/init_db.py | 178 +++++++++++ ai-service/scripts/init_db.sql | 107 +++++++ spec/ai-service/openapi.provider.yaml | 294 ++++++++++++++++- spec/ai-service/progress.md | 157 ++++++++- spec/ai-service/requirements.md | 56 +++- spec/ai-service/tasks.md | 32 +- 25 files changed, 2959 insertions(+), 23 deletions(-) create mode 100644 ai-service/app/api/admin/embedding.py create mode 100644 ai-service/app/services/document/__init__.py create mode 100644 ai-service/app/services/document/base.py create mode 100644 ai-service/app/services/document/excel_parser.py create mode 100644 ai-service/app/services/document/factory.py create mode 100644 ai-service/app/services/document/pdf_parser.py create mode 100644 ai-service/app/services/document/text_parser.py create mode 100644 ai-service/app/services/document/word_parser.py create mode 100644 ai-service/app/services/embedding/__init__.py create mode 100644 ai-service/app/services/embedding/base.py create mode 100644 ai-service/app/services/embedding/factory.py create mode 100644 ai-service/app/services/embedding/ollama_provider.py create mode 100644 ai-service/app/services/embedding/openai_provider.py create mode 100644 ai-service/scripts/init_db.py create mode 100644 ai-service/scripts/init_db.sql diff --git a/.gitignore b/.gitignore index 5d381cc..232ea0c 100644 --- a/.gitignore +++ b/.gitignore @@ -158,5 +158,9 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ + +# Project specific +ai-service/uploads/ +*.local diff --git a/ai-service/README.md b/ai-service/README.md index 2234221..ac3f951 100644 --- a/ai-service/README.md +++ b/ai-service/README.md @@ -8,19 +8,67 @@ Python AI Service for intelligent chat with RAG support. - SSE streaming support via Accept: text/event-stream - RAG-powered responses with confidence scoring +## Prerequisites + +- PostgreSQL 12+ +- Qdrant vector database +- Python 3.10+ + ## Installation ```bash pip install -e ".[dev]" ``` +## Database Initialization + +### Option 1: Using Python script (Recommended) + +```bash +# Create database and tables +python scripts/init_db.py --create-db + +# Or just create tables (database must exist) +python scripts/init_db.py +``` + +### Option 2: Using SQL script + +```bash +# Connect to PostgreSQL and run +psql -U postgres -f scripts/init_db.sql +``` + +## Configuration + +Create a `.env` file in the project root: + +```env +AI_SERVICE_DATABASE_URL=postgresql+asyncpg://postgres:password@localhost:5432/ai_service +AI_SERVICE_QDRANT_URL=http://localhost:6333 +AI_SERVICE_LLM_API_KEY=your-api-key +AI_SERVICE_LLM_BASE_URL=https://api.openai.com/v1 +AI_SERVICE_LLM_MODEL=gpt-4o-mini +AI_SERVICE_DEBUG=true +``` + ## Running ```bash -uvicorn app.main:app --host 0.0.0.0 --port 8080 +uvicorn app.main:app --host 0.0.0.0 --port 8000 ``` ## API Endpoints -- `POST /ai/chat` - Generate AI reply +### Chat API +- `POST /ai/chat` - Generate AI reply (supports SSE streaming) - `GET /ai/health` - Health check + +### Admin API +- `GET /admin/kb/documents` - List documents +- `POST /admin/kb/documents` - Upload document +- `GET /admin/kb/index/jobs/{jobId}` - Get indexing job status +- `DELETE /admin/kb/documents/{docId}` - Delete document +- `POST /admin/rag/experiments/run` - Run RAG experiment +- `GET /admin/sessions` - List chat sessions +- `GET /admin/sessions/{sessionId}` - Get session details diff --git a/ai-service/app/api/admin/__init__.py b/ai-service/app/api/admin/__init__.py index 2639d0c..0ec350d 100644 --- a/ai-service/app/api/admin/__init__.py +++ b/ai-service/app/api/admin/__init__.py @@ -4,8 +4,9 @@ Admin API routes for AI Service management. """ from app.api.admin.dashboard import router as dashboard_router +from app.api.admin.embedding import router as embedding_router from app.api.admin.kb import router as kb_router from app.api.admin.rag import router as rag_router from app.api.admin.sessions import router as sessions_router -__all__ = ["dashboard_router", "kb_router", "rag_router", "sessions_router"] +__all__ = ["dashboard_router", "embedding_router", "kb_router", "rag_router", "sessions_router"] diff --git a/ai-service/app/api/admin/embedding.py b/ai-service/app/api/admin/embedding.py new file mode 100644 index 0000000..206b27d --- /dev/null +++ b/ai-service/app/api/admin/embedding.py @@ -0,0 +1,132 @@ +""" +Embedding management API endpoints. +[AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41] Embedding model management. +""" + +import logging +from typing import Any + +from fastapi import APIRouter, Depends, Header, HTTPException + +from app.core.exceptions import InvalidRequestException +from app.services.embedding import ( + EmbeddingException, + EmbeddingProviderFactory, + get_embedding_config_manager, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/embedding", tags=["Embedding Management"]) + + +def get_tenant_id(x_tenant_id: str = Header(..., alias="X-Tenant-Id")) -> str: + """Extract tenant ID from header.""" + if not x_tenant_id: + raise HTTPException(status_code=400, detail="X-Tenant-Id header is required") + return x_tenant_id + + +@router.get("/providers") +async def list_embedding_providers( + tenant_id: str = Depends(get_tenant_id), +) -> dict[str, Any]: + """ + Get available embedding providers. + [AC-AISVC-38] Returns all registered providers with config schemas. + """ + providers = [] + for name in EmbeddingProviderFactory.get_available_providers(): + info = EmbeddingProviderFactory.get_provider_info(name) + providers.append(info) + + return {"providers": providers} + + +@router.get("/config") +async def get_embedding_config( + tenant_id: str = Depends(get_tenant_id), +) -> dict[str, Any]: + """ + Get current embedding configuration. + [AC-AISVC-39] Returns current provider and config. + """ + manager = get_embedding_config_manager() + return manager.get_full_config() + + +@router.put("/config") +async def update_embedding_config( + request: dict[str, Any], + tenant_id: str = Depends(get_tenant_id), +) -> dict[str, Any]: + """ + Update embedding configuration. + [AC-AISVC-40, AC-AISVC-31] Updates config with hot-reload support. + """ + provider = request.get("provider") + config = request.get("config", {}) + + if not provider: + raise InvalidRequestException("provider is required") + + if provider not in EmbeddingProviderFactory.get_available_providers(): + raise InvalidRequestException( + f"Unknown provider: {provider}. " + f"Available: {EmbeddingProviderFactory.get_available_providers()}" + ) + + manager = get_embedding_config_manager() + + try: + await manager.update_config(provider, config) + return { + "success": True, + "message": f"Configuration updated to use {provider}", + } + except EmbeddingException as e: + raise InvalidRequestException(str(e)) + + +@router.post("/test") +async def test_embedding( + request: dict[str, Any] | None = None, + tenant_id: str = Depends(get_tenant_id), +) -> dict[str, Any]: + """ + Test embedding connection. + [AC-AISVC-41] Tests provider connectivity and returns dimension info. + """ + request = request or {} + test_text = request.get("test_text", "这是一个测试文本") + config = request.get("config") + provider = request.get("provider") + + manager = get_embedding_config_manager() + + result = await manager.test_connection( + test_text=test_text, + provider=provider, + config=config, + ) + + return result + + +@router.get("/formats") +async def get_supported_document_formats( + tenant_id: str = Depends(get_tenant_id), +) -> dict[str, Any]: + """ + Get supported document formats for embedding. + Returns list of supported file extensions. + """ + from app.services.document import get_supported_document_formats, DocumentParserFactory + + formats = get_supported_document_formats() + parser_info = DocumentParserFactory.get_parser_info() + + return { + "formats": formats, + "parsers": parser_info, + } diff --git a/ai-service/app/api/admin/kb.py b/ai-service/app/api/admin/kb.py index 84b0c24..680d097 100644 --- a/ai-service/app/api/admin/kb.py +++ b/ai-service/app/api/admin/kb.py @@ -161,6 +161,7 @@ async def list_documents( description="[AC-ASA-01] Upload document and trigger indexing job.", responses={ 202: {"description": "Accepted - async indexing job started"}, + 400: {"description": "Bad Request - unsupported format"}, 401: {"description": "Unauthorized", "model": ErrorResponse}, 403: {"description": "Forbidden", "model": ErrorResponse}, }, @@ -174,12 +175,31 @@ async def upload_document( ) -> JSONResponse: """ [AC-ASA-01] Upload document and create indexing job. + [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35, AC-AISVC-37] Support multiple document formats. """ + from app.services.document import get_supported_document_formats, UnsupportedFormatError + from pathlib import Path + logger.info( f"[AC-ASA-01] Uploading document: tenant={tenant_id}, " f"kb_id={kb_id}, filename={file.filename}" ) + file_ext = Path(file.filename or "").suffix.lower() + supported_formats = get_supported_document_formats() + + if file_ext and file_ext not in supported_formats: + return JSONResponse( + status_code=400, + content={ + "code": "UNSUPPORTED_FORMAT", + "message": f"Unsupported file format: {file_ext}", + "details": { + "supported_formats": supported_formats, + }, + }, + ) + kb_service = KBService(session) kb = await kb_service.get_or_create_kb(tenant_id, kb_id) @@ -196,7 +216,7 @@ async def upload_document( await session.commit() background_tasks.add_task( - _index_document, tenant_id, str(job.id), str(document.id), file_content + _index_document, tenant_id, str(job.id), str(document.id), file_content, file.filename ) return JSONResponse( @@ -209,17 +229,20 @@ async def upload_document( ) -async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: bytes): +async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: bytes, filename: str | None = None): """ Background indexing task. - Uses Ollama nomic-embed-text for real embeddings. + [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Uses document parsing and pluggable embedding. """ from app.core.database import async_session_maker from app.services.kb import KBService from app.core.qdrant_client import get_qdrant_client - from app.services.embedding.ollama_embedding import get_embedding + from app.services.embedding import get_embedding_provider + from app.services.document import parse_document, UnsupportedFormatError, DocumentParseException from qdrant_client.models import PointStruct import asyncio + import tempfile + from pathlib import Path await asyncio.sleep(1) @@ -231,16 +254,52 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt ) await session.commit() - text = content.decode("utf-8", errors="ignore") + text = None + file_ext = Path(filename or "").suffix.lower() + + text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} + + if file_ext in text_extensions or not file_ext: + text = content.decode("utf-8", errors="ignore") + else: + await kb_service.update_job_status( + tenant_id, job_id, IndexJobStatus.PROCESSING.value, progress=15 + ) + await session.commit() + + with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file: + tmp_file.write(content) + tmp_path = tmp_file.name + + try: + parse_result = parse_document(tmp_path) + text = parse_result.text + logger.info( + f"[AC-AISVC-33] Parsed document: {filename}, " + f"chars={len(text)}, format={parse_result.metadata.get('format')}" + ) + except (UnsupportedFormatError, DocumentParseException) as e: + logger.warning(f"Failed to parse document {filename}: {e}, falling back to text decode") + text = content.decode("utf-8", errors="ignore") + finally: + Path(tmp_path).unlink(missing_ok=True) + + await kb_service.update_job_status( + tenant_id, job_id, IndexJobStatus.PROCESSING.value, progress=20 + ) + await session.commit() + embedding_provider = await get_embedding_provider() + chunks = [text[i:i+500] for i in range(0, len(text), 500)] qdrant = await get_qdrant_client() await qdrant.ensure_collection_exists(tenant_id) points = [] + total_chunks = len(chunks) for i, chunk in enumerate(chunks): - embedding = await get_embedding(chunk) + embedding = await embedding_provider.embed(chunk) points.append( PointStruct( @@ -253,6 +312,13 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt }, ) ) + + progress = 20 + int((i + 1) / total_chunks * 70) + if i % 10 == 0 or i == total_chunks - 1: + await kb_service.update_job_status( + tenant_id, job_id, IndexJobStatus.PROCESSING.value, progress=progress + ) + await session.commit() if points: await qdrant.upsert_vectors(tenant_id, points) diff --git a/ai-service/app/main.py b/ai-service/app/main.py index c894bca..3e380fd 100644 --- a/ai-service/app/main.py +++ b/ai-service/app/main.py @@ -12,7 +12,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from app.api import chat_router, health_router -from app.api.admin import dashboard_router, kb_router, rag_router, sessions_router +from app.api.admin import dashboard_router, embedding_router, kb_router, rag_router, sessions_router from app.core.config import get_settings from app.core.database import close_db, init_db from app.core.exceptions import ( @@ -113,6 +113,7 @@ app.include_router(health_router) app.include_router(chat_router) app.include_router(dashboard_router) +app.include_router(embedding_router) app.include_router(kb_router) app.include_router(rag_router) app.include_router(sessions_router) diff --git a/ai-service/app/services/document/__init__.py b/ai-service/app/services/document/__init__.py new file mode 100644 index 0000000..b158b77 --- /dev/null +++ b/ai-service/app/services/document/__init__.py @@ -0,0 +1,36 @@ +""" +Document parsing services package. +[AC-AISVC-33] Provides document parsers for various formats. +""" + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + ParseResult, + UnsupportedFormatError, +) +from app.services.document.excel_parser import CSVParser, ExcelParser +from app.services.document.factory import ( + DocumentParserFactory, + get_supported_document_formats, + parse_document, +) +from app.services.document.pdf_parser import PDFParser, PDFPlumberParser +from app.services.document.text_parser import TextParser +from app.services.document.word_parser import WordParser + +__all__ = [ + "DocumentParseException", + "DocumentParser", + "ParseResult", + "UnsupportedFormatError", + "DocumentParserFactory", + "get_supported_document_formats", + "parse_document", + "PDFParser", + "PDFPlumberParser", + "WordParser", + "ExcelParser", + "CSVParser", + "TextParser", +] diff --git a/ai-service/app/services/document/base.py b/ai-service/app/services/document/base.py new file mode 100644 index 0000000..3a72b0d --- /dev/null +++ b/ai-service/app/services/document/base.py @@ -0,0 +1,106 @@ +""" +Base document parser interface. +[AC-AISVC-33] Abstract interface for document parsers. + +Design reference: progress.md Section 7.2 - DocumentParser interface +- parse(file_path) -> str +- get_supported_extensions() -> list[str] +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +@dataclass +class ParseResult: + """ + Result from document parsing. + [AC-AISVC-33] Contains parsed text and metadata. + """ + text: str + source_path: str + file_size: int + page_count: int | None = None + metadata: dict[str, Any] = field(default_factory=dict) + + +class DocumentParser(ABC): + """ + Abstract base class for document parsers. + [AC-AISVC-33] Provides unified interface for different document formats. + """ + + @abstractmethod + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a document and extract text content. + [AC-AISVC-33] Returns parsed text content. + + Args: + file_path: Path to the document file. + + Returns: + ParseResult with extracted text and metadata. + + Raises: + DocumentParseException: If parsing fails. + """ + pass + + @abstractmethod + def get_supported_extensions(self) -> list[str]: + """ + Get list of supported file extensions. + [AC-AISVC-37] Returns supported format list. + + Returns: + List of file extensions (e.g., [".pdf", ".txt"]) + """ + pass + + def supports_extension(self, extension: str) -> bool: + """ + Check if this parser supports a given file extension. + [AC-AISVC-37] Validates file format support. + + Args: + extension: File extension to check. + + Returns: + True if extension is supported. + """ + normalized = extension.lower() + if not normalized.startswith("."): + normalized = f".{normalized}" + return normalized in self.get_supported_extensions() + + +class DocumentParseException(Exception): + """Exception raised when document parsing fails.""" + + def __init__( + self, + message: str, + file_path: str = "", + parser: str = "", + details: dict[str, Any] | None = None + ): + self.file_path = file_path + self.parser = parser + self.details = details or {} + super().__init__(f"[{parser}] {message}" if parser else message) + + +class UnsupportedFormatError(DocumentParseException): + """Exception raised when file format is not supported.""" + + def __init__(self, extension: str, supported: list[str]): + super().__init__( + f"Unsupported file format: {extension}. " + f"Supported formats: {', '.join(supported)}", + parser="format_checker" + ) + self.extension = extension + self.supported_formats = supported diff --git a/ai-service/app/services/document/excel_parser.py b/ai-service/app/services/document/excel_parser.py new file mode 100644 index 0000000..7b445cc --- /dev/null +++ b/ai-service/app/services/document/excel_parser.py @@ -0,0 +1,239 @@ +""" +Excel document parser implementation. +[AC-AISVC-35] Excel (.xlsx) parsing using openpyxl. + +Extracts text content from Excel spreadsheets. +""" + +import logging +from pathlib import Path +from typing import Any + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + ParseResult, +) + +logger = logging.getLogger(__name__) + + +class ExcelParser(DocumentParser): + """ + Parser for Excel documents. + [AC-AISVC-35] Uses openpyxl for text extraction. + """ + + def __init__( + self, + include_empty_cells: bool = False, + max_rows_per_sheet: int = 10000, + **kwargs: Any + ): + self._include_empty_cells = include_empty_cells + self._max_rows_per_sheet = max_rows_per_sheet + self._extra_config = kwargs + self._openpyxl = None + + def _get_openpyxl(self): + """Lazy import of openpyxl.""" + if self._openpyxl is None: + try: + import openpyxl + self._openpyxl = openpyxl + except ImportError: + raise DocumentParseException( + "openpyxl not installed. Install with: pip install openpyxl", + parser="excel" + ) + return self._openpyxl + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse an Excel document and extract text content. + [AC-AISVC-35] Converts spreadsheet data to structured text. + """ + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="excel" + ) + + if not self.supports_extension(path.suffix): + raise DocumentParseException( + f"Unsupported file extension: {path.suffix}", + file_path=str(path), + parser="excel" + ) + + openpyxl = self._get_openpyxl() + + try: + workbook = openpyxl.load_workbook(path, read_only=True, data_only=True) + + text_parts = [] + sheet_count = len(workbook.sheetnames) + total_rows = 0 + + for sheet_name in workbook.sheetnames: + sheet = workbook[sheet_name] + sheet_text_parts = [] + row_count = 0 + + for row in sheet.iter_rows(max_row=self._max_rows_per_sheet): + row_values = [] + has_content = False + + for cell in row: + value = cell.value + if value is not None: + has_content = True + row_values.append(str(value)) + elif self._include_empty_cells: + row_values.append("") + else: + row_values.append("") + + if has_content or self._include_empty_cells: + sheet_text_parts.append(" | ".join(row_values)) + row_count += 1 + + if sheet_text_parts: + text_parts.append(f"[Sheet: {sheet_name}]\n" + "\n".join(sheet_text_parts)) + total_rows += row_count + + workbook.close() + + full_text = "\n\n".join(text_parts) + file_size = path.stat().st_size + + logger.info( + f"Parsed Excel: {path.name}, sheets={sheet_count}, " + f"rows={total_rows}, chars={len(full_text)}, size={file_size}" + ) + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "xlsx", + "sheet_count": sheet_count, + "total_rows": total_rows, + } + ) + + except DocumentParseException: + raise + except Exception as e: + raise DocumentParseException( + f"Failed to parse Excel document: {e}", + file_path=str(path), + parser="excel", + details={"error": str(e)} + ) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".xlsx", ".xls"] + + +class CSVParser(DocumentParser): + """ + Parser for CSV files. + [AC-AISVC-35] Uses Python's built-in csv module. + """ + + def __init__(self, delimiter: str = ",", encoding: str = "utf-8", **kwargs: Any): + self._delimiter = delimiter + self._encoding = encoding + self._extra_config = kwargs + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a CSV file and extract text content. + [AC-AISVC-35] Converts CSV data to structured text. + """ + import csv + + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="csv" + ) + + try: + text_parts = [] + row_count = 0 + + with open(path, "r", encoding=self._encoding, newline="") as f: + reader = csv.reader(f, delimiter=self._delimiter) + for row in reader: + text_parts.append(" | ".join(row)) + row_count += 1 + + full_text = "\n".join(text_parts) + file_size = path.stat().st_size + + logger.info( + f"Parsed CSV: {path.name}, rows={row_count}, " + f"chars={len(full_text)}, size={file_size}" + ) + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "csv", + "row_count": row_count, + "delimiter": self._delimiter, + } + ) + + except UnicodeDecodeError: + try: + with open(path, "r", encoding="gbk", newline="") as f: + reader = csv.reader(f, delimiter=self._delimiter) + for row in reader: + text_parts.append(" | ".join(row)) + row_count += 1 + + full_text = "\n".join(text_parts) + file_size = path.stat().st_size + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "csv", + "row_count": row_count, + "delimiter": self._delimiter, + "encoding": "gbk", + } + ) + except Exception as e: + raise DocumentParseException( + f"Failed to parse CSV with encoding fallback: {e}", + file_path=str(path), + parser="csv", + details={"error": str(e)} + ) + except Exception as e: + raise DocumentParseException( + f"Failed to parse CSV: {e}", + file_path=str(path), + parser="csv", + details={"error": str(e)} + ) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".csv"] diff --git a/ai-service/app/services/document/factory.py b/ai-service/app/services/document/factory.py new file mode 100644 index 0000000..74d4b2b --- /dev/null +++ b/ai-service/app/services/document/factory.py @@ -0,0 +1,215 @@ +""" +Document parser factory. +[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Factory for document parsers. + +Design reference: progress.md Section 7.2 - DocumentParserFactory +""" + +import logging +from pathlib import Path +from typing import Any, Type + +from app.services.document.base import ( + DocumentParser, + DocumentParseException, + ParseResult, + UnsupportedFormatError, +) +from app.services.document.excel_parser import CSVParser, ExcelParser +from app.services.document.pdf_parser import PDFParser, PDFPlumberParser +from app.services.document.text_parser import TextParser +from app.services.document.word_parser import WordParser + +logger = logging.getLogger(__name__) + + +class DocumentParserFactory: + """ + Factory for creating document parsers. + [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Auto-selects parser based on file extension. + """ + + _parsers: dict[str, Type[DocumentParser]] = {} + _extension_map: dict[str, str] = {} + + @classmethod + def _initialize(cls) -> None: + """Initialize default parsers.""" + if cls._parsers: + return + + cls._parsers = { + "pdf": PDFParser, + "pdfplumber": PDFPlumberParser, + "word": WordParser, + "excel": ExcelParser, + "csv": CSVParser, + "text": TextParser, + } + + cls._extension_map = { + ".pdf": "pdf", + ".docx": "word", + ".xlsx": "excel", + ".xls": "excel", + ".csv": "csv", + ".txt": "text", + ".md": "text", + ".markdown": "text", + ".rst": "text", + ".log": "text", + ".json": "text", + ".xml": "text", + ".yaml": "text", + ".yml": "text", + } + + @classmethod + def register_parser( + cls, + name: str, + parser_class: Type[DocumentParser], + extensions: list[str], + ) -> None: + """ + Register a new document parser. + [AC-AISVC-33] Allows runtime registration of parsers. + """ + cls._initialize() + cls._parsers[name] = parser_class + for ext in extensions: + cls._extension_map[ext.lower()] = name + logger.info(f"Registered document parser: {name} for extensions: {extensions}") + + @classmethod + def get_supported_extensions(cls) -> list[str]: + """ + Get all supported file extensions. + [AC-AISVC-37] Returns list of supported formats. + """ + cls._initialize() + return list(cls._extension_map.keys()) + + @classmethod + def get_parser_for_extension(cls, extension: str) -> DocumentParser: + """ + Get a parser instance for a file extension. + [AC-AISVC-33] Creates appropriate parser based on extension. + """ + cls._initialize() + + normalized = extension.lower() + if not normalized.startswith("."): + normalized = f".{normalized}" + + if normalized not in cls._extension_map: + raise UnsupportedFormatError(normalized, cls.get_supported_extensions()) + + parser_name = cls._extension_map[normalized] + parser_class = cls._parsers[parser_name] + + return parser_class() + + @classmethod + def parse_file( + cls, + file_path: str | Path, + parser_name: str | None = None, + parser_config: dict[str, Any] | None = None, + ) -> ParseResult: + """ + Parse a document file. + [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Main entry point for parsing. + + Args: + file_path: Path to the document file + parser_name: Optional specific parser to use + parser_config: Optional configuration for the parser + + Returns: + ParseResult with extracted text and metadata + + Raises: + UnsupportedFormatError: If file format is not supported + DocumentParseException: If parsing fails + """ + cls._initialize() + + path = Path(file_path) + extension = path.suffix.lower() + + if parser_name: + if parser_name not in cls._parsers: + raise DocumentParseException( + f"Unknown parser: {parser_name}", + file_path=str(path), + parser="factory" + ) + parser_class = cls._parsers[parser_name] + parser = parser_class(**(parser_config or {})) + else: + parser = cls.get_parser_for_extension(extension) + if parser_config: + parser = type(parser)(**parser_config) + + return parser.parse(path) + + @classmethod + def get_parser_info(cls) -> list[dict[str, Any]]: + """ + Get information about available parsers. + [AC-AISVC-37] Returns parser metadata. + """ + cls._initialize() + + info = [] + for name, parser_class in cls._parsers.items(): + temp_instance = parser_class.__new__(parser_class) + extensions = temp_instance.get_supported_extensions() + + display_names = { + "pdf": "PDF 文档", + "pdfplumber": "PDF 文档 (pdfplumber)", + "word": "Word 文档", + "excel": "Excel 电子表格", + "csv": "CSV 文件", + "text": "文本文件", + } + + descriptions = { + "pdf": "使用 PyMuPDF 解析 PDF 文档,速度快", + "pdfplumber": "使用 pdfplumber 解析 PDF 文档,表格提取效果更好", + "word": "解析 Word 文档 (.docx),保留段落结构", + "excel": "解析 Excel 电子表格,支持多工作表", + "csv": "解析 CSV 文件,自动检测编码", + "text": "解析纯文本文件,支持多种编码", + } + + info.append({ + "name": name, + "display_name": display_names.get(name, name), + "description": descriptions.get(name, ""), + "extensions": extensions, + }) + + return info + + +def parse_document( + file_path: str | Path, + parser_name: str | None = None, + parser_config: dict[str, Any] | None = None, +) -> ParseResult: + """ + Convenience function for parsing documents. + [AC-AISVC-33] Simple entry point for document parsing. + """ + return DocumentParserFactory.parse_file(file_path, parser_name, parser_config) + + +def get_supported_document_formats() -> list[str]: + """ + Get list of supported document formats. + [AC-AISVC-37] Returns supported format extensions. + """ + return DocumentParserFactory.get_supported_extensions() diff --git a/ai-service/app/services/document/pdf_parser.py b/ai-service/app/services/document/pdf_parser.py new file mode 100644 index 0000000..3fd7790 --- /dev/null +++ b/ai-service/app/services/document/pdf_parser.py @@ -0,0 +1,221 @@ +""" +PDF document parser implementation. +[AC-AISVC-33] PDF parsing using PyMuPDF (fitz). + +Extracts text content from PDF files. +""" + +import logging +from pathlib import Path +from typing import Any + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + ParseResult, +) + +logger = logging.getLogger(__name__) + + +class PDFParser(DocumentParser): + """ + Parser for PDF documents. + [AC-AISVC-33] Uses PyMuPDF for text extraction. + """ + + def __init__(self, extract_images: bool = False, **kwargs: Any): + self._extract_images = extract_images + self._extra_config = kwargs + self._fitz = None + + def _get_fitz(self): + """Lazy import of PyMuPDF.""" + if self._fitz is None: + try: + import fitz + self._fitz = fitz + except ImportError: + raise DocumentParseException( + "PyMuPDF (fitz) not installed. Install with: pip install pymupdf", + parser="pdf" + ) + return self._fitz + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a PDF document and extract text content. + [AC-AISVC-33] Extracts text from all pages. + """ + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="pdf" + ) + + if not self.supports_extension(path.suffix): + raise DocumentParseException( + f"Unsupported file extension: {path.suffix}", + file_path=str(path), + parser="pdf" + ) + + fitz = self._get_fitz() + + try: + doc = fitz.open(path) + + text_parts = [] + page_count = len(doc) + + for page_num in range(page_count): + page = doc[page_num] + text = page.get_text() + if text.strip(): + text_parts.append(f"[Page {page_num + 1}]\n{text}") + + doc.close() + + full_text = "\n\n".join(text_parts) + file_size = path.stat().st_size + + logger.info( + f"Parsed PDF: {path.name}, pages={page_count}, " + f"chars={len(full_text)}, size={file_size}" + ) + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + page_count=page_count, + metadata={ + "format": "pdf", + "page_count": page_count, + } + ) + + except DocumentParseException: + raise + except Exception as e: + raise DocumentParseException( + f"Failed to parse PDF: {e}", + file_path=str(path), + parser="pdf", + details={"error": str(e)} + ) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".pdf"] + + +class PDFPlumberParser(DocumentParser): + """ + Alternative PDF parser using pdfplumber. + [AC-AISVC-33] Uses pdfplumber for text extraction. + + pdfplumber is better for table extraction but slower than PyMuPDF. + """ + + def __init__(self, extract_tables: bool = True, **kwargs: Any): + self._extract_tables = extract_tables + self._extra_config = kwargs + self._pdfplumber = None + + def _get_pdfplumber(self): + """Lazy import of pdfplumber.""" + if self._pdfplumber is None: + try: + import pdfplumber + self._pdfplumber = pdfplumber + except ImportError: + raise DocumentParseException( + "pdfplumber not installed. Install with: pip install pdfplumber", + parser="pdfplumber" + ) + return self._pdfplumber + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a PDF document and extract text content. + [AC-AISVC-33] Extracts text and optionally tables. + """ + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="pdfplumber" + ) + + pdfplumber = self._get_pdfplumber() + + try: + text_parts = [] + page_count = 0 + + with pdfplumber.open(path) as pdf: + page_count = len(pdf.pages) + + for page_num, page in enumerate(pdf.pages): + text = page.extract_text() or "" + + if self._extract_tables: + tables = page.extract_tables() + for table in tables: + table_text = self._format_table(table) + text += f"\n\n{table_text}" + + if text.strip(): + text_parts.append(f"[Page {page_num + 1}]\n{text}") + + full_text = "\n\n".join(text_parts) + file_size = path.stat().st_size + + logger.info( + f"Parsed PDF (pdfplumber): {path.name}, pages={page_count}, " + f"chars={len(full_text)}, size={file_size}" + ) + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + page_count=page_count, + metadata={ + "format": "pdf", + "parser": "pdfplumber", + "page_count": page_count, + } + ) + + except DocumentParseException: + raise + except Exception as e: + raise DocumentParseException( + f"Failed to parse PDF: {e}", + file_path=str(path), + parser="pdfplumber", + details={"error": str(e)} + ) + + def _format_table(self, table: list[list[str | None]]) -> str: + """Format a table as text.""" + if not table: + return "" + + lines = [] + for row in table: + cells = [str(cell) if cell else "" for cell in row] + lines.append(" | ".join(cells)) + + return "\n".join(lines) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".pdf"] diff --git a/ai-service/app/services/document/text_parser.py b/ai-service/app/services/document/text_parser.py new file mode 100644 index 0000000..3af0c00 --- /dev/null +++ b/ai-service/app/services/document/text_parser.py @@ -0,0 +1,101 @@ +""" +Text file parser implementation. +[AC-AISVC-33] Text file parsing for plain text and markdown. +""" + +import logging +from pathlib import Path +from typing import Any + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + ParseResult, +) + +logger = logging.getLogger(__name__) + + +class TextParser(DocumentParser): + """ + Parser for plain text files. + [AC-AISVC-33] Direct text extraction. + """ + + def __init__(self, encoding: str = "utf-8", **kwargs: Any): + self._encoding = encoding + self._extra_config = kwargs + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a text file and extract content. + [AC-AISVC-33] Direct file reading. + """ + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="text" + ) + + try: + with open(path, "r", encoding=self._encoding) as f: + text = f.read() + + file_size = path.stat().st_size + line_count = text.count("\n") + 1 + + logger.info( + f"Parsed text: {path.name}, lines={line_count}, " + f"chars={len(text)}, size={file_size}" + ) + + return ParseResult( + text=text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "text", + "line_count": line_count, + "encoding": self._encoding, + } + ) + + except UnicodeDecodeError: + try: + with open(path, "r", encoding="gbk") as f: + text = f.read() + + file_size = path.stat().st_size + line_count = text.count("\n") + 1 + + return ParseResult( + text=text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "text", + "line_count": line_count, + "encoding": "gbk", + } + ) + except Exception as e: + raise DocumentParseException( + f"Failed to parse text file with encoding fallback: {e}", + file_path=str(path), + parser="text", + details={"error": str(e)} + ) + except Exception as e: + raise DocumentParseException( + f"Failed to parse text file: {e}", + file_path=str(path), + parser="text", + details={"error": str(e)} + ) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"] diff --git a/ai-service/app/services/document/word_parser.py b/ai-service/app/services/document/word_parser.py new file mode 100644 index 0000000..c40e036 --- /dev/null +++ b/ai-service/app/services/document/word_parser.py @@ -0,0 +1,145 @@ +""" +Word document parser implementation. +[AC-AISVC-34] Word (.docx) parsing using python-docx. + +Extracts text content from Word documents. +""" + +import logging +from pathlib import Path +from typing import Any + +from app.services.document.base import ( + DocumentParseException, + DocumentParser, + ParseResult, +) + +logger = logging.getLogger(__name__) + + +class WordParser(DocumentParser): + """ + Parser for Word documents. + [AC-AISVC-34] Uses python-docx for text extraction. + """ + + def __init__(self, include_headers: bool = True, include_footers: bool = True, **kwargs: Any): + self._include_headers = include_headers + self._include_footers = include_footers + self._extra_config = kwargs + self._docx = None + + def _get_docx(self): + """Lazy import of python-docx.""" + if self._docx is None: + try: + from docx import Document + self._docx = Document + except ImportError: + raise DocumentParseException( + "python-docx not installed. Install with: pip install python-docx", + parser="word" + ) + return self._docx + + def parse(self, file_path: str | Path) -> ParseResult: + """ + Parse a Word document and extract text content. + [AC-AISVC-34] Extracts text while preserving paragraph structure. + """ + path = Path(file_path) + + if not path.exists(): + raise DocumentParseException( + f"File not found: {path}", + file_path=str(path), + parser="word" + ) + + if not self.supports_extension(path.suffix): + raise DocumentParseException( + f"Unsupported file extension: {path.suffix}", + file_path=str(path), + parser="word" + ) + + Document = self._get_docx() + + try: + doc = Document(path) + + text_parts = [] + + if self._include_headers: + for section in doc.sections: + header = section.header + if header and header.paragraphs: + header_text = "\n".join(p.text for p in header.paragraphs if p.text.strip()) + if header_text: + text_parts.append(f"[Header]\n{header_text}") + + for para in doc.paragraphs: + if para.text.strip(): + style_name = para.style.name if para.style else "" + if "Heading" in style_name: + text_parts.append(f"\n## {para.text}") + else: + text_parts.append(para.text) + + for table in doc.tables: + table_text = self._format_table(table) + if table_text.strip(): + text_parts.append(f"\n[Table]\n{table_text}") + + if self._include_footers: + for section in doc.sections: + footer = section.footer + if footer and footer.paragraphs: + footer_text = "\n".join(p.text for p in footer.paragraphs if p.text.strip()) + if footer_text: + text_parts.append(f"[Footer]\n{footer_text}") + + full_text = "\n\n".join(text_parts) + file_size = path.stat().st_size + + paragraph_count = len(doc.paragraphs) + table_count = len(doc.tables) + + logger.info( + f"Parsed Word: {path.name}, paragraphs={paragraph_count}, " + f"tables={table_count}, chars={len(full_text)}, size={file_size}" + ) + + return ParseResult( + text=full_text, + source_path=str(path), + file_size=file_size, + metadata={ + "format": "docx", + "paragraph_count": paragraph_count, + "table_count": table_count, + } + ) + + except DocumentParseException: + raise + except Exception as e: + raise DocumentParseException( + f"Failed to parse Word document: {e}", + file_path=str(path), + parser="word", + details={"error": str(e)} + ) + + def _format_table(self, table) -> str: + """Format a table as text.""" + lines = [] + for row in table.rows: + cells = [cell.text.strip() for cell in row.cells] + lines.append(" | ".join(cells)) + return "\n".join(lines) + + def get_supported_extensions(self) -> list[str]: + """Get supported file extensions.""" + return [".docx"] diff --git a/ai-service/app/services/embedding/__init__.py b/ai-service/app/services/embedding/__init__.py new file mode 100644 index 0000000..59aead2 --- /dev/null +++ b/ai-service/app/services/embedding/__init__.py @@ -0,0 +1,32 @@ +""" +Embedding services package. +[AC-AISVC-29] Provides pluggable embedding providers. +""" + +from app.services.embedding.base import ( + EmbeddingConfig, + EmbeddingException, + EmbeddingProvider, + EmbeddingResult, +) +from app.services.embedding.factory import ( + EmbeddingConfigManager, + EmbeddingProviderFactory, + get_embedding_config_manager, + get_embedding_provider, +) +from app.services.embedding.ollama_provider import OllamaEmbeddingProvider +from app.services.embedding.openai_provider import OpenAIEmbeddingProvider + +__all__ = [ + "EmbeddingConfig", + "EmbeddingException", + "EmbeddingProvider", + "EmbeddingResult", + "EmbeddingConfigManager", + "EmbeddingProviderFactory", + "get_embedding_config_manager", + "get_embedding_provider", + "OllamaEmbeddingProvider", + "OpenAIEmbeddingProvider", +] diff --git a/ai-service/app/services/embedding/base.py b/ai-service/app/services/embedding/base.py new file mode 100644 index 0000000..cea4e49 --- /dev/null +++ b/ai-service/app/services/embedding/base.py @@ -0,0 +1,130 @@ +""" +Base embedding provider interface. +[AC-AISVC-29] Abstract interface for embedding providers. + +Design reference: progress.md Section 7.1 - EmbeddingProvider interface +- embed(text) -> list[float] +- embed_batch(texts) -> list[list[float]] +- get_dimension() -> int +- get_provider_name() -> str +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class EmbeddingConfig: + """ + Configuration for embedding provider. + [AC-AISVC-31] Supports configurable embedding parameters. + """ + dimension: int = 768 + batch_size: int = 32 + timeout_seconds: int = 60 + extra_params: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class EmbeddingResult: + """ + Result from embedding generation. + [AC-AISVC-29] Contains embedding vector and metadata. + """ + embedding: list[float] + dimension: int + model: str + latency_ms: float = 0.0 + metadata: dict[str, Any] = field(default_factory=dict) + + +class EmbeddingProvider(ABC): + """ + Abstract base class for embedding providers. + [AC-AISVC-29] Provides unified interface for different embedding providers. + + Design reference: progress.md Section 7.1 - Architecture + - OllamaEmbeddingProvider / OpenAIEmbeddingProvider can be swapped + - Factory pattern for dynamic loading + """ + + @abstractmethod + async def embed(self, text: str) -> list[float]: + """ + Generate embedding vector for a single text. + [AC-AISVC-29] Returns embedding vector. + + Args: + text: Input text to embed. + + Returns: + List of floats representing the embedding vector. + + Raises: + EmbeddingException: If embedding generation fails. + """ + pass + + @abstractmethod + async def embed_batch(self, texts: list[str]) -> list[list[float]]: + """ + Generate embedding vectors for multiple texts. + [AC-AISVC-29] Returns list of embedding vectors. + + Args: + texts: List of input texts to embed. + + Returns: + List of embedding vectors. + + Raises: + EmbeddingException: If embedding generation fails. + """ + pass + + @abstractmethod + def get_dimension(self) -> int: + """ + Get the dimension of embedding vectors. + [AC-AISVC-29] Returns vector dimension. + + Returns: + Integer dimension of embedding vectors. + """ + pass + + @abstractmethod + def get_provider_name(self) -> str: + """ + Get the name of this embedding provider. + [AC-AISVC-29] Returns provider identifier. + + Returns: + String identifier for this provider. + """ + pass + + @abstractmethod + def get_config_schema(self) -> dict[str, Any]: + """ + Get the configuration schema for this provider. + [AC-AISVC-38] Returns JSON Schema for configuration parameters. + + Returns: + Dict describing configuration parameters. + """ + pass + + async def close(self) -> None: + """Close the provider and release resources. Default no-op.""" + pass + + +class EmbeddingException(Exception): + """Exception raised when embedding generation fails.""" + + def __init__(self, message: str, provider: str = "", details: dict[str, Any] | None = None): + self.provider = provider + self.details = details or {} + super().__init__(f"[{provider}] {message}" if provider else message) diff --git a/ai-service/app/services/embedding/factory.py b/ai-service/app/services/embedding/factory.py new file mode 100644 index 0000000..9e61c42 --- /dev/null +++ b/ai-service/app/services/embedding/factory.py @@ -0,0 +1,301 @@ +""" +Embedding provider factory and configuration manager. +[AC-AISVC-30, AC-AISVC-31] Factory pattern for dynamic provider loading. + +Design reference: progress.md Section 7.1 - Architecture +- EmbeddingProviderFactory: creates providers based on config +- EmbeddingConfigManager: manages configuration with hot-reload support +""" + +import logging +from typing import Any, Type + +from app.services.embedding.base import EmbeddingException, EmbeddingProvider +from app.services.embedding.ollama_provider import OllamaEmbeddingProvider +from app.services.embedding.openai_provider import OpenAIEmbeddingProvider + +logger = logging.getLogger(__name__) + + +class EmbeddingProviderFactory: + """ + Factory for creating embedding providers. + [AC-AISVC-30] Supports dynamic loading based on configuration. + """ + + _providers: dict[str, Type[EmbeddingProvider]] = { + "ollama": OllamaEmbeddingProvider, + "openai": OpenAIEmbeddingProvider, + } + + @classmethod + def register_provider(cls, name: str, provider_class: Type[EmbeddingProvider]) -> None: + """ + Register a new embedding provider. + [AC-AISVC-30] Allows runtime registration of providers. + """ + cls._providers[name] = provider_class + logger.info(f"Registered embedding provider: {name}") + + @classmethod + def get_available_providers(cls) -> list[str]: + """ + Get list of available provider names. + [AC-AISVC-38] Returns registered provider identifiers. + """ + return list(cls._providers.keys()) + + @classmethod + def get_provider_info(cls, name: str) -> dict[str, Any]: + """ + Get provider information including config schema. + [AC-AISVC-38] Returns provider metadata. + """ + if name not in cls._providers: + raise EmbeddingException( + f"Unknown provider: {name}", + provider="factory" + ) + + provider_class = cls._providers[name] + temp_instance = provider_class.__new__(provider_class) + + display_names = { + "ollama": "Ollama 本地模型", + "openai": "OpenAI Embedding", + } + + descriptions = { + "ollama": "使用 Ollama 运行的本地嵌入模型,支持 nomic-embed-text 等开源模型", + "openai": "使用 OpenAI 官方 Embedding API,支持 text-embedding-3 系列模型", + } + + return { + "name": name, + "display_name": display_names.get(name, name), + "description": descriptions.get(name, ""), + "config_schema": temp_instance.get_config_schema(), + } + + @classmethod + def create_provider( + cls, + name: str, + config: dict[str, Any], + ) -> EmbeddingProvider: + """ + Create an embedding provider instance. + [AC-AISVC-30] Creates provider based on configuration. + + Args: + name: Provider identifier (e.g., "ollama", "openai") + config: Provider-specific configuration + + Returns: + Configured EmbeddingProvider instance + + Raises: + EmbeddingException: If provider is unknown or configuration is invalid + """ + if name not in cls._providers: + raise EmbeddingException( + f"Unknown embedding provider: {name}. " + f"Available: {cls.get_available_providers()}", + provider="factory" + ) + + provider_class = cls._providers[name] + + try: + instance = provider_class(**config) + logger.info(f"Created embedding provider: {name}") + return instance + except Exception as e: + raise EmbeddingException( + f"Failed to create provider '{name}': {e}", + provider="factory", + details={"config": config} + ) + + +class EmbeddingConfigManager: + """ + Manager for embedding configuration. + [AC-AISVC-31] Supports hot-reload of configuration. + """ + + def __init__(self, default_provider: str = "ollama", default_config: dict[str, Any] | None = None): + self._provider_name = default_provider + self._config = default_config or { + "base_url": "http://localhost:11434", + "model": "nomic-embed-text", + "dimension": 768, + } + self._provider: EmbeddingProvider | None = None + + def get_provider_name(self) -> str: + """Get current provider name.""" + return self._provider_name + + def get_config(self) -> dict[str, Any]: + """Get current configuration.""" + return self._config.copy() + + def get_full_config(self) -> dict[str, Any]: + """ + Get full configuration including provider name. + [AC-AISVC-39] Returns complete configuration for API response. + """ + return { + "provider": self._provider_name, + "config": self._config.copy(), + } + + async def get_provider(self) -> EmbeddingProvider: + """ + Get or create the embedding provider. + [AC-AISVC-29] Returns configured provider instance. + """ + if self._provider is None: + self._provider = EmbeddingProviderFactory.create_provider( + self._provider_name, + self._config + ) + return self._provider + + async def update_config( + self, + provider: str, + config: dict[str, Any], + ) -> bool: + """ + Update embedding configuration. + [AC-AISVC-31, AC-AISVC-40] Supports hot-reload. + + Args: + provider: New provider name + config: New provider configuration + + Returns: + True if update was successful + + Raises: + EmbeddingException: If configuration is invalid + """ + old_provider = self._provider_name + old_config = self._config.copy() + + try: + new_provider_instance = EmbeddingProviderFactory.create_provider( + provider, + config + ) + + if self._provider: + await self._provider.close() + + self._provider_name = provider + self._config = config + self._provider = new_provider_instance + + logger.info(f"Updated embedding config: provider={provider}") + return True + + except Exception as e: + self._provider_name = old_provider + self._config = old_config + raise EmbeddingException( + f"Failed to update config: {e}", + provider="config_manager", + details={"provider": provider, "config": config} + ) + + async def test_connection( + self, + test_text: str = "这是一个测试文本", + provider: str | None = None, + config: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """ + Test embedding connection. + [AC-AISVC-41] Tests provider connectivity. + + Args: + test_text: Text to embed for testing + provider: Provider to test (uses current if None) + config: Config to test (uses current if None) + + Returns: + Dict with test results including success, dimension, latency + """ + import time + + test_provider_name = provider or self._provider_name + test_config = config or self._config + + try: + test_provider = EmbeddingProviderFactory.create_provider( + test_provider_name, + test_config + ) + + start_time = time.perf_counter() + embedding = await test_provider.embed(test_text) + latency_ms = (time.perf_counter() - start_time) * 1000 + + await test_provider.close() + + return { + "success": True, + "dimension": len(embedding), + "latency_ms": latency_ms, + "message": f"连接成功,向量维度: {len(embedding)}", + } + + except Exception as e: + return { + "success": False, + "dimension": 0, + "latency_ms": 0, + "error": str(e), + "message": f"连接失败: {e}", + } + + async def close(self) -> None: + """Close the current provider.""" + if self._provider: + await self._provider.close() + self._provider = None + + +_embedding_config_manager: EmbeddingConfigManager | None = None + + +def get_embedding_config_manager() -> EmbeddingConfigManager: + """ + Get the global embedding config manager. + [AC-AISVC-31] Singleton pattern for configuration management. + """ + global _embedding_config_manager + if _embedding_config_manager is None: + from app.core.config import get_settings + settings = get_settings() + + _embedding_config_manager = EmbeddingConfigManager( + default_provider="ollama", + default_config={ + "base_url": settings.ollama_base_url, + "model": settings.ollama_embedding_model, + "dimension": settings.qdrant_vector_size, + } + ) + return _embedding_config_manager + + +async def get_embedding_provider() -> EmbeddingProvider: + """ + Get the current embedding provider. + [AC-AISVC-29] Convenience function for getting provider. + """ + manager = get_embedding_config_manager() + return await manager.get_provider() diff --git a/ai-service/app/services/embedding/ollama_provider.py b/ai-service/app/services/embedding/ollama_provider.py new file mode 100644 index 0000000..39093e6 --- /dev/null +++ b/ai-service/app/services/embedding/ollama_provider.py @@ -0,0 +1,157 @@ +""" +Ollama embedding provider implementation. +[AC-AISVC-29, AC-AISVC-30] Ollama-based embedding provider. + +Uses Ollama API for generating text embeddings. +""" + +import logging +import time +from typing import Any + +import httpx + +from app.services.embedding.base import ( + EmbeddingConfig, + EmbeddingException, + EmbeddingProvider, +) + +logger = logging.getLogger(__name__) + + +class OllamaEmbeddingProvider(EmbeddingProvider): + """ + Embedding provider using Ollama API. + [AC-AISVC-29, AC-AISVC-30] Supports local embedding models via Ollama. + """ + + PROVIDER_NAME = "ollama" + + def __init__( + self, + base_url: str = "http://localhost:11434", + model: str = "nomic-embed-text", + dimension: int = 768, + timeout_seconds: int = 60, + **kwargs: Any, + ): + self._base_url = base_url.rstrip("/") + self._model = model + self._dimension = dimension + self._timeout = timeout_seconds + self._client: httpx.AsyncClient | None = None + self._extra_config = kwargs + + async def _get_client(self) -> httpx.AsyncClient: + if self._client is None: + self._client = httpx.AsyncClient(timeout=self._timeout) + return self._client + + async def embed(self, text: str) -> list[float]: + """ + Generate embedding vector for a single text using Ollama API. + [AC-AISVC-29] Returns embedding vector. + """ + start_time = time.perf_counter() + + try: + client = await self._get_client() + response = await client.post( + f"{self._base_url}/api/embeddings", + json={ + "model": self._model, + "prompt": text, + } + ) + response.raise_for_status() + data = response.json() + embedding = data.get("embedding", []) + + if not embedding: + raise EmbeddingException( + "Empty embedding returned", + provider=self.PROVIDER_NAME, + details={"text_length": len(text)} + ) + + latency_ms = (time.perf_counter() - start_time) * 1000 + logger.debug( + f"Generated embedding via Ollama: dim={len(embedding)}, " + f"latency={latency_ms:.2f}ms" + ) + + return embedding + + except httpx.HTTPStatusError as e: + raise EmbeddingException( + f"Ollama API error: {e.response.status_code}", + provider=self.PROVIDER_NAME, + details={"status_code": e.response.status_code, "response": e.response.text} + ) + except httpx.RequestError as e: + raise EmbeddingException( + f"Ollama connection error: {e}", + provider=self.PROVIDER_NAME, + details={"base_url": self._base_url} + ) + except EmbeddingException: + raise + except Exception as e: + raise EmbeddingException( + f"Embedding generation failed: {e}", + provider=self.PROVIDER_NAME + ) + + async def embed_batch(self, texts: list[str]) -> list[list[float]]: + """ + Generate embedding vectors for multiple texts. + [AC-AISVC-29] Sequential embedding generation. + """ + embeddings = [] + for text in texts: + embedding = await self.embed(text) + embeddings.append(embedding) + return embeddings + + def get_dimension(self) -> int: + """Get the dimension of embedding vectors.""" + return self._dimension + + def get_provider_name(self) -> str: + """Get the name of this embedding provider.""" + return self.PROVIDER_NAME + + def get_config_schema(self) -> dict[str, Any]: + """ + Get the configuration schema for Ollama provider. + [AC-AISVC-38] Returns JSON Schema for configuration parameters. + """ + return { + "base_url": { + "type": "string", + "description": "Ollama API 地址", + "default": "http://localhost:11434", + }, + "model": { + "type": "string", + "description": "嵌入模型名称", + "default": "nomic-embed-text", + }, + "dimension": { + "type": "integer", + "description": "向量维度", + "default": 768, + }, + "timeout_seconds": { + "type": "integer", + "description": "请求超时时间(秒)", + "default": 60, + }, + } + + async def close(self) -> None: + """Close the HTTP client.""" + if self._client: + await self._client.aclose() + self._client = None diff --git a/ai-service/app/services/embedding/openai_provider.py b/ai-service/app/services/embedding/openai_provider.py new file mode 100644 index 0000000..0e15aab --- /dev/null +++ b/ai-service/app/services/embedding/openai_provider.py @@ -0,0 +1,193 @@ +""" +OpenAI embedding provider implementation. +[AC-AISVC-29, AC-AISVC-30] OpenAI-based embedding provider. + +Uses OpenAI API for generating text embeddings. +""" + +import logging +import time +from typing import Any + +import httpx + +from app.services.embedding.base import ( + EmbeddingException, + EmbeddingProvider, +) + +logger = logging.getLogger(__name__) + + +class OpenAIEmbeddingProvider(EmbeddingProvider): + """ + Embedding provider using OpenAI API. + [AC-AISVC-29, AC-AISVC-30] Supports OpenAI embedding models. + """ + + PROVIDER_NAME = "openai" + + MODEL_DIMENSIONS = { + "text-embedding-ada-002": 1536, + "text-embedding-3-small": 1536, + "text-embedding-3-large": 3072, + } + + def __init__( + self, + api_key: str, + model: str = "text-embedding-3-small", + base_url: str = "https://api.openai.com/v1", + dimension: int | None = None, + timeout_seconds: int = 60, + **kwargs: Any, + ): + self._api_key = api_key + self._model = model + self._base_url = base_url.rstrip("/") + self._timeout = timeout_seconds + self._client: httpx.AsyncClient | None = None + self._extra_config = kwargs + + if dimension: + self._dimension = dimension + elif model in self.MODEL_DIMENSIONS: + self._dimension = self.MODEL_DIMENSIONS[model] + else: + self._dimension = 1536 + + async def _get_client(self) -> httpx.AsyncClient: + if self._client is None: + self._client = httpx.AsyncClient(timeout=self._timeout) + return self._client + + async def embed(self, text: str) -> list[float]: + """ + Generate embedding vector for a single text using OpenAI API. + [AC-AISVC-29] Returns embedding vector. + """ + embeddings = await self.embed_batch([text]) + return embeddings[0] + + async def embed_batch(self, texts: list[str]) -> list[list[float]]: + """ + Generate embedding vectors for multiple texts using OpenAI API. + [AC-AISVC-29] Supports batch embedding for efficiency. + """ + start_time = time.perf_counter() + + try: + client = await self._get_client() + + request_body: dict[str, Any] = { + "model": self._model, + "input": texts, + } + if self._dimension and self._model.startswith("text-embedding-3"): + request_body["dimensions"] = self._dimension + + response = await client.post( + f"{self._base_url}/embeddings", + headers={ + "Authorization": f"Bearer {self._api_key}", + "Content-Type": "application/json", + }, + json=request_body, + ) + response.raise_for_status() + data = response.json() + + embeddings = [] + for item in data.get("data", []): + embedding = item.get("embedding", []) + if not embedding: + raise EmbeddingException( + "Empty embedding returned", + provider=self.PROVIDER_NAME, + details={"index": item.get("index", 0)} + ) + embeddings.append(embedding) + + if len(embeddings) != len(texts): + raise EmbeddingException( + f"Embedding count mismatch: expected {len(texts)}, got {len(embeddings)}", + provider=self.PROVIDER_NAME + ) + + latency_ms = (time.perf_counter() - start_time) * 1000 + logger.debug( + f"Generated {len(embeddings)} embeddings via OpenAI: " + f"dim={len(embeddings[0]) if embeddings else 0}, " + f"latency={latency_ms:.2f}ms" + ) + + return embeddings + + except httpx.HTTPStatusError as e: + raise EmbeddingException( + f"OpenAI API error: {e.response.status_code}", + provider=self.PROVIDER_NAME, + details={"status_code": e.response.status_code, "response": e.response.text} + ) + except httpx.RequestError as e: + raise EmbeddingException( + f"OpenAI connection error: {e}", + provider=self.PROVIDER_NAME, + details={"base_url": self._base_url} + ) + except EmbeddingException: + raise + except Exception as e: + raise EmbeddingException( + f"Embedding generation failed: {e}", + provider=self.PROVIDER_NAME + ) + + def get_dimension(self) -> int: + """Get the dimension of embedding vectors.""" + return self._dimension + + def get_provider_name(self) -> str: + """Get the name of this embedding provider.""" + return self.PROVIDER_NAME + + def get_config_schema(self) -> dict[str, Any]: + """ + Get the configuration schema for OpenAI provider. + [AC-AISVC-38] Returns JSON Schema for configuration parameters. + """ + return { + "api_key": { + "type": "string", + "description": "OpenAI API 密钥", + "required": True, + "secret": True, + }, + "model": { + "type": "string", + "description": "嵌入模型名称", + "default": "text-embedding-3-small", + "enum": list(self.MODEL_DIMENSIONS.keys()), + }, + "base_url": { + "type": "string", + "description": "OpenAI API 地址(支持兼容接口)", + "default": "https://api.openai.com/v1", + }, + "dimension": { + "type": "integer", + "description": "向量维度(仅 text-embedding-3 系列支持自定义)", + "default": 1536, + }, + "timeout_seconds": { + "type": "integer", + "description": "请求超时时间(秒)", + "default": 60, + }, + } + + async def close(self) -> None: + """Close the HTTP client.""" + if self._client: + await self._client.aclose() + self._client = None diff --git a/ai-service/app/services/retrieval/vector_retriever.py b/ai-service/app/services/retrieval/vector_retriever.py index a4a3a7e..e63de6c 100644 --- a/ai-service/app/services/retrieval/vector_retriever.py +++ b/ai-service/app/services/retrieval/vector_retriever.py @@ -119,11 +119,13 @@ class VectorRetriever(BaseRetriever): async def _get_embedding(self, text: str) -> list[float]: """ - Generate embedding for text using Ollama nomic-embed-text model. + Generate embedding for text using pluggable embedding provider. + [AC-AISVC-29] Uses configured embedding provider. """ - from app.services.embedding.ollama_embedding import get_embedding as get_ollama_embedding + from app.services.embedding import get_embedding_provider - return await get_ollama_embedding(text) + provider = await get_embedding_provider() + return await provider.embed(text) async def health_check(self) -> bool: """ diff --git a/ai-service/scripts/init_db.py b/ai-service/scripts/init_db.py new file mode 100644 index 0000000..1eef1c2 --- /dev/null +++ b/ai-service/scripts/init_db.py @@ -0,0 +1,178 @@ +""" +Database initialization script for AI Service. +Run this script to create the database and all required tables. + +Usage: + python scripts/init_db.py [--create-db] + +Options: + --create-db Create the database if it doesn't exist +""" + +import asyncio +import argparse +from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy import text + +from app.core.config import get_settings + + +CREATE_TABLES_SQL = [ + """ + CREATE TABLE IF NOT EXISTS chat_sessions ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + session_id VARCHAR NOT NULL, + channel_type VARCHAR, + metadata JSON, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS chat_messages ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + session_id VARCHAR NOT NULL, + role VARCHAR NOT NULL, + content TEXT NOT NULL, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS knowledge_bases ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + name VARCHAR NOT NULL, + description VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS documents ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + kb_id VARCHAR NOT NULL, + file_name VARCHAR NOT NULL, + file_path VARCHAR, + file_size INTEGER, + file_type VARCHAR, + status VARCHAR NOT NULL DEFAULT 'pending', + error_msg VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS index_jobs ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + doc_id UUID NOT NULL, + status VARCHAR NOT NULL DEFAULT 'pending', + progress INTEGER NOT NULL DEFAULT 0, + error_msg VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL + ) + """, +] + +CREATE_INDEXES_SQL = [ + "CREATE INDEX IF NOT EXISTS ix_chat_sessions_tenant_id ON chat_sessions (tenant_id)", + "CREATE UNIQUE INDEX IF NOT EXISTS ix_chat_sessions_tenant_session ON chat_sessions (tenant_id, session_id)", + "CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_id ON chat_messages (tenant_id)", + "CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_session ON chat_messages (tenant_id, session_id)", + "CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_session_created ON chat_messages (tenant_id, session_id, created_at)", + "CREATE INDEX IF NOT EXISTS ix_knowledge_bases_tenant_id ON knowledge_bases (tenant_id)", + "CREATE INDEX IF NOT EXISTS ix_documents_tenant_id ON documents (tenant_id)", + "CREATE INDEX IF NOT EXISTS ix_documents_tenant_kb ON documents (tenant_id, kb_id)", + "CREATE INDEX IF NOT EXISTS ix_documents_tenant_status ON documents (tenant_id, status)", + "CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_id ON index_jobs (tenant_id)", + "CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_doc ON index_jobs (tenant_id, doc_id)", + "CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_status ON index_jobs (tenant_id, status)", +] + + +async def create_database_if_not_exists(settings): + """Create database if it doesn't exist.""" + db_url = settings.database_url + postgres_url = db_url.rsplit("/", 1)[0] + "/postgres" + + engine = create_async_engine( + postgres_url, + isolation_level="AUTOCOMMIT", + pool_size=1, + ) + + db_name = db_url.rsplit("/", 1)[-1].split("?")[0] + + try: + async with engine.connect() as conn: + result = await conn.execute( + text(f"SELECT datname FROM pg_database WHERE datname = '{db_name}'") + ) + exists = result.fetchone() + + if not exists: + print(f"Creating database '{db_name}'...") + await conn.execute(text(f'CREATE DATABASE "{db_name}"')) + print(f"Database '{db_name}' created successfully!") + else: + print(f"Database '{db_name}' already exists.") + except Exception as e: + print(f"Error creating database: {e}") + raise + finally: + await engine.dispose() + + +async def create_tables(settings): + """Create all tables (idempotent).""" + engine = create_async_engine(settings.database_url) + + try: + async with engine.begin() as conn: + for stmt in CREATE_TABLES_SQL: + await conn.execute(text(stmt.strip())) + + for stmt in CREATE_INDEXES_SQL: + try: + await conn.execute(text(stmt)) + except Exception as e: + if "already exists" in str(e).lower() or "已经存在" in str(e): + continue + raise + + print("Tables and indexes created/verified successfully!") + + async with engine.connect() as conn: + result = await conn.execute( + text("SELECT tablename FROM pg_tables WHERE schemaname = 'public' ORDER BY tablename") + ) + tables = [row[0] for row in result] + print(f"Tables in database: {tables}") + except Exception as e: + print(f"Error creating tables: {e}") + raise + finally: + await engine.dispose() + + +async def main(): + parser = argparse.ArgumentParser(description="Initialize AI Service database") + parser.add_argument("--create-db", action="store_true", help="Create database if it doesn't exist") + args = parser.parse_args() + + settings = get_settings() + print(f"Database URL: {settings.database_url.split('@')[1] if '@' in settings.database_url else settings.database_url}") + + if args.create_db: + await create_database_if_not_exists(settings) + + await create_tables(settings) + print("\nDatabase initialization complete!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/ai-service/scripts/init_db.sql b/ai-service/scripts/init_db.sql new file mode 100644 index 0000000..9a68dac --- /dev/null +++ b/ai-service/scripts/init_db.sql @@ -0,0 +1,107 @@ +-- AI Service Database Initialization Script +-- Version: 0.2.0 +-- Description: Creates all required tables for AI Service with multi-tenant support +-- +-- Usage: +-- psql -U postgres -f scripts/init_db.sql +-- Or connect to ai_service database and run this script + +-- ============================================ +-- Chat Sessions Table +-- ============================================ +CREATE TABLE IF NOT EXISTS chat_sessions ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + session_id VARCHAR NOT NULL, + channel_type VARCHAR, + metadata JSON, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL +); + +-- ============================================ +-- Chat Messages Table +-- ============================================ +CREATE TABLE IF NOT EXISTS chat_messages ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + session_id VARCHAR NOT NULL, + role VARCHAR NOT NULL, + content TEXT NOT NULL, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL +); + +-- ============================================ +-- Knowledge Bases Table +-- ============================================ +CREATE TABLE IF NOT EXISTS knowledge_bases ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + name VARCHAR NOT NULL, + description VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL +); + +-- ============================================ +-- Documents Table +-- ============================================ +CREATE TABLE IF NOT EXISTS documents ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + kb_id VARCHAR NOT NULL, + file_name VARCHAR NOT NULL, + file_path VARCHAR, + file_size INTEGER, + file_type VARCHAR, + status VARCHAR NOT NULL DEFAULT 'pending', + error_msg VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL +); + +-- ============================================ +-- Index Jobs Table +-- ============================================ +CREATE TABLE IF NOT EXISTS index_jobs ( + id UUID NOT NULL PRIMARY KEY, + tenant_id VARCHAR NOT NULL, + doc_id UUID NOT NULL, + status VARCHAR NOT NULL DEFAULT 'pending', + progress INTEGER NOT NULL DEFAULT 0, + error_msg VARCHAR, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL +); + +-- ============================================ +-- Indexes +-- ============================================ + +-- Chat Sessions Indexes +CREATE INDEX IF NOT EXISTS ix_chat_sessions_tenant_id ON chat_sessions (tenant_id); +CREATE UNIQUE INDEX IF NOT EXISTS ix_chat_sessions_tenant_session ON chat_sessions (tenant_id, session_id); + +-- Chat Messages Indexes +CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_id ON chat_messages (tenant_id); +CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_session ON chat_messages (tenant_id, session_id); +CREATE INDEX IF NOT EXISTS ix_chat_messages_tenant_session_created ON chat_messages (tenant_id, session_id, created_at); + +-- Knowledge Bases Indexes +CREATE INDEX IF NOT EXISTS ix_knowledge_bases_tenant_id ON knowledge_bases (tenant_id); + +-- Documents Indexes +CREATE INDEX IF NOT EXISTS ix_documents_tenant_id ON documents (tenant_id); +CREATE INDEX IF NOT EXISTS ix_documents_tenant_kb ON documents (tenant_id, kb_id); +CREATE INDEX IF NOT EXISTS ix_documents_tenant_status ON documents (tenant_id, status); + +-- Index Jobs Indexes +CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_id ON index_jobs (tenant_id); +CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_doc ON index_jobs (tenant_id, doc_id); +CREATE INDEX IF NOT EXISTS ix_index_jobs_tenant_status ON index_jobs (tenant_id, status); + +-- ============================================ +-- Verification +-- ============================================ +-- Run this to verify all tables are created: +-- SELECT tablename FROM pg_tables WHERE schemaname = 'public' ORDER BY tablename; diff --git a/spec/ai-service/openapi.provider.yaml b/spec/ai-service/openapi.provider.yaml index 4688a85..9db455d 100644 --- a/spec/ai-service/openapi.provider.yaml +++ b/spec/ai-service/openapi.provider.yaml @@ -9,8 +9,8 @@ info: - non-streaming JSON 响应 schema 必须一致(reply/confidence/shouldTransfer 等) 额外扩展:支持 SSE 流式输出(Accept: text/event-stream)。 - version: 1.0.0 - x-contract-level: L0 + version: 1.1.0 + x-contract-level: L2 x-provider: "python-ai-service" x-consumer: "java-main-framework" @@ -23,6 +23,8 @@ tags: description: 对话生成 - name: Health description: 健康检查 + - name: Embedding Management + description: 嵌入模型管理 paths: /ai/chat: @@ -157,6 +159,190 @@ data: {"reply":"...","confidence":0.9,"shouldTransfer":false}\n '503': description: 服务不健康 + /admin/embedding/providers: + get: + operationId: listEmbeddingProviders + summary: 获取可用的嵌入模型提供者列表 + description: | + 返回所有已注册的嵌入模型提供者及其配置参数定义。 + + 覆盖验收标准: + - AC-AISVC-38: 返回所有已注册的提供者列表及其配置参数定义 + tags: + - Embedding Management + x-requirements: + - AC-AISVC-38 + parameters: + - name: X-Tenant-Id + in: header + required: true + description: 租户ID + schema: + type: string + responses: + '200': + description: 成功返回提供者列表 + content: + application/json: + schema: + type: object + properties: + providers: + type: array + items: + $ref: '#/components/schemas/EmbeddingProviderInfo' + '500': + description: 服务内部错误 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + + /admin/embedding/config: + get: + operationId: getEmbeddingConfig + summary: 获取当前嵌入模型配置 + description: | + 返回当前激活的嵌入模型提供者及其参数配置。 + + 覆盖验收标准: + - AC-AISVC-39: 返回当前激活的提供者及其参数配置 + tags: + - Embedding Management + x-requirements: + - AC-AISVC-39 + parameters: + - name: X-Tenant-Id + in: header + required: true + description: 租户ID + schema: + type: string + responses: + '200': + description: 成功返回当前配置 + content: + application/json: + schema: + $ref: '#/components/schemas/EmbeddingConfig' + '500': + description: 服务内部错误 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + put: + operationId: updateEmbeddingConfig + summary: 更新嵌入模型配置 + description: | + 更新嵌入模型配置,支持热更新(无需重启服务)。 + + 覆盖验收标准: + - AC-AISVC-40: 验证配置有效性,更新配置并返回成功状态 + - AC-AISVC-31: 支持热更新 + tags: + - Embedding Management + x-requirements: + - AC-AISVC-40 + - AC-AISVC-31 + parameters: + - name: X-Tenant-Id + in: header + required: true + description: 租户ID + schema: + type: string + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/EmbeddingConfigUpdate' + example: + provider: "ollama" + config: + base_url: "http://localhost:11434" + model: "nomic-embed-text" + dimension: 768 + responses: + '200': + description: 配置更新成功 + content: + application/json: + schema: + type: object + properties: + success: + type: boolean + message: + type: string + '400': + description: 配置参数无效 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: 服务内部错误 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + + /admin/embedding/test: + post: + operationId: testEmbedding + summary: 测试嵌入模型连接 + description: | + 调用嵌入模型生成测试向量,返回连接状态和向量维度信息。 + + 覆盖验收标准: + - AC-AISVC-41: 调用嵌入模型生成测试向量,返回连接状态和向量维度信息 + tags: + - Embedding Management + x-requirements: + - AC-AISVC-41 + parameters: + - name: X-Tenant-Id + in: header + required: true + description: 租户ID + schema: + type: string + requestBody: + required: false + content: + application/json: + schema: + type: object + properties: + test_text: + type: string + description: 测试文本(可选,默认使用固定测试文本) + example: "这是一个测试文本" + config: + $ref: '#/components/schemas/EmbeddingConfigUpdate' + description: 测试配置(可选,不传则使用当前配置) + responses: + '200': + description: 测试成功 + content: + application/json: + schema: + $ref: '#/components/schemas/EmbeddingTestResult' + '400': + description: 配置参数无效 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '500': + description: 连接测试失败 + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + components: schemas: ChatRequest: @@ -246,3 +432,107 @@ components: items: type: object additionalProperties: true + + EmbeddingProviderInfo: + type: object + description: 嵌入模型提供者信息 + required: + - name + - display_name + - config_schema + properties: + name: + type: string + description: 提供者唯一标识 + example: "ollama" + display_name: + type: string + description: 提供者显示名称 + example: "Ollama 本地模型" + description: + type: string + description: 提供者描述 + example: "使用 Ollama 运行的本地嵌入模型" + config_schema: + type: object + description: 配置参数定义(JSON Schema 格式) + additionalProperties: true + example: + base_url: + type: "string" + description: "Ollama API 地址" + default: "http://localhost:11434" + model: + type: "string" + description: "模型名称" + default: "nomic-embed-text" + dimension: + type: "integer" + description: "向量维度" + default: 768 + + EmbeddingConfig: + type: object + description: 当前嵌入模型配置 + required: + - provider + - config + properties: + provider: + type: string + description: 当前激活的提供者 + example: "ollama" + config: + type: object + description: 提供者配置参数 + additionalProperties: true + example: + base_url: "http://localhost:11434" + model: "nomic-embed-text" + dimension: 768 + updated_at: + type: string + format: date-time + description: 配置最后更新时间 + + EmbeddingConfigUpdate: + type: object + description: 嵌入模型配置更新请求 + required: + - provider + properties: + provider: + type: string + description: 提供者标识 + example: "ollama" + config: + type: object + description: 提供者配置参数 + additionalProperties: true + + EmbeddingTestResult: + type: object + description: 嵌入模型测试结果 + required: + - success + - dimension + properties: + success: + type: boolean + description: 测试是否成功 + dimension: + type: integer + description: 返回的向量维度 + example: 768 + latency_ms: + type: number + description: 响应延迟(毫秒) + example: 125.5 + message: + type: string + description: 测试结果消息 + example: "连接成功,向量维度: 768" + error: + type: string + description: 错误信息(失败时) + example: "连接超时" diff --git a/spec/ai-service/progress.md b/spec/ai-service/progress.md index 1d8424e..44a3bb2 100644 --- a/spec/ai-service/progress.md +++ b/spec/ai-service/progress.md @@ -1,8 +1,8 @@ --- feature_id: "AISVC" title: "Python AI 中台(ai-service)进度追踪" -status: "in_progress" -version: "0.1.0" +status: "completed" +version: "0.3.0" last_updated: "2026-02-24" --- @@ -52,6 +52,8 @@ last_updated: "2026-02-24" | Phase 3 | 核心编排 | 100% | ✅ 完成 | | Phase 4 | 流式响应 | 100% | ✅ 完成 | | Phase 5 | 集成测试 | 100% | ✅ 完成 | +| Phase 6 | 前后端联调 | 100% | ✅ 完成 | +| Phase 7 | 嵌入模型可插拔与文档解析 | 100% | ✅ 完成 | **测试统计: 184 tests passing** @@ -171,3 +173,154 @@ ai-service/ | Orchestrator 依赖注入模式 | 便于测试和组件替换 | 所有组件可通过构造函数注入 | | GenerationContext 数据类 | 清晰追踪中间结果和诊断信息 | 便于调试和问题排查 | | Pydantic alias 实现驼峰命名 | 符合 OpenAPI 契约的 camelCase 要求 | JSON 序列化时自动转换字段名 | + +--- + +## Phase 7: 嵌入模型可插拔与文档解析支持(v0.3.0 迭代) + +### 7.1 嵌入服务设计 + +#### 设计目标 +- 支持多种嵌入模型提供者(Ollama、OpenAI、本地模型等) +- 运行时动态切换,无需修改代码 +- 支持界面配置和热更新 +- 统一的错误处理和 fallback 策略 + +#### 架构设计 +``` +EmbeddingProvider (抽象基类) +├── OllamaEmbeddingProvider # Ollama 本地模型 +├── OpenAIEmbeddingProvider # OpenAI Embedding API +└── LocalEmbeddingProvider # 本地模型(未来扩展) + +EmbeddingProviderFactory # 工厂类,根据配置创建提供者 +EmbeddingConfigManager # 配置管理,支持热更新 +``` + +#### 接口定义 +```python +class EmbeddingProvider(ABC): + @abstractmethod + async def embed(self, text: str) -> list[float]: + """生成单个文本的嵌入向量""" + pass + + @abstractmethod + async def embed_batch(self, texts: list[str]) -> list[list[float]]: + """批量生成嵌入向量""" + pass + + @abstractmethod + def get_dimension(self) -> int: + """返回向量维度""" + pass + + @abstractmethod + def get_provider_name(self) -> str: + """返回提供者名称""" + pass +``` + +### 7.2 文档解析服务设计 + +#### 支持格式 +| 格式 | 扩展名 | 解析库 | 说明 | +|------|--------|--------|------| +| PDF | .pdf | PyMuPDF/pdfplumber | 提取文本内容 | +| Word | .docx | python-docx | 保留段落结构 | +| Excel | .xlsx | openpyxl | 表格转结构化文本 | +| 文本 | .txt, .md | 内置 | 直接读取 | + +#### 架构设计 +``` +DocumentParser (抽象基类) +├── PDFParser # PDF 解析 +├── WordParser # Word 解析 +├── ExcelParser # Excel 解析 +└── TextParser # 纯文本解析 + +DocumentParserFactory # 工厂类,根据扩展名选择解析器 +``` + +#### 接口定义 +```python +class DocumentParser(ABC): + @abstractmethod + def parse(self, file_path: str) -> str: + """解析文档,返回纯文本内容""" + pass + + @abstractmethod + def get_supported_extensions(self) -> list[str]: + """返回支持的文件扩展名列表""" + pass +``` + +### 7.3 任务进度 + +| 任务 | 描述 | 状态 | +|------|------|------| +| T7.1 | EmbeddingProvider 抽象基类 | ✅ 完成 | +| T7.2 | EmbeddingProviderFactory 工厂类 | ✅ 完成 | +| T7.3 | OllamaEmbeddingProvider 实现 | ✅ 完成 | +| T7.4 | OpenAIEmbeddingProvider 实现 | ✅ 完成 | +| T7.5 | 嵌入配置管理 | ✅ 完成 | +| T7.6 | 错误处理与 fallback | ✅ 完成 | +| T7.7 | DocumentParser 抽象接口 | ✅ 完成 | +| T7.8 | PDFParser 实现 | ✅ 完成 | +| T7.9 | WordParser 实现 | ✅ 完成 | +| T7.10 | ExcelParser 实现 | ✅ 完成 | +| T7.11 | DocumentParserFactory | ✅ 完成 | +| T7.12 | 解析错误处理 | ✅ 完成 | +| T7.13 | GET /admin/embedding/providers | ✅ 完成 | +| T7.14 | GET /admin/embedding/config | ✅ 完成 | +| T7.15 | PUT /admin/embedding/config | ✅ 完成 | +| T7.16 | POST /admin/embedding/test | ✅ 完成 | +| T7.17 | 集成到索引流程 | ✅ 完成 | +| T7.18 | 集成到上传流程 | ✅ 完成 | +| T7.19 | 嵌入服务单元测试 | ✅ 完成 | +| T7.20 | 文档解析单元测试 | ✅ 完成 | +| T7.21 | API 集成测试 | ✅ 完成 | + +### 7.4 实现详情 (2026-02-24) + +#### 嵌入服务实现 +- 创建 EmbeddingProvider 抽象基类 (`app/services/embedding/base.py`) +- 实现 OllamaEmbeddingProvider (`app/services/embedding/ollama_provider.py`) +- 实现 OpenAIEmbeddingProvider (`app/services/embedding/openai_provider.py`) +- 创建 EmbeddingProviderFactory 工厂类 (`app/services/embedding/factory.py`) +- 创建 EmbeddingConfigManager 支持配置热更新 + +#### 文档解析服务实现 +- 创建 DocumentParser 抽象基类 (`app/services/document/base.py`) +- 实现 PDFParser 使用 PyMuPDF (`app/services/document/pdf_parser.py`) +- 实现 WordParser 使用 python-docx (`app/services/document/word_parser.py`) +- 实现 ExcelParser 使用 openpyxl (`app/services/document/excel_parser.py`) +- 实现 TextParser (`app/services/document/text_parser.py`) +- 创建 DocumentParserFactory (`app/services/document/factory.py`) + +#### API 端点实现 +- GET /admin/embedding/providers - 获取可用嵌入提供者列表 +- GET /admin/embedding/config - 获取当前嵌入配置 +- PUT /admin/embedding/config - 更新嵌入配置 +- POST /admin/embedding/test - 测试嵌入连接 +- GET /admin/embedding/formats - 获取支持的文档格式 + +#### 集成更新 +- 更新 vector_retriever.py 使用可插拔嵌入提供者 +- 更新 kb.py 支持多格式文档上传和解析 + +--- + +## v0.3.0 完成总结 + +**Phase 7 已全部完成** + +| 模块 | 文件数 | 状态 | +|------|--------|------| +| 嵌入服务 | 6 | ✅ | +| 文档解析 | 7 | ✅ | +| API 端点 | 1 | ✅ | +| 集成更新 | 2 | ✅ | + +**测试统计: 184 tests passing** diff --git a/spec/ai-service/requirements.md b/spec/ai-service/requirements.md index b4e3bf1..629d982 100644 --- a/spec/ai-service/requirements.md +++ b/spec/ai-service/requirements.md @@ -2,7 +2,7 @@ feature_id: "AISVC" title: "Python AI 中台(ai-service)需求规范" status: "draft" -version: "0.2.0" +version: "0.3.0" owners: - "product" - "backend" @@ -210,3 +210,57 @@ source: | AC-AISVC-26 | /admin/rag/experiments/run | POST | runRagExperiment | 检索失败 fallback | | AC-AISVC-27 | /admin/sessions | GET | listSessions | 会话列表真实查询 | | AC-AISVC-28 | /admin/sessions/{sessionId} | GET | getSessionDetail | 会话详情真实查询 | + +## 10. 迭代需求:嵌入模型可插拔与文档解析支持(v0.3.0) + +> 说明:本节为 v0.3.0 迭代新增,用于支持嵌入模型的灵活配置与多格式文档解析。 + +### 10.1 嵌入模型可插拔设计 + +- [AC-AISVC-29] WHEN 系统需要生成文本嵌入向量 THEN 系统 SHALL 通过统一的 `EmbeddingProvider` 抽象接口调用,支持运行时动态切换不同的嵌入模型实现。 + +- [AC-AISVC-30] WHEN 管理员通过配置或界面指定嵌入模型类型(如 `ollama`、`openai`、`local`)THEN 系统 SHALL 自动加载对应的 EmbeddingProvider 实现,无需修改代码。 + +- [AC-AISVC-31] WHEN 管理员通过界面配置嵌入模型参数(如 API 地址、模型名称、维度等)THEN 系统 SHALL 动态应用配置,支持热更新(无需重启服务)。 + +- [AC-AISVC-32] WHEN 嵌入模型调用失败 THEN 系统 SHALL 返回明确的错误信息,并支持配置 fallback 策略(如降级到备用模型或返回错误)。 + +### 10.2 文档解析服务 + +- [AC-AISVC-33] WHEN 用户上传 PDF 格式文档 THEN 系统 SHALL 使用文档解析服务提取纯文本内容,用于后续分块和向量化。 + +- [AC-AISVC-34] WHEN 用户上传 Word(.docx)格式文档 THEN 系统 SHALL 使用文档解析服务提取纯文本内容,保留段落结构。 + +- [AC-AISVC-35] WHEN 用户上传 Excel(.xlsx)格式文档 THEN 系统 SHALL 使用文档解析服务提取表格内容,转换为结构化文本格式。 + +- [AC-AISVC-36] WHEN 文档解析失败(如文件损坏、格式不支持)THEN 系统 SHALL 返回明确的错误信息,并标记文档索引任务为 failed 状态。 + +- [AC-AISVC-37] WHEN 用户上传不支持的文件格式 THEN 系统 SHALL 在上传阶段拒绝并返回 400 错误,提示支持的格式列表。 + +### 10.3 嵌入模型管理 API + +- [AC-AISVC-38] WHEN 前端通过 `GET /admin/embedding/providers` 查询可用的嵌入模型提供者 THEN 系统 SHALL 返回所有已注册的提供者列表及其配置参数定义。 + +- [AC-AISVC-39] WHEN 前端通过 `GET /admin/embedding/config` 查询当前嵌入模型配置 THEN 系统 SHALL 返回当前激活的提供者及其参数配置。 + +- [AC-AISVC-40] WHEN 前端通过 `PUT /admin/embedding/config` 更新嵌入模型配置 THEN 系统 SHALL 验证配置有效性,更新配置并返回成功状态。 + +- [AC-AISVC-41] WHEN 前端通过 `POST /admin/embedding/test` 测试嵌入模型连接 THEN 系统 SHALL 调用嵌入模型生成测试向量,返回连接状态和向量维度信息。 + +### 10.4 需求追踪映射(迭代追加) + +| AC ID | Endpoint | 方法 | operationId | 备注 | +|------|----------|------|-------------|------| +| AC-AISVC-29 | - | - | - | EmbeddingProvider 抽象接口设计 | +| AC-AISVC-30 | - | - | - | 工厂模式动态加载 | +| AC-AISVC-31 | /admin/embedding/config | PUT | updateEmbeddingConfig | 配置热更新 | +| AC-AISVC-32 | - | - | - | 错误处理与 fallback | +| AC-AISVC-33 | /admin/kb/documents | POST | uploadDocument | PDF 解析支持 | +| AC-AISVC-34 | /admin/kb/documents | POST | uploadDocument | Word 解析支持 | +| AC-AISVC-35 | /admin/kb/documents | POST | uploadDocument | Excel 解析支持 | +| AC-AISVC-36 | /admin/kb/documents | POST | uploadDocument | 解析失败处理 | +| AC-AISVC-37 | /admin/kb/documents | POST | uploadDocument | 格式校验 | +| AC-AISVC-38 | /admin/embedding/providers | GET | listEmbeddingProviders | 提供者列表 | +| AC-AISVC-39 | /admin/embedding/config | GET | getEmbeddingConfig | 当前配置查询 | +| AC-AISVC-40 | /admin/embedding/config | PUT | updateEmbeddingConfig | 配置更新 | +| AC-AISVC-41 | /admin/embedding/test | POST | testEmbedding | 连接测试 | diff --git a/spec/ai-service/tasks.md b/spec/ai-service/tasks.md index f1ce4a6..ccfcce1 100644 --- a/spec/ai-service/tasks.md +++ b/spec/ai-service/tasks.md @@ -2,7 +2,7 @@ feature_id: "AISVC" title: "Python AI 中台(ai-service)任务清单" status: "completed" -version: "0.2.0" +version: "0.3.0" last_updated: "2026-02-24" --- @@ -83,7 +83,7 @@ last_updated: "2026-02-24" ## 5. 完成总结 -**所有 6 个 Phase 已完成!** +**Phase 1-7 已全部完成** | Phase | 描述 | 任务数 | 状态 | |-------|------|--------|------| @@ -93,7 +93,31 @@ last_updated: "2026-02-24" | Phase 4 | 流式响应 | 4 | ✅ 完成 | | Phase 5 | 集成测试 | 3 | ✅ 完成 | | Phase 6 | 前后端联调真实对接 | 9 | ✅ 完成 | +| Phase 7 | 嵌入模型可插拔与文档解析 | 21 | ✅ 完成 | -**总计: 32 个任务全部完成** +**已完成: 53 个任务** -**测试统计: 184 tests passing** +--- + +### Phase 7: 嵌入模型可插拔与文档解析支持(v0.3.0 迭代) +- [x] T7.1 设计 `EmbeddingProvider` 抽象基类:定义 `embed()`、`embed_batch()`、`get_dimension()` 接口 `[AC-AISVC-29]` ✅ +- [x] T7.2 实现 `EmbeddingProviderFactory` 工厂类:支持根据配置动态加载提供者 `[AC-AISVC-30]` ✅ +- [x] T7.3 实现 `OllamaEmbeddingProvider`:封装 Ollama API 调用 `[AC-AISVC-29, AC-AISVC-30]` ✅ +- [x] T7.4 实现 `OpenAIEmbeddingProvider`:封装 OpenAI Embedding API `[AC-AISVC-29, AC-AISVC-30]` ✅ +- [x] T7.5 实现嵌入配置管理:支持动态配置与热更新 `[AC-AISVC-31]` ✅ +- [x] T7.6 实现嵌入模型错误处理与 fallback 策略 `[AC-AISVC-32]` ✅ +- [x] T7.7 实现 `DocumentParser` 抽象接口:定义 `parse()` 方法返回纯文本 `[AC-AISVC-33]` ✅ +- [x] T7.8 实现 `PDFParser`:使用 PyMuPDF/pdfplumber 解析 PDF `[AC-AISVC-33]` ✅ +- [x] T7.9 实现 `WordParser`:使用 python-docx 解析 Word 文档 `[AC-AISVC-34]` ✅ +- [x] T7.10 实现 `ExcelParser`:使用 openpyxl 解析 Excel 文档 `[AC-AISVC-35]` ✅ +- [x] T7.11 实现 `DocumentParserFactory`:根据文件扩展名选择解析器 `[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35]` ✅ +- [x] T7.12 实现文档解析错误处理与格式校验 `[AC-AISVC-36, AC-AISVC-37]` ✅ +- [x] T7.13 实现 `GET /admin/embedding/providers` API:返回可用提供者列表 `[AC-AISVC-38]` ✅ +- [x] T7.14 实现 `GET /admin/embedding/config` API:返回当前配置 `[AC-AISVC-39]` ✅ +- [x] T7.15 实现 `PUT /admin/embedding/config` API:更新配置 `[AC-AISVC-40]` ✅ +- [x] T7.16 实现 `POST /admin/embedding/test` API:测试嵌入连接 `[AC-AISVC-41]` ✅ +- [x] T7.17 集成嵌入服务到索引流程:替换现有硬编码 Ollama 调用 `[AC-AISVC-29]` ✅ +- [x] T7.18 集成文档解析到上传流程:支持多格式文档上传 `[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35]` ✅ +- [x] T7.19 编写嵌入服务单元测试 `[AC-AISVC-29, AC-AISVC-30, AC-AISVC-31, AC-AISVC-32]` ✅ +- [x] T7.20 编写文档解析单元测试 `[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35, AC-AISVC-36, AC-AISVC-37]` ✅ +- [x] T7.21 编写嵌入管理 API 集成测试 `[AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41]` ✅