diff --git a/README.md b/README.md index 65b5f23..96b84e5 100644 --- a/README.md +++ b/README.md @@ -78,14 +78,36 @@ docker-compose up -d --build #### 4. 拉取嵌入模型 -服务启动后,需要在 Ollama 容器中拉取 nomic-embed-text 模型: +服务启动后,需要在 Ollama 容器中拉取嵌入模型。推荐使用 `nomic-embed-text-v2-moe`,对中文支持更好: ```bash # 进入 Ollama 容器拉取模型 -docker exec -it ai-ollama ollama pull nomic-embed-text +docker exec -it ai-ollama ollama pull toshk0/nomic-embed-text-v2-moe:Q6_K ``` -#### 5. 验证服务 +**可选模型**: + +| 模型 | 维度 | 说明 | +|------|------|------| +| `toshk0/nomic-embed-text-v2-moe:Q6_K` | 768 | 推荐,中文支持好,支持任务前缀 | +| `nomic-embed-text:v1.5` | 768 | 原版,支持任务前缀和 Matryoshka | +| `bge-large-zh` | 1024 | 中文专用,效果最好 | + +#### 5. 配置嵌入模型 + +访问前端管理界面,进入 **嵌入模型配置** 页面: + +1. 选择提供者:**Nomic Embed (优化版)** +2. 配置参数: + - **API 地址**:`http://ollama:11434`(Docker 环境)或 `http://localhost:11434`(本地开发) + - **模型名称**:`toshk0/nomic-embed-text-v2-moe:Q6_K` + - **向量维度**:`768` + - **Matryoshka 截断**:`true` +3. 点击 **保存配置** + +> **注意**: 使用 Nomic Embed (优化版) provider 可启用完整的 RAG 优化功能:任务前缀、Matryoshka 多向量、两阶段检索。 + +#### 6. 验证服务 ```bash # 检查服务状态 @@ -97,7 +119,7 @@ docker compose logs -f ai-service | grep "Default API Key" > **重要**: 后端首次启动时会自动生成一个默认 API Key,请从日志中复制该 Key,用于前端配置。 -#### 6. 配置前端 API Key +#### 7. 配置前端 API Key ```bash # 创建前端环境变量文件 diff --git a/ai-service-admin/vite.config.ts b/ai-service-admin/vite.config.ts index 6df2553..b25653b 100644 --- a/ai-service-admin/vite.config.ts +++ b/ai-service-admin/vite.config.ts @@ -14,7 +14,7 @@ export default defineConfig({ port: 3000, proxy: { '/api': { - target: 'http://localhost:8000', + target: 'http://localhost:8088', changeOrigin: true, rewrite: (path) => path.replace(/^\/api/, ''), }, diff --git a/ai-service/app/core/qdrant_client.py b/ai-service/app/core/qdrant_client.py index 5742b5a..85639d2 100644 --- a/ai-service/app/core/qdrant_client.py +++ b/ai-service/app/core/qdrant_client.py @@ -176,6 +176,7 @@ class QdrantClient: limit: int = 5, score_threshold: float | None = None, vector_name: str = "full", + with_vectors: bool = False, ) -> list[dict[str, Any]]: """ [AC-AISVC-10] Search vectors in tenant's collection. @@ -189,6 +190,7 @@ class QdrantClient: score_threshold: Minimum score threshold for results vector_name: Name of the vector to search (for multi-vector collections) Default is "full" for 768-dim vectors in Matryoshka setup. + with_vectors: Whether to return vectors in results (for two-stage reranking) """ client = await self.get_client() @@ -216,6 +218,7 @@ class QdrantClient: collection_name=collection_name, query_vector=(vector_name, query_vector), limit=limit, + with_vectors=with_vectors, ) except Exception as e: if "vector name" in str(e).lower() or "Not existing vector" in str(e): @@ -227,6 +230,7 @@ class QdrantClient: collection_name=collection_name, query_vector=query_vector, limit=limit, + with_vectors=with_vectors, ) else: raise @@ -235,15 +239,18 @@ class QdrantClient: f"[AC-AISVC-10] Collection {collection_name} returned {len(results)} raw results" ) - hits = [ - { + hits = [] + for result in results: + if score_threshold is not None and result.score < score_threshold: + continue + hit = { "id": str(result.id), "score": result.score, "payload": result.payload or {}, } - for result in results - if score_threshold is None or result.score >= score_threshold - ] + if with_vectors and result.vector: + hit["vector"] = result.vector + hits.append(hit) all_hits.extend(hits) if hits: diff --git a/ai-service/app/services/embedding/factory.py b/ai-service/app/services/embedding/factory.py index e42e506..d0e8ad6 100644 --- a/ai-service/app/services/embedding/factory.py +++ b/ai-service/app/services/embedding/factory.py @@ -74,11 +74,38 @@ class EmbeddingProviderFactory: "nomic": "Nomic-embed-text v1.5 优化版,支持任务前缀和 Matryoshka 维度截断,专为RAG优化", } + raw_schema = temp_instance.get_config_schema() + + properties = {} + required = [] + for key, field in raw_schema.items(): + properties[key] = { + "type": field.get("type", "string"), + "title": field.get("title", key), + "description": field.get("description", ""), + "default": field.get("default"), + } + if field.get("enum"): + properties[key]["enum"] = field.get("enum") + if field.get("minimum") is not None: + properties[key]["minimum"] = field.get("minimum") + if field.get("maximum") is not None: + properties[key]["maximum"] = field.get("maximum") + if field.get("required"): + required.append(key) + + config_schema = { + "type": "object", + "properties": properties, + } + if required: + config_schema["required"] = required + return { "name": name, "display_name": display_names.get(name, name), "description": descriptions.get(name, ""), - "config_schema": temp_instance.get_config_schema(), + "config_schema": config_schema, } @classmethod @@ -286,7 +313,7 @@ def get_embedding_config_manager() -> EmbeddingConfigManager: settings = get_settings() _embedding_config_manager = EmbeddingConfigManager( - default_provider="ollama", + default_provider="nomic", default_config={ "base_url": settings.ollama_base_url, "model": settings.ollama_embedding_model, diff --git a/ai-service/app/services/embedding/nomic_provider.py b/ai-service/app/services/embedding/nomic_provider.py index ba6a73b..cd19825 100644 --- a/ai-service/app/services/embedding/nomic_provider.py +++ b/ai-service/app/services/embedding/nomic_provider.py @@ -149,6 +149,7 @@ class NomicEmbeddingProvider(EmbeddingProvider): embedding_256 = self._truncate_and_normalize(embedding, 256) embedding_512 = self._truncate_and_normalize(embedding, 512) + embedding_full = self._truncate_and_normalize(embedding, len(embedding)) logger.debug( f"Generated Nomic embedding: task={task.value}, " @@ -156,7 +157,7 @@ class NomicEmbeddingProvider(EmbeddingProvider): ) return NomicEmbeddingResult( - embedding_full=embedding, + embedding_full=embedding_full, embedding_256=embedding_256, embedding_512=embedding_512, dimension=len(embedding), @@ -259,26 +260,31 @@ class NomicEmbeddingProvider(EmbeddingProvider): return { "base_url": { "type": "string", + "title": "API 地址", "description": "Ollama API 地址", "default": "http://localhost:11434", }, "model": { "type": "string", + "title": "模型名称", "description": "嵌入模型名称(推荐 nomic-embed-text v1.5)", "default": "nomic-embed-text", }, "dimension": { "type": "integer", + "title": "向量维度", "description": "向量维度(支持 256/512/768)", "default": 768, }, "timeout_seconds": { "type": "integer", + "title": "超时时间", "description": "请求超时时间(秒)", "default": 60, }, "enable_matryoshka": { "type": "boolean", + "title": "Matryoshka 截断", "description": "启用 Matryoshka 维度截断", "default": True, }, diff --git a/ai-service/app/services/embedding/ollama_provider.py b/ai-service/app/services/embedding/ollama_provider.py index 39093e6..c57b0a4 100644 --- a/ai-service/app/services/embedding/ollama_provider.py +++ b/ai-service/app/services/embedding/ollama_provider.py @@ -130,21 +130,25 @@ class OllamaEmbeddingProvider(EmbeddingProvider): return { "base_url": { "type": "string", + "title": "API 地址", "description": "Ollama API 地址", "default": "http://localhost:11434", }, "model": { "type": "string", + "title": "模型名称", "description": "嵌入模型名称", "default": "nomic-embed-text", }, "dimension": { "type": "integer", + "title": "向量维度", "description": "向量维度", "default": 768, }, "timeout_seconds": { "type": "integer", + "title": "超时时间", "description": "请求超时时间(秒)", "default": 60, }, diff --git a/ai-service/app/services/embedding/openai_provider.py b/ai-service/app/services/embedding/openai_provider.py index 0e15aab..31b4a00 100644 --- a/ai-service/app/services/embedding/openai_provider.py +++ b/ai-service/app/services/embedding/openai_provider.py @@ -159,28 +159,33 @@ class OpenAIEmbeddingProvider(EmbeddingProvider): return { "api_key": { "type": "string", + "title": "API 密钥", "description": "OpenAI API 密钥", "required": True, "secret": True, }, "model": { "type": "string", + "title": "模型名称", "description": "嵌入模型名称", "default": "text-embedding-3-small", "enum": list(self.MODEL_DIMENSIONS.keys()), }, "base_url": { "type": "string", + "title": "API 地址", "description": "OpenAI API 地址(支持兼容接口)", "default": "https://api.openai.com/v1", }, "dimension": { "type": "integer", + "title": "向量维度", "description": "向量维度(仅 text-embedding-3 系列支持自定义)", "default": 1536, }, "timeout_seconds": { "type": "integer", + "title": "超时时间", "description": "请求超时时间(秒)", "default": 60, }, diff --git a/ai-service/app/services/retrieval/optimized_retriever.py b/ai-service/app/services/retrieval/optimized_retriever.py index 74d6a21..02cd28d 100644 --- a/ai-service/app/services/retrieval/optimized_retriever.py +++ b/ai-service/app/services/retrieval/optimized_retriever.py @@ -84,7 +84,13 @@ class RRFCombiner: "bm25_rank": -1, "payload": result.get("payload", {}), "id": chunk_id, + "vector": result.get("vector"), } + else: + combined_scores[chunk_id]["vector_score"] = result.get("score", 0.0) + combined_scores[chunk_id]["vector_rank"] = rank + if result.get("vector"): + combined_scores[chunk_id]["vector"] = result.get("vector") combined_scores[chunk_id]["score"] += rrf_score @@ -101,6 +107,7 @@ class RRFCombiner: "bm25_rank": rank, "payload": result.get("payload", {}), "id": chunk_id, + "vector": result.get("vector"), } else: combined_scores[chunk_id]["bm25_score"] = result.get("score", 0.0) @@ -199,7 +206,15 @@ class OptimizedRetriever(BaseRetriever): f"dim_256={'available' if embedding_result.embedding_256 else 'not available'}" ) - if self._two_stage_enabled: + if self._two_stage_enabled and self._hybrid_enabled: + logger.info("[RAG-OPT] Using two-stage + hybrid retrieval strategy") + results = await self._two_stage_hybrid_retrieve( + ctx.tenant_id, + embedding_result, + ctx.query, + self._top_k, + ) + elif self._two_stage_enabled: logger.info("[RAG-OPT] Using two-stage retrieval strategy") results = await self._two_stage_retrieve( ctx.tenant_id, @@ -300,20 +315,27 @@ class OptimizedRetriever(BaseRetriever): stage1_start = time.perf_counter() candidates = await self._search_with_dimension( client, tenant_id, embedding_result.embedding_256, "dim_256", - top_k * self._two_stage_expand_factor + top_k * self._two_stage_expand_factor, + with_vectors=True, ) stage1_latency = (time.perf_counter() - stage1_start) * 1000 - logger.debug( + logger.info( f"[RAG-OPT] Stage 1: {len(candidates)} candidates in {stage1_latency:.2f}ms" ) stage2_start = time.perf_counter() reranked = [] for candidate in candidates: - stored_full_embedding = candidate.get("payload", {}).get("embedding_full", []) - if stored_full_embedding: - import numpy as np + vector_data = candidate.get("vector", {}) + stored_full_embedding = None + + if isinstance(vector_data, dict): + stored_full_embedding = vector_data.get("full", []) + elif isinstance(vector_data, list): + stored_full_embedding = vector_data + + if stored_full_embedding and len(stored_full_embedding) > 0: similarity = self._cosine_similarity( embedding_result.embedding_full, stored_full_embedding @@ -326,7 +348,7 @@ class OptimizedRetriever(BaseRetriever): results = reranked[:top_k] stage2_latency = (time.perf_counter() - stage2_start) * 1000 - logger.debug( + logger.info( f"[RAG-OPT] Stage 2: {len(results)} final results in {stage2_latency:.2f}ms" ) @@ -374,6 +396,92 @@ class OptimizedRetriever(BaseRetriever): return combined[:top_k] + async def _two_stage_hybrid_retrieve( + self, + tenant_id: str, + embedding_result: NomicEmbeddingResult, + query: str, + top_k: int, + ) -> list[dict[str, Any]]: + """ + Two-stage + Hybrid retrieval strategy. + + Stage 1: Fast retrieval with 256-dim vectors + BM25 in parallel + Stage 2: RRF fusion + Precise reranking with 768-dim vectors + + This combines the best of both worlds: + - Two-stage: Speed from 256-dim, precision from 768-dim reranking + - Hybrid: Semantic matching from vectors, keyword matching from BM25 + """ + import time + + client = await self._get_client() + + stage1_start = time.perf_counter() + + vector_task = self._search_with_dimension( + client, tenant_id, embedding_result.embedding_256, "dim_256", + top_k * self._two_stage_expand_factor, + with_vectors=True, + ) + + bm25_task = self._bm25_search(client, tenant_id, query, top_k * self._two_stage_expand_factor) + + vector_results, bm25_results = await asyncio.gather( + vector_task, bm25_task, return_exceptions=True + ) + + if isinstance(vector_results, Exception): + logger.warning(f"[RAG-OPT] Vector search failed: {vector_results}") + vector_results = [] + + if isinstance(bm25_results, Exception): + logger.warning(f"[RAG-OPT] BM25 search failed: {bm25_results}") + bm25_results = [] + + stage1_latency = (time.perf_counter() - stage1_start) * 1000 + logger.info( + f"[RAG-OPT] Two-stage Hybrid Stage 1: vector={len(vector_results)}, bm25={len(bm25_results)}, latency={stage1_latency:.2f}ms" + ) + + stage2_start = time.perf_counter() + + combined = self._rrf_combiner.combine( + vector_results, + bm25_results, + vector_weight=settings.rag_vector_weight, + bm25_weight=settings.rag_bm25_weight, + ) + + reranked = [] + for candidate in combined[:top_k * 2]: + vector_data = candidate.get("vector", {}) + stored_full_embedding = None + + if isinstance(vector_data, dict): + stored_full_embedding = vector_data.get("full", []) + elif isinstance(vector_data, list): + stored_full_embedding = vector_data + + if stored_full_embedding and len(stored_full_embedding) > 0: + similarity = self._cosine_similarity( + embedding_result.embedding_full, + stored_full_embedding + ) + candidate["score"] = similarity + candidate["stage"] = "two_stage_hybrid_reranked" + reranked.append(candidate) + + reranked.sort(key=lambda x: x.get("score", 0), reverse=True) + results = reranked[:top_k] + stage2_latency = (time.perf_counter() - stage2_start) * 1000 + + logger.info( + f"[RAG-OPT] Two-stage Hybrid Stage 2 (reranking): {len(results)} final results in {stage2_latency:.2f}ms" + ) + + return results + async def _vector_retrieve( self, tenant_id: str, @@ -393,12 +501,13 @@ class OptimizedRetriever(BaseRetriever): query_vector: list[float], vector_name: str, limit: int, + with_vectors: bool = False, ) -> list[dict[str, Any]]: """Search using specified vector dimension.""" try: logger.info( f"[RAG-OPT] Searching with vector_name={vector_name}, " - f"limit={limit}, vector_dim={len(query_vector)}" + f"limit={limit}, vector_dim={len(query_vector)}, with_vectors={with_vectors}" ) results = await client.search( @@ -406,6 +515,7 @@ class OptimizedRetriever(BaseRetriever): query_vector=query_vector, limit=limit, vector_name=vector_name, + with_vectors=with_vectors, ) logger.info( diff --git a/ai-service/scripts/cleanup_qdrant.py b/ai-service/scripts/cleanup_qdrant.py new file mode 100644 index 0000000..f84e9ef --- /dev/null +++ b/ai-service/scripts/cleanup_qdrant.py @@ -0,0 +1,89 @@ +""" +Script to cleanup Qdrant collections and data. +""" + +import asyncio +import logging +import sys + +sys.path.insert(0, "Q:\\agentProject\\ai-robot-core\\ai-service") + +from app.core.config import get_settings +from app.core.qdrant_client import get_qdrant_client + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def list_collections(): + """List all collections in Qdrant.""" + client = await get_qdrant_client() + qdrant = await client.get_client() + + collections = await qdrant.get_collections() + return [c.name for c in collections.collections] + + +async def delete_collection(collection_name: str): + """Delete a specific collection.""" + client = await get_qdrant_client() + qdrant = await client.get_client() + + try: + await qdrant.delete_collection(collection_name) + logger.info(f"Deleted collection: {collection_name}") + return True + except Exception as e: + logger.error(f"Failed to delete collection {collection_name}: {e}") + return False + + +async def delete_all_collections(): + """Delete all collections.""" + collections = await list_collections() + logger.info(f"Found {len(collections)} collections: {collections}") + + for name in collections: + await delete_collection(name) + + logger.info("All collections deleted") + + +async def delete_tenant_collection(tenant_id: str): + """Delete collection for a specific tenant.""" + client = await get_qdrant_client() + collection_name = client.get_collection_name(tenant_id) + + success = await delete_collection(collection_name) + if success: + logger.info(f"Deleted collection for tenant: {tenant_id}") + return success + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Cleanup Qdrant data") + parser.add_argument("--all", action="store_true", help="Delete all collections") + parser.add_argument("--tenant", type=str, help="Delete collection for specific tenant") + parser.add_argument("--list", action="store_true", help="List all collections") + + args = parser.parse_args() + + if args.list: + collections = asyncio.run(list_collections()) + print(f"Collections: {collections}") + elif args.all: + confirm = input("Are you sure you want to delete ALL collections? (yes/no): ") + if confirm.lower() == "yes": + asyncio.run(delete_all_collections()) + else: + print("Cancelled") + elif args.tenant: + confirm = input(f"Delete collection for tenant '{args.tenant}'? (yes/no): ") + if confirm.lower() == "yes": + asyncio.run(delete_tenant_collection(args.tenant)) + else: + print("Cancelled") + else: + parser.print_help()