feat(AISVC-T7): 嵌入模型可插拔设计与文档解析支持 [AC-AISVC-29, AC-AISVC-30, AC-AISVC-31, AC-AISVC-32, AC-AISVC-33, AC-AISVC-34, AC-AISVC-35, AC-AISVC-36, AC-AISVC-37, AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41]
- 新增 EmbeddingProvider 抽象基类和工厂模式 [AC-AISVC-29, AC-AISVC-30]
- 实现 OllamaEmbeddingProvider 和 OpenAIEmbeddingProvider [AC-AISVC-29, AC-AISVC-30]
- 新增 EmbeddingConfigManager 支持配置热更新 [AC-AISVC-31, AC-AISVC-32]
- 新增 DocumentParser 抽象接口和工厂类 [AC-AISVC-33]
- 实现 PDF/Word/Excel/Text 文档解析器 [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35]
- 新增嵌入管理 API 端点 [AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41]
- 更新文档上传流程支持多格式文档解析 [AC-AISVC-36, AC-AISVC-37]
- 更新 OpenAPI 契约添加嵌入管理接口
- 添加数据库初始化脚本
- 更新规范文档标记 Phase 7 完成
2026-02-24 15:08:08 +00:00
|
|
|
"""
|
|
|
|
|
PDF document parser implementation.
|
|
|
|
|
[AC-AISVC-33] PDF parsing using PyMuPDF (fitz).
|
|
|
|
|
|
|
|
|
|
Extracts text content from PDF files.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from app.services.document.base import (
|
|
|
|
|
DocumentParseException,
|
|
|
|
|
DocumentParser,
|
2026-02-24 17:16:59 +00:00
|
|
|
PageText,
|
feat(AISVC-T7): 嵌入模型可插拔设计与文档解析支持 [AC-AISVC-29, AC-AISVC-30, AC-AISVC-31, AC-AISVC-32, AC-AISVC-33, AC-AISVC-34, AC-AISVC-35, AC-AISVC-36, AC-AISVC-37, AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41]
- 新增 EmbeddingProvider 抽象基类和工厂模式 [AC-AISVC-29, AC-AISVC-30]
- 实现 OllamaEmbeddingProvider 和 OpenAIEmbeddingProvider [AC-AISVC-29, AC-AISVC-30]
- 新增 EmbeddingConfigManager 支持配置热更新 [AC-AISVC-31, AC-AISVC-32]
- 新增 DocumentParser 抽象接口和工厂类 [AC-AISVC-33]
- 实现 PDF/Word/Excel/Text 文档解析器 [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35]
- 新增嵌入管理 API 端点 [AC-AISVC-38, AC-AISVC-39, AC-AISVC-40, AC-AISVC-41]
- 更新文档上传流程支持多格式文档解析 [AC-AISVC-36, AC-AISVC-37]
- 更新 OpenAPI 契约添加嵌入管理接口
- 添加数据库初始化脚本
- 更新规范文档标记 Phase 7 完成
2026-02-24 15:08:08 +00:00
|
|
|
ParseResult,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFParser(DocumentParser):
    """
    Parser for PDF documents.

    [AC-AISVC-33] Uses PyMuPDF (``fitz``) for text extraction. The library is
    imported lazily on first use, so constructing the parser does not require
    PyMuPDF to be installed.
    """

    def __init__(self, extract_images: bool = False, **kwargs: Any):
        """
        Store the parser configuration.

        Args:
            extract_images: Stored on the instance; the code in this class
                does not consult it.
            **kwargs: Extra configuration, kept as-is in ``_extra_config``.
        """
        self._extract_images = extract_images
        self._extra_config = kwargs
        # Cached PyMuPDF module, populated by _get_fitz() on first use.
        self._fitz = None

    def _get_fitz(self):
        """
        Lazy import of PyMuPDF.

        Returns:
            The imported ``fitz`` module (cached after the first call).

        Raises:
            DocumentParseException: If PyMuPDF is not installed.
        """
        if self._fitz is None:
            try:
                import fitz
            except ImportError as exc:
                raise DocumentParseException(
                    "PyMuPDF (fitz) not installed. Install with: pip install pymupdf",
                    parser="pdf"
                ) from exc
            self._fitz = fitz
        return self._fitz

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a PDF document and extract text content.

        [AC-AISVC-33] Extracts text from all pages. Pages whose stripped text
        is empty are omitted from both ``pages`` and the combined text, while
        ``page_count`` always reflects the total number of pages.

        Args:
            file_path: Location of the PDF file.

        Returns:
            A ``ParseResult`` with the combined text (each page prefixed by a
            ``[Page N]`` marker), per-page texts, file size and page count.

        Raises:
            DocumentParseException: If the file does not exist, has an
                unsupported extension, PyMuPDF is missing, or parsing fails.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="pdf"
            )

        if not self.supports_extension(path.suffix):
            raise DocumentParseException(
                f"Unsupported file extension: {path.suffix}",
                file_path=str(path),
                parser="pdf"
            )

        fitz = self._get_fitz()

        try:
            doc = fitz.open(path)
            try:
                pages: list[PageText] = []
                text_parts: list[str] = []
                page_count = len(doc)

                for index in range(page_count):
                    text = doc[index].get_text().strip()
                    if text:
                        # Page numbers exposed to callers are 1-based.
                        number = index + 1
                        pages.append(PageText(page=number, text=text))
                        text_parts.append(f"[Page {number}]\n{text}")
            finally:
                # Always release the document handle, even when a page
                # fails to extract.
                doc.close()

            full_text = "\n\n".join(text_parts)
            file_size = path.stat().st_size

            logger.info(
                "Parsed PDF: %s, pages=%s, chars=%s, size=%s",
                path.name,
                page_count,
                len(full_text),
                file_size,
            )

            return ParseResult(
                text=full_text,
                source_path=str(path),
                file_size=file_size,
                page_count=page_count,
                metadata={
                    "format": "pdf",
                    "page_count": page_count,
                },
                pages=pages,
            )
        except DocumentParseException:
            raise
        except Exception as exc:
            # Wrap any library/IO failure while keeping the original cause
            # attached for debugging.
            raise DocumentParseException(
                f"Failed to parse PDF: {exc}",
                file_path=str(path),
                parser="pdf",
                details={"error": str(exc)}
            ) from exc

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".pdf"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFPlumberParser(DocumentParser):
    """
    Alternative PDF parser using pdfplumber.

    [AC-AISVC-33] Uses pdfplumber for text extraction.

    pdfplumber is better for table extraction but slower than PyMuPDF.
    The library is imported lazily on first use.
    """

    def __init__(self, extract_tables: bool = True, **kwargs: Any):
        """
        Store the parser configuration.

        Args:
            extract_tables: When true, tables found on each page are rendered
                as text and appended to that page's text.
            **kwargs: Extra configuration, kept as-is in ``_extra_config``.
        """
        self._extract_tables = extract_tables
        self._extra_config = kwargs
        # Cached pdfplumber module, populated by _get_pdfplumber() on first use.
        self._pdfplumber = None

    def _get_pdfplumber(self):
        """
        Lazy import of pdfplumber.

        Returns:
            The imported ``pdfplumber`` module (cached after the first call).

        Raises:
            DocumentParseException: If pdfplumber is not installed.
        """
        if self._pdfplumber is None:
            try:
                import pdfplumber
            except ImportError as exc:
                raise DocumentParseException(
                    "pdfplumber not installed. Install with: pip install pdfplumber",
                    parser="pdfplumber"
                ) from exc
            self._pdfplumber = pdfplumber
        return self._pdfplumber

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a PDF document and extract text content.

        [AC-AISVC-33] Extracts text and optionally tables. Pages whose
        stripped text is empty are omitted from both ``pages`` and the
        combined text, while ``page_count`` reflects the total number of pages.

        Args:
            file_path: Location of the PDF file.

        Returns:
            A ``ParseResult`` with the combined text (each page prefixed by a
            ``[Page N]`` marker), per-page texts, file size and page count.

        Raises:
            DocumentParseException: If the file does not exist, pdfplumber is
                missing, or parsing fails.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="pdfplumber"
            )

        pdfplumber = self._get_pdfplumber()

        try:
            pages: list[PageText] = []
            text_parts: list[str] = []
            page_count = 0

            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)

                # Page numbers exposed to callers are 1-based.
                for number, page in enumerate(pdf.pages, start=1):
                    segments = [page.extract_text() or ""]

                    if self._extract_tables:
                        # Each table is appended after a blank line.
                        segments.extend(
                            f"\n\n{self._format_table(table)}"
                            for table in page.extract_tables()
                        )

                    text = "".join(segments).strip()
                    if text:
                        pages.append(PageText(page=number, text=text))
                        text_parts.append(f"[Page {number}]\n{text}")

            full_text = "\n\n".join(text_parts)
            file_size = path.stat().st_size

            logger.info(
                "Parsed PDF (pdfplumber): %s, pages=%s, chars=%s, size=%s",
                path.name,
                page_count,
                len(full_text),
                file_size,
            )

            return ParseResult(
                text=full_text,
                source_path=str(path),
                file_size=file_size,
                page_count=page_count,
                metadata={
                    "format": "pdf",
                    "parser": "pdfplumber",
                    "page_count": page_count,
                },
                pages=pages,
            )
        except DocumentParseException:
            raise
        except Exception as exc:
            # Wrap any library/IO failure while keeping the original cause
            # attached for debugging.
            raise DocumentParseException(
                f"Failed to parse PDF: {exc}",
                file_path=str(path),
                parser="pdfplumber",
                details={"error": str(exc)}
            ) from exc

    def _format_table(self, table: list[list[str | None]]) -> str:
        """
        Format a table as text.

        Cells in a row are joined with ``" | "`` and rows with newlines.
        ``None`` cells become empty strings; any other value is converted
        with ``str`` (so a falsy value such as ``0`` is preserved).

        Args:
            table: Rows of cell values.

        Returns:
            The rendered table, or an empty string for an empty table.
        """
        if not table:
            return ""

        return "\n".join(
            " | ".join("" if cell is None else str(cell) for cell in row)
            for row in table
        )

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".pdf"]
|