ai-robot-core/ai-service/app/services/document/factory.py

216 lines
6.5 KiB
Python

"""
Document parser factory.
[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Factory for document parsers.
Design reference: progress.md Section 7.2 - DocumentParserFactory
"""
import logging
from pathlib import Path
from typing import Any
from app.services.document.base import (
DocumentParseException,
DocumentParser,
ParseResult,
UnsupportedFormatError,
)
from app.services.document.excel_parser import CSVParser, ExcelParser
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
from app.services.document.text_parser import TextParser
from app.services.document.word_parser import WordParser
logger = logging.getLogger(__name__)
class DocumentParserFactory:
    """
    Factory for creating document parsers.
    [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Auto-selects parser based on file extension.

    All state lives on the class itself: ``_parsers`` maps a parser name to
    its class and ``_extension_map`` maps a normalized file extension
    (lower-case, with a leading dot) to a parser name. Both tables are
    populated lazily by ``_initialize`` the first time any public method runs.
    """

    # Parser name -> parser class.
    _parsers: dict[str, type[DocumentParser]] = {}
    # Normalized extension (e.g. ".pdf") -> parser name (a key of _parsers).
    _extension_map: dict[str, str] = {}

    # Display labels for the built-in parsers, reported by get_parser_info().
    # Parsers without an entry fall back to their registration name.
    _DISPLAY_NAMES: dict[str, str] = {
        "pdf": "PDF 文档",
        "pdfplumber": "PDF 文档 (pdfplumber)",
        "word": "Word 文档",
        "excel": "Excel 电子表格",
        "csv": "CSV 文件",
        "text": "文本文件",
    }
    # Descriptions for the built-in parsers, reported by get_parser_info().
    # Parsers without an entry get an empty description.
    _DESCRIPTIONS: dict[str, str] = {
        "pdf": "使用 PyMuPDF 解析 PDF 文档,速度快",
        "pdfplumber": "使用 pdfplumber 解析 PDF 文档,表格提取效果更好",
        "word": "解析 Word 文档 (.docx),保留段落结构",
        "excel": "解析 Excel 电子表格,支持多工作表",
        "csv": "解析 CSV 文件,自动检测编码",
        "text": "解析纯文本文件,支持多种编码",
    }

    @classmethod
    def _initialize(cls) -> None:
        """Populate the default parser and extension tables (no-op once filled)."""
        if cls._parsers:
            return
        cls._parsers = {
            "pdf": PDFParser,
            "pdfplumber": PDFPlumberParser,
            "word": WordParser,
            "excel": ExcelParser,
            "csv": CSVParser,
            "text": TextParser,
        }
        # "pdfplumber" has no extension of its own: it is only reachable by
        # passing parser_name="pdfplumber" to parse_file().
        cls._extension_map = {
            ".pdf": "pdf",
            ".docx": "word",
            ".xlsx": "excel",
            ".xls": "excel",
            ".csv": "csv",
            ".txt": "text",
            ".md": "text",
            ".markdown": "text",
            ".rst": "text",
            ".log": "text",
            ".json": "text",
            ".xml": "text",
            ".yaml": "text",
            ".yml": "text",
        }

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lower-cased and prefixed with a dot."""
        normalized = extension.lower()
        if not normalized.startswith("."):
            normalized = f".{normalized}"
        return normalized

    @classmethod
    def _parser_class_for_extension(cls, extension: str) -> type[DocumentParser]:
        """
        Look up the parser class mapped to *extension*.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        cls._initialize()
        normalized = cls._normalize_extension(extension)
        parser_name = cls._extension_map.get(normalized)
        if parser_name is None:
            raise UnsupportedFormatError(normalized, cls.get_supported_extensions())
        return cls._parsers[parser_name]

    @classmethod
    def register_parser(
        cls,
        name: str,
        parser_class: type[DocumentParser],
        extensions: list[str],
    ) -> None:
        """
        Register a new document parser.
        [AC-AISVC-33] Allows runtime registration of parsers.

        Args:
            name: Key under which the parser class is stored; an existing
                entry with the same name is replaced.
            parser_class: Parser class to instantiate for this name.
            extensions: File extensions to route to this parser. Each one is
                case-insensitive and may be given with or without the leading
                dot; an extension that is already mapped is re-pointed here.
        """
        cls._initialize()
        cls._parsers[name] = parser_class
        # Store keys in the same normalized form that lookups use, otherwise
        # an extension registered as "foo" could never match ".foo".
        normalized = [cls._normalize_extension(ext) for ext in extensions]
        for ext in normalized:
            cls._extension_map[ext] = name
        logger.info(
            "Registered document parser: %s for extensions: %s", name, normalized
        )

    @classmethod
    def get_supported_extensions(cls) -> list[str]:
        """
        Get all supported file extensions.
        [AC-AISVC-37] Returns list of supported formats.

        Returns:
            A new list of the normalized extensions currently mapped to a parser.
        """
        cls._initialize()
        return list(cls._extension_map)

    @classmethod
    def get_parser_for_extension(cls, extension: str) -> DocumentParser:
        """
        Get a parser instance for a file extension.
        [AC-AISVC-33] Creates appropriate parser based on extension.

        Args:
            extension: File extension, case-insensitive, with or without the
                leading dot.

        Returns:
            A parser instance created with no constructor arguments.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        return cls._parser_class_for_extension(extension)()

    @classmethod
    def parse_file(
        cls,
        file_path: str | Path,
        parser_name: str | None = None,
        parser_config: dict[str, Any] | None = None,
    ) -> ParseResult:
        """
        Parse a document file.
        [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Main entry point for parsing.

        Args:
            file_path: Path to the document file
            parser_name: Optional specific parser to use; when empty or None
                the parser is chosen from the file's suffix
            parser_config: Optional keyword arguments for the parser constructor

        Returns:
            ParseResult with extracted text and metadata

        Raises:
            UnsupportedFormatError: If file format is not supported
            DocumentParseException: If parser_name is not a registered parser
                (parsers may raise it as well when parsing fails)
        """
        cls._initialize()
        path = Path(file_path)
        if parser_name:
            if parser_name not in cls._parsers:
                raise DocumentParseException(
                    f"Unknown parser: {parser_name}",
                    file_path=str(path),
                    parser="factory",
                )
            parser_class = cls._parsers[parser_name]
        else:
            parser_class = cls._parser_class_for_extension(path.suffix)
        # Resolve the class first and construct exactly once, so no default
        # instance is built and thrown away when a config is supplied.
        parser = parser_class(**(parser_config or {}))
        return parser.parse(path)

    @classmethod
    def get_parser_info(cls) -> list[dict[str, Any]]:
        """
        Get information about available parsers.
        [AC-AISVC-37] Returns parser metadata.

        Returns:
            One dict per registered parser with the keys ``name``,
            ``display_name``, ``description`` and ``extensions``. The
            extensions are the ones the parser class itself reports, not the
            factory's extension map.
        """
        cls._initialize()
        info = []
        for name, parser_class in cls._parsers.items():
            # Allocate without running __init__ so no constructor arguments
            # are needed just to ask for the extension list.
            # NOTE(review): assumes get_supported_extensions() does not read
            # state that __init__ would have set - confirm for custom parsers.
            probe = parser_class.__new__(parser_class)
            info.append({
                "name": name,
                "display_name": cls._DISPLAY_NAMES.get(name, name),
                "description": cls._DESCRIPTIONS.get(name, ""),
                "extensions": probe.get_supported_extensions(),
            })
        return info
def parse_document(
    file_path: str | Path,
    parser_name: str | None = None,
    parser_config: dict[str, Any] | None = None,
) -> ParseResult:
    """
    Parse a document through the shared parser factory.
    [AC-AISVC-33] Simple entry point for document parsing.

    Module-level shortcut that forwards all three arguments, unchanged and
    in order, to ``DocumentParserFactory.parse_file`` and returns its result.
    """
    result = DocumentParserFactory.parse_file(
        file_path,
        parser_name,
        parser_config,
    )
    return result
def get_supported_document_formats() -> list[str]:
    """
    Return the file extensions the parser factory can handle.
    [AC-AISVC-37] Returns supported format extensions.

    Module-level shortcut for ``DocumentParserFactory.get_supported_extensions``.
    """
    supported = DocumentParserFactory.get_supported_extensions()
    return supported