216 lines
6.5 KiB
Python
216 lines
6.5 KiB
Python
"""
|
|
Document parser factory.
|
|
[AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Factory for document parsers.
|
|
|
|
Design reference: progress.md Section 7.2 - DocumentParserFactory
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from app.services.document.base import (
|
|
DocumentParseException,
|
|
DocumentParser,
|
|
ParseResult,
|
|
UnsupportedFormatError,
|
|
)
|
|
from app.services.document.excel_parser import CSVParser, ExcelParser
|
|
from app.services.document.pdf_parser import PDFParser, PDFPlumberParser
|
|
from app.services.document.text_parser import TextParser
|
|
from app.services.document.word_parser import WordParser
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentParserFactory:
    """
    Factory for creating document parsers.

    [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Auto-selects parser based on file extension.

    The parser registry and the extension map are class-level dictionaries.
    They are filled lazily by ``_initialize`` the first time any public
    classmethod is called, and can be extended at runtime through
    ``register_parser``.
    """

    # Registry: parser name -> parser class.
    _parsers: dict[str, type[DocumentParser]] = {}
    # Lookup table: lower-cased, dot-prefixed extension -> parser name.
    _extension_map: dict[str, str] = {}

    # Labels reported by get_parser_info() for the built-in parsers.
    # Parsers registered at runtime fall back to their own name.
    _DISPLAY_NAMES: dict[str, str] = {
        "pdf": "PDF 文档",
        "pdfplumber": "PDF 文档 (pdfplumber)",
        "word": "Word 文档",
        "excel": "Excel 电子表格",
        "csv": "CSV 文件",
        "text": "文本文件",
    }

    # Descriptions reported by get_parser_info() for the built-in parsers.
    # Parsers registered at runtime fall back to an empty description.
    _DESCRIPTIONS: dict[str, str] = {
        "pdf": "使用 PyMuPDF 解析 PDF 文档,速度快",
        "pdfplumber": "使用 pdfplumber 解析 PDF 文档,表格提取效果更好",
        "word": "解析 Word 文档 (.docx),保留段落结构",
        "excel": "解析 Excel 电子表格,支持多工作表",
        "csv": "解析 CSV 文件,自动检测编码",
        "text": "解析纯文本文件,支持多种编码",
    }

    @classmethod
    def _initialize(cls) -> None:
        """Populate the default parsers and extension map (no-op once populated)."""
        if cls._parsers:
            return

        cls._parsers = {
            "pdf": PDFParser,
            "pdfplumber": PDFPlumberParser,
            "word": WordParser,
            "excel": ExcelParser,
            "csv": CSVParser,
            "text": TextParser,
        }

        cls._extension_map = {
            ".pdf": "pdf",
            ".docx": "word",
            ".xlsx": "excel",
            ".xls": "excel",
            ".csv": "csv",
            ".txt": "text",
            ".md": "text",
            ".markdown": "text",
            ".rst": "text",
            ".log": "text",
            ".json": "text",
            ".xml": "text",
            ".yaml": "text",
            ".yml": "text",
        }

    @staticmethod
    def _normalize_extension(extension: str) -> str:
        """Return *extension* lower-cased and with a leading dot."""
        normalized = extension.lower()
        if not normalized.startswith("."):
            normalized = f".{normalized}"
        return normalized

    @classmethod
    def _parser_class_for_extension(cls, extension: str) -> type[DocumentParser]:
        """
        Resolve the parser class mapped to *extension* without instantiating it.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        cls._initialize()

        normalized = cls._normalize_extension(extension)
        if normalized not in cls._extension_map:
            raise UnsupportedFormatError(normalized, cls.get_supported_extensions())

        return cls._parsers[cls._extension_map[normalized]]

    @classmethod
    def register_parser(
        cls,
        name: str,
        parser_class: type[DocumentParser],
        extensions: list[str],
    ) -> None:
        """
        Register a new document parser.

        [AC-AISVC-33] Allows runtime registration of parsers.

        Args:
            name: Registry key for the parser; an existing entry with the
                same name is replaced.
            parser_class: Parser class to instantiate for this name.
            extensions: File extensions to route to this parser. Each one is
                lower-cased and given a leading dot if it lacks one, so
                ``"foo"`` and ``".FOO"`` both register ``".foo"``.
        """
        cls._initialize()
        cls._parsers[name] = parser_class
        for ext in extensions:
            # Store keys in the same form that lookups use; otherwise an
            # extension registered without a dot could never be matched.
            cls._extension_map[cls._normalize_extension(ext)] = name
        logger.info(f"Registered document parser: {name} for extensions: {extensions}")

    @classmethod
    def get_supported_extensions(cls) -> list[str]:
        """
        Get all supported file extensions.

        [AC-AISVC-37] Returns list of supported formats.

        Returns:
            A new list with every key of the extension map.
        """
        cls._initialize()
        return list(cls._extension_map)

    @classmethod
    def get_parser_for_extension(cls, extension: str) -> DocumentParser:
        """
        Get a parser instance for a file extension.

        [AC-AISVC-33] Creates appropriate parser based on extension.

        Args:
            extension: File extension, with or without the leading dot,
                in any letter case.

        Returns:
            A parser instance created with no constructor arguments.

        Raises:
            UnsupportedFormatError: If no parser is mapped to the extension.
        """
        parser_class = cls._parser_class_for_extension(extension)
        return parser_class()

    @classmethod
    def parse_file(
        cls,
        file_path: str | Path,
        parser_name: str | None = None,
        parser_config: dict[str, Any] | None = None,
    ) -> ParseResult:
        """
        Parse a document file.

        [AC-AISVC-33, AC-AISVC-34, AC-AISVC-35] Main entry point for parsing.

        Args:
            file_path: Path to the document file
            parser_name: Optional specific parser to use
            parser_config: Optional configuration for the parser, passed to
                the parser constructor as keyword arguments

        Returns:
            ParseResult with extracted text and metadata

        Raises:
            UnsupportedFormatError: If file format is not supported
            DocumentParseException: If parsing fails
        """
        cls._initialize()

        path = Path(file_path)

        if parser_name:
            if parser_name not in cls._parsers:
                raise DocumentParseException(
                    f"Unknown parser: {parser_name}",
                    file_path=str(path),
                    parser="factory",
                )
            parser_class = cls._parsers[parser_name]
        else:
            parser_class = cls._parser_class_for_extension(path.suffix)

        # Build the parser exactly once, with the caller's configuration.
        # Creating a default instance first and then re-creating it would
        # fail for parsers whose constructor requires the supplied config.
        parser = parser_class(**(parser_config or {}))

        return parser.parse(path)

    @classmethod
    def get_parser_info(cls) -> list[dict[str, Any]]:
        """
        Get information about available parsers.

        [AC-AISVC-37] Returns parser metadata.

        Returns:
            One dict per registered parser with the keys ``name``,
            ``display_name``, ``description`` and ``extensions``.
        """
        cls._initialize()

        info = []
        for name, parser_class in cls._parsers.items():
            # __new__ creates the instance without running __init__.
            # NOTE(review): this assumes get_supported_extensions() does not
            # read state set up in __init__ - confirm for custom parsers.
            temp_instance = parser_class.__new__(parser_class)
            extensions = temp_instance.get_supported_extensions()

            info.append({
                "name": name,
                "display_name": cls._DISPLAY_NAMES.get(name, name),
                "description": cls._DESCRIPTIONS.get(name, ""),
                "extensions": extensions,
            })

        return info
|
|
|
|
|
|
def parse_document(
    file_path: str | Path,
    parser_name: str | None = None,
    parser_config: dict[str, Any] | None = None,
) -> ParseResult:
    """
    Parse a document through the shared factory.

    [AC-AISVC-33] Simple entry point for document parsing.

    All three arguments are forwarded unchanged to
    ``DocumentParserFactory.parse_file`` and its result is returned as-is.
    """
    result = DocumentParserFactory.parse_file(
        file_path,
        parser_name=parser_name,
        parser_config=parser_config,
    )
    return result
|
|
|
|
|
|
def get_supported_document_formats() -> list[str]:
    """
    List the file extensions the factory can currently handle.

    [AC-AISVC-37] Returns supported format extensions.

    Delegates to ``DocumentParserFactory.get_supported_extensions``.
    """
    supported_formats = DocumentParserFactory.get_supported_extensions()
    return supported_formats
|