ai-robot-core/ai-service/app/services/document/pdf_parser.py

222 lines
6.7 KiB
Python
Raw Normal View History

"""
PDF document parser implementation.
[AC-AISVC-33] PDF parsing using PyMuPDF (fitz).
Extracts text content from PDF files.
"""
import logging
from pathlib import Path
from typing import Any
from app.services.document.base import (
DocumentParseException,
DocumentParser,
ParseResult,
)
# Module-level logger named after this module (``__name__``).
logger = logging.getLogger(__name__)
class PDFParser(DocumentParser):
    """
    Parser for PDF documents.

    [AC-AISVC-33] Uses PyMuPDF (``fitz``) for text extraction. The library
    is imported lazily on first use, so constructing a parser does not
    require PyMuPDF to be installed.
    """

    def __init__(self, extract_images: bool = False, **kwargs: Any):
        """
        Store the parser configuration.

        Args:
            extract_images: Stored on the instance as ``_extract_images``.
                NOTE(review): ``parse`` in this class never reads it, so it
                currently has no effect on the extracted text.
            **kwargs: Extra options, kept verbatim in ``_extra_config``.
        """
        self._extract_images = extract_images
        self._extra_config = kwargs
        # Cached PyMuPDF module; populated by _get_fitz() on first use.
        self._fitz = None

    def _get_fitz(self):
        """
        Return the PyMuPDF module, importing it on first call.

        Raises:
            DocumentParseException: If PyMuPDF cannot be imported. The
                original ImportError is attached as ``__cause__``.
        """
        if self._fitz is None:
            try:
                import fitz
            except ImportError as exc:
                raise DocumentParseException(
                    "PyMuPDF (fitz) not installed. Install with: pip install pymupdf",
                    parser="pdf"
                ) from exc
            self._fitz = fitz
        return self._fitz

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a PDF document and extract text content.

        [AC-AISVC-33] Extracts text from all pages. Each page whose text is
        not blank is emitted as ``[Page N]`` (1-based) followed by its text;
        pages are separated by a blank line. Blank pages are skipped but
        still counted in ``page_count``.

        Args:
            file_path: Path to the PDF file.

        Returns:
            ParseResult with the joined text, source path, file size in
            bytes, page count and a small metadata dict.

        Raises:
            DocumentParseException: If the file does not exist, its suffix
                is rejected by ``supports_extension``, PyMuPDF is missing,
                or any error occurs while reading the document.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="pdf"
            )

        if not self.supports_extension(path.suffix):
            raise DocumentParseException(
                f"Unsupported file extension: {path.suffix}",
                file_path=str(path),
                parser="pdf"
            )

        fitz = self._get_fitz()

        try:
            doc = fitz.open(path)
            try:
                page_count = len(doc)
                text_parts = []
                for page_num in range(page_count):
                    text = doc[page_num].get_text()
                    if text.strip():
                        text_parts.append(f"[Page {page_num + 1}]\n{text}")
            finally:
                # Always release the document handle, even when a page
                # fails to load or extract.
                doc.close()

            full_text = "\n\n".join(text_parts)
            file_size = path.stat().st_size

            logger.info(
                f"Parsed PDF: {path.name}, pages={page_count}, "
                f"chars={len(full_text)}, size={file_size}"
            )

            return ParseResult(
                text=full_text,
                source_path=str(path),
                file_size=file_size,
                page_count=page_count,
                metadata={
                    "format": "pdf",
                    "page_count": page_count,
                }
            )
        except DocumentParseException:
            raise
        except Exception as e:
            # Wrap any library/OS error while keeping the original
            # traceback available through exception chaining.
            raise DocumentParseException(
                f"Failed to parse PDF: {e}",
                file_path=str(path),
                parser="pdf",
                details={"error": str(e)}
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".pdf"]
class PDFPlumberParser(DocumentParser):
    """
    Alternative PDF parser using pdfplumber.

    [AC-AISVC-33] Uses pdfplumber for text extraction.
    pdfplumber is better for table extraction but slower than PyMuPDF.
    The library is imported lazily on first use.
    """

    def __init__(self, extract_tables: bool = True, **kwargs: Any):
        """
        Store the parser configuration.

        Args:
            extract_tables: When true, ``parse`` appends every table found
                on a page (formatted as text) after that page's text.
            **kwargs: Extra options, kept verbatim in ``_extra_config``.
        """
        self._extract_tables = extract_tables
        self._extra_config = kwargs
        # Cached pdfplumber module; populated by _get_pdfplumber().
        self._pdfplumber = None

    def _get_pdfplumber(self):
        """
        Return the pdfplumber module, importing it on first call.

        Raises:
            DocumentParseException: If pdfplumber cannot be imported. The
                original ImportError is attached as ``__cause__``.
        """
        if self._pdfplumber is None:
            try:
                import pdfplumber
            except ImportError as exc:
                raise DocumentParseException(
                    "pdfplumber not installed. Install with: pip install pdfplumber",
                    parser="pdfplumber"
                ) from exc
            self._pdfplumber = pdfplumber
        return self._pdfplumber

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a PDF document and extract text content.

        [AC-AISVC-33] Extracts text and optionally tables. Each page whose
        combined text is not blank is emitted as ``[Page N]`` (1-based)
        followed by its text; pages are separated by a blank line.

        Args:
            file_path: Path to the PDF file.

        Returns:
            ParseResult with the joined text, source path, file size in
            bytes, page count and a small metadata dict.

        Raises:
            DocumentParseException: If the file does not exist, pdfplumber
                is missing, or any error occurs while reading the document.
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="pdfplumber"
            )

        pdfplumber = self._get_pdfplumber()

        try:
            text_parts = []
            page_count = 0

            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)

                for page_num, page in enumerate(pdf.pages):
                    # extract_text() may yield None; normalise to "".
                    segments = [page.extract_text() or ""]

                    if self._extract_tables:
                        segments.extend(
                            self._format_table(table)
                            for table in page.extract_tables()
                        )

                    # Page text first, then each table, blank-line separated.
                    text = "\n\n".join(segments)

                    if text.strip():
                        text_parts.append(f"[Page {page_num + 1}]\n{text}")

            full_text = "\n\n".join(text_parts)
            file_size = path.stat().st_size

            logger.info(
                f"Parsed PDF (pdfplumber): {path.name}, pages={page_count}, "
                f"chars={len(full_text)}, size={file_size}"
            )

            return ParseResult(
                text=full_text,
                source_path=str(path),
                file_size=file_size,
                page_count=page_count,
                metadata={
                    "format": "pdf",
                    "parser": "pdfplumber",
                    "page_count": page_count,
                }
            )
        except DocumentParseException:
            raise
        except Exception as e:
            # Wrap any library/OS error while keeping the original
            # traceback available through exception chaining.
            raise DocumentParseException(
                f"Failed to parse PDF: {e}",
                file_path=str(path),
                parser="pdfplumber",
                details={"error": str(e)}
            ) from e

    def _format_table(self, table: list[list[str | None]]) -> str:
        """
        Format a table as text.

        Each row becomes one line with cells joined by ``" | "``; falsy
        cells (``None`` or empty) are rendered as empty strings. An empty
        table yields an empty string.
        """
        if not table:
            return ""

        return "\n".join(
            " | ".join(str(cell) if cell else "" for cell in row)
            for row in table
        )

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".pdf"]