# Listing metadata: 100 lines, 2.9 KiB, Python
"""
|
|
Text file parser implementation.
|
|
[AC-AISVC-33] Text file parsing for plain text and markdown.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from app.services.document.base import (
|
|
DocumentParseException,
|
|
DocumentParser,
|
|
ParseResult,
|
|
)
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Fallback encodings, attempted in this order when decoding a text file.
# "latin-1" stays last: it assigns a character to every possible byte value,
# so it always decodes and therefore acts as the catch-all.
ENCODINGS_TO_TRY = [
    "utf-8",
    "gbk",
    "gb2312",
    "gb18030",
    "big5",
    "utf-16",
    "latin-1",
]
|
|
|
|
|
|
class TextParser(DocumentParser):
    """
    Parser for plain text files.

    [AC-AISVC-33] Direct text extraction with multiple encoding support.

    The file is read once as raw bytes. Decoding is attempted with the
    encoding given to the constructor first, then with every entry of
    ``ENCODINGS_TO_TRY`` in order; the first encoding that decodes without
    error is used.
    """

    def __init__(self, encoding: str = "utf-8", **kwargs: Any):
        """
        Store the parser configuration.

        Args:
            encoding: Preferred encoding; it is tried before the module-level
                fallback list.
            **kwargs: Extra configuration, stored unchanged and not
                interpreted by this class.
        """
        self._encoding = encoding
        self._extra_config = kwargs

    def _candidate_encodings(self) -> list[str]:
        """Return the preferred encoding followed by the fallbacks, de-duplicated."""
        preferred = (
            [self._encoding]
            if isinstance(self._encoding, str) and self._encoding
            else []
        )
        # dict.fromkeys drops repeats while keeping first-seen order, so the
        # default "utf-8" is not attempted twice.
        return list(dict.fromkeys([*preferred, *ENCODINGS_TO_TRY]))

    def _try_encodings(self, path: Path) -> tuple[str, str]:
        """
        Decode the file with the first encoding that works.

        Args:
            path: File to read.

        Returns:
            (text, encoding_used)

        Raises:
            DocumentParseException: If no candidate encoding can decode the
                file. With "latin-1" in the fallback list this is only a
                safeguard, because latin-1 accepts any byte sequence.
            OSError: If the file cannot be read (handled by ``parse``).
        """
        # Read the bytes once instead of re-opening the file per encoding.
        raw = path.read_bytes()

        for enc in self._candidate_encodings():
            try:
                decoded = raw.decode(enc)
            except (UnicodeDecodeError, LookupError):
                # Wrong or unknown encoding: move on to the next candidate.
                continue

            # Text-mode reads translate "\r\n" and "\r" to "\n"; do the same
            # here so the returned text keeps that newline convention.
            text = decoded.replace("\r\n", "\n").replace("\r", "\n")
            logger.info("Successfully parsed with encoding: %s", enc)
            return text, enc

        raise DocumentParseException(
            "Failed to decode file with any known encoding",
            file_path=str(path),
            parser="text",
        )

    def parse(self, file_path: str | Path) -> ParseResult:
        """
        Parse a text file and extract content.

        [AC-AISVC-33] Direct file reading.

        Args:
            file_path: Location of the file to parse.

        Returns:
            A ``ParseResult`` carrying the decoded text, the source path, the
            on-disk size in bytes, and metadata with ``format``,
            ``line_count`` and ``encoding``.

        Raises:
            DocumentParseException: If the file does not exist, cannot be
                decoded, or any other error occurs while parsing (the
                original error is attached as the cause).
        """
        path = Path(file_path)

        if not path.exists():
            raise DocumentParseException(
                f"File not found: {path}",
                file_path=str(path),
                parser="text",
            )

        try:
            text, encoding_used = self._try_encodings(path)

            file_size = path.stat().st_size
            # Newline separators + 1: a trailing newline therefore counts an
            # extra (empty) final line, and an empty file reports 1.
            line_count = text.count("\n") + 1

            logger.info(
                "Parsed text: %s, lines=%s, chars=%s, size=%s, encoding=%s",
                path.name,
                line_count,
                len(text),
                file_size,
                encoding_used,
            )

            return ParseResult(
                text=text,
                source_path=str(path),
                file_size=file_size,
                metadata={
                    "format": "text",
                    "line_count": line_count,
                    "encoding": encoding_used,
                },
            )

        except DocumentParseException:
            # Already in the parser's own error type; pass through untouched.
            raise
        except Exception as e:
            # Wrap anything else (e.g. OSError) and keep the original cause.
            raise DocumentParseException(
                f"Failed to parse text file: {e}",
                file_path=str(path),
                parser="text",
                details={"error": str(e)},
            ) from e

    def get_supported_extensions(self) -> list[str]:
        """Get supported file extensions."""
        return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"]
|