""" Text file parser implementation. [AC-AISVC-33] Text file parsing for plain text and markdown. """ import logging from pathlib import Path from typing import Any from app.services.document.base import ( DocumentParseException, DocumentParser, ParseResult, ) logger = logging.getLogger(__name__) ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"] class TextParser(DocumentParser): """ Parser for plain text files. [AC-AISVC-33] Direct text extraction with multiple encoding support. """ def __init__(self, encoding: str = "utf-8", **kwargs: Any): self._encoding = encoding self._extra_config = kwargs def _try_encodings(self, path: Path) -> tuple[str, str]: """ Try multiple encodings to read the file. Returns: (text, encoding_used) """ for enc in ENCODINGS_TO_TRY: try: with open(path, encoding=enc) as f: text = f.read() logger.info(f"Successfully parsed with encoding: {enc}") return text, enc except (UnicodeDecodeError, LookupError): continue raise DocumentParseException( "Failed to decode file with any known encoding", file_path=str(path), parser="text" ) def parse(self, file_path: str | Path) -> ParseResult: """ Parse a text file and extract content. [AC-AISVC-33] Direct file reading. """ path = Path(file_path) if not path.exists(): raise DocumentParseException( f"File not found: {path}", file_path=str(path), parser="text" ) try: text, encoding_used = self._try_encodings(path) file_size = path.stat().st_size line_count = text.count("\n") + 1 logger.info( f"Parsed text: {path.name}, lines={line_count}, " f"chars={len(text)}, size={file_size}, encoding={encoding_used}" ) return ParseResult( text=text, source_path=str(path), file_size=file_size, metadata={ "format": "text", "line_count": line_count, "encoding": encoding_used, } ) except DocumentParseException: raise except Exception as e: raise DocumentParseException( f"Failed to parse text file: {e}", file_path=str(path), parser="text", details={"error": str(e)} ) def get_supported_extensions(self) -> list[str]: """Get supported file extensions.""" return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"]