""" Text file parser implementation. [AC-AISVC-33] Text file parsing for plain text and markdown. """ import logging from pathlib import Path from typing import Any from app.services.document.base import ( DocumentParseException, DocumentParser, ParseResult, ) logger = logging.getLogger(__name__) class TextParser(DocumentParser): """ Parser for plain text files. [AC-AISVC-33] Direct text extraction. """ def __init__(self, encoding: str = "utf-8", **kwargs: Any): self._encoding = encoding self._extra_config = kwargs def parse(self, file_path: str | Path) -> ParseResult: """ Parse a text file and extract content. [AC-AISVC-33] Direct file reading. """ path = Path(file_path) if not path.exists(): raise DocumentParseException( f"File not found: {path}", file_path=str(path), parser="text" ) try: with open(path, "r", encoding=self._encoding) as f: text = f.read() file_size = path.stat().st_size line_count = text.count("\n") + 1 logger.info( f"Parsed text: {path.name}, lines={line_count}, " f"chars={len(text)}, size={file_size}" ) return ParseResult( text=text, source_path=str(path), file_size=file_size, metadata={ "format": "text", "line_count": line_count, "encoding": self._encoding, } ) except UnicodeDecodeError: try: with open(path, "r", encoding="gbk") as f: text = f.read() file_size = path.stat().st_size line_count = text.count("\n") + 1 return ParseResult( text=text, source_path=str(path), file_size=file_size, metadata={ "format": "text", "line_count": line_count, "encoding": "gbk", } ) except Exception as e: raise DocumentParseException( f"Failed to parse text file with encoding fallback: {e}", file_path=str(path), parser="text", details={"error": str(e)} ) except Exception as e: raise DocumentParseException( f"Failed to parse text file: {e}", file_path=str(path), parser="text", details={"error": str(e)} ) def get_supported_extensions(self) -> list[str]: """Get supported file extensions.""" return [".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"]