feat: 文档索引优化,支持多编码解码和按行分块 [AC-AISVC-21, AC-AISVC-22]

This commit is contained in:
MerCry 2026-02-25 23:09:24 +08:00
parent ac8c33cf94
commit 774744d534
2 changed files with 85 additions and 41 deletions

View File

@ -37,6 +37,42 @@ class TextChunk:
source: str | None = None source: str | None = None
def chunk_text_by_lines(
    text: str,
    min_line_length: int = 10,
    source: str | None = None,
) -> list[TextChunk]:
    """Split *text* into one chunk per line, for line-level retrieval.

    Each surviving line becomes an independent ``TextChunk``; lines whose
    stripped length is below ``min_line_length`` are dropped.

    Args:
        text: The text to split.
        min_line_length: Minimum stripped line length; shorter lines are skipped.
        source: Optional source file path recorded on every chunk.

    Returns:
        A list with one chunk per kept line. NOTE(review): ``start_token`` /
        ``end_token`` carry the line index here, not token offsets.
    """
    # The walrus binding lets the stripped line be both filtered on and stored.
    return [
        TextChunk(
            text=stripped,
            start_token=idx,
            end_token=idx + 1,
            page=None,
            source=source,
        )
        for idx, raw_line in enumerate(text.split('\n'))
        if len(stripped := raw_line.strip()) >= min_line_length
    ]
def chunk_text_with_tiktoken( def chunk_text_with_tiktoken(
text: str, text: str,
chunk_size: int = 512, chunk_size: int = 512,
@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
if file_ext in text_extensions or not file_ext: if file_ext in text_extensions or not file_ext:
logger.info(f"[INDEX] Treating as text file, decoding with UTF-8") logger.info(f"[INDEX] Treating as text file, trying multiple encodings")
text = content.decode("utf-8", errors="ignore") text = None
for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]:
try:
text = content.decode(encoding)
logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}")
break
except (UnicodeDecodeError, LookupError):
continue
if text is None:
text = content.decode("utf-8", errors="replace")
logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
else: else:
logger.info(f"[INDEX] Binary file detected, will parse with document parser") logger.info(f"[INDEX] Binary file detected, will parse with document parser")
await kb_service.update_job_status( await kb_service.update_job_status(
@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
all_chunks: list[TextChunk] = [] all_chunks: list[TextChunk] = []
if parse_result and parse_result.pages: if parse_result and parse_result.pages:
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata") logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
for page in parse_result.pages: for page in parse_result.pages:
page_chunks = chunk_text_with_tiktoken( page_chunks = chunk_text_by_lines(
page.text, page.text,
chunk_size=512, min_line_length=10,
overlap=100,
page=page.page,
source=filename, source=filename,
) )
for pc in page_chunks:
pc.page = page.page
all_chunks.extend(page_chunks) all_chunks.extend(page_chunks)
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}") logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
else: else:
logger.info(f"[INDEX] Using tiktoken chunking without page metadata") logger.info(f"[INDEX] Using line-based chunking")
all_chunks = chunk_text_with_tiktoken( all_chunks = chunk_text_by_lines(
text, text,
chunk_size=512, min_line_length=10,
overlap=100,
source=filename, source=filename,
) )
logger.info(f"[INDEX] Total chunks: {len(all_chunks)}") logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")

View File

@ -15,17 +15,39 @@ from app.services.document.base import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
class TextParser(DocumentParser): class TextParser(DocumentParser):
""" """
Parser for plain text files. Parser for plain text files.
[AC-AISVC-33] Direct text extraction. [AC-AISVC-33] Direct text extraction with multiple encoding support.
""" """
def __init__(self, encoding: str = "utf-8", **kwargs: Any): def __init__(self, encoding: str = "utf-8", **kwargs: Any):
self._encoding = encoding self._encoding = encoding
self._extra_config = kwargs self._extra_config = kwargs
def _try_encodings(self, path: Path) -> tuple[str, str]:
    """Read *path* as text, attempting each encoding in ENCODINGS_TO_TRY.

    Returns:
        A ``(text, encoding_used)`` pair for the first encoding that
        decodes the file without error.

    Raises:
        DocumentParseException: if no candidate encoding works. Since
        "latin-1" accepts any byte sequence, this is a defensive fallback
        that should not fire in practice.
    """
    for candidate in ENCODINGS_TO_TRY:
        try:
            content = path.read_text(encoding=candidate)
        except (UnicodeDecodeError, LookupError):
            # Wrong encoding (or an unknown codec name) — try the next one.
            continue
        logger.info(f"Successfully parsed with encoding: {candidate}")
        return content, candidate
    raise DocumentParseException(
        f"Failed to decode file with any known encoding",
        file_path=str(path),
        parser="text"
    )
def parse(self, file_path: str | Path) -> ParseResult: def parse(self, file_path: str | Path) -> ParseResult:
""" """
Parse a text file and extract content. Parse a text file and extract content.
@ -41,15 +63,14 @@ class TextParser(DocumentParser):
) )
try: try:
with open(path, "r", encoding=self._encoding) as f: text, encoding_used = self._try_encodings(path)
text = f.read()
file_size = path.stat().st_size file_size = path.stat().st_size
line_count = text.count("\n") + 1 line_count = text.count("\n") + 1
logger.info( logger.info(
f"Parsed text: {path.name}, lines={line_count}, " f"Parsed text: {path.name}, lines={line_count}, "
f"chars={len(text)}, size={file_size}" f"chars={len(text)}, size={file_size}, encoding={encoding_used}"
) )
return ParseResult( return ParseResult(
@ -59,35 +80,12 @@ class TextParser(DocumentParser):
metadata={ metadata={
"format": "text", "format": "text",
"line_count": line_count, "line_count": line_count,
"encoding": self._encoding, "encoding": encoding_used,
} }
) )
except UnicodeDecodeError: except DocumentParseException:
try: raise
with open(path, "r", encoding="gbk") as f:
text = f.read()
file_size = path.stat().st_size
line_count = text.count("\n") + 1
return ParseResult(
text=text,
source_path=str(path),
file_size=file_size,
metadata={
"format": "text",
"line_count": line_count,
"encoding": "gbk",
}
)
except Exception as e:
raise DocumentParseException(
f"Failed to parse text file with encoding fallback: {e}",
file_path=str(path),
parser="text",
details={"error": str(e)}
)
except Exception as e: except Exception as e:
raise DocumentParseException( raise DocumentParseException(
f"Failed to parse text file: {e}", f"Failed to parse text file: {e}",