feat(AISVC): Excel/CSV parser converts to JSON format for better RAG retrieval [AC-AISVC-35]
- ExcelParser: convert spreadsheet data to JSON records with the header row as keys
- CSVParser: convert CSV data to JSON records, preserving structure
- Add a `_sheet` field to identify the source worksheet in Excel output
- Preserve numeric types (int/float/bool) in JSON output
- Support UTF-8 with GBK encoding fallback for CSV files
This commit is contained in:
parent
f2116b95f2
commit
e9fee2f80e
|
|
@ -2,9 +2,11 @@
|
||||||
Excel document parser implementation.
|
Excel document parser implementation.
|
||||||
[AC-AISVC-35] Excel (.xlsx) parsing using openpyxl.
|
[AC-AISVC-35] Excel (.xlsx) parsing using openpyxl.
|
||||||
|
|
||||||
Extracts text content from Excel spreadsheets.
|
Extracts text content from Excel spreadsheets and converts to JSON format
|
||||||
|
to preserve structural relationships for better RAG retrieval.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
@ -22,6 +24,7 @@ class ExcelParser(DocumentParser):
|
||||||
"""
|
"""
|
||||||
Parser for Excel documents.
|
Parser for Excel documents.
|
||||||
[AC-AISVC-35] Uses openpyxl for text extraction.
|
[AC-AISVC-35] Uses openpyxl for text extraction.
|
||||||
|
Converts spreadsheet data to JSON format to preserve structure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -48,10 +51,48 @@ class ExcelParser(DocumentParser):
|
||||||
)
|
)
|
||||||
return self._openpyxl
|
return self._openpyxl
|
||||||
|
|
||||||
|
def _sheet_to_records(self, sheet, sheet_name: str) -> list[dict[str, Any]]:
    """
    Convert a worksheet to a list of record dictionaries.

    The first row is treated as the header (column names). Every later
    row becomes one dict tagged with ``"_sheet": sheet_name``.

    Column keys are made unique: a missing header cell becomes
    ``column_<index>``, and a name that repeats (or that equals the
    reserved ``_sheet`` key) gets a numeric suffix (``name_2``,
    ``name_3``, ...) so that no cell silently overwrites another one.

    Args:
        sheet: Worksheet-like object exposing
            ``iter_rows(max_row=..., values_only=True)``.
        sheet_name: Name stored under the ``_sheet`` key of each record.

    Returns:
        One dict per data row. Rows with no non-empty cell are skipped
        unless ``self._include_empty_cells`` is set.
    """
    # Wrap in iter() so the header and the data rows are consumed from the
    # same stream, without materialising the whole sheet in memory.
    row_iter = iter(
        sheet.iter_rows(max_row=self._max_rows_per_sheet, values_only=True)
    )
    header_row = next(row_iter, None)
    if header_row is None:
        return []

    # "_sheet" is reserved for the worksheet tag added to every record.
    taken = {"_sheet"}

    def _claim(base: str) -> str:
        """Return ``base``, or ``base_<n>`` if ``base`` is already used."""
        candidate = base
        suffix = 2
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        return candidate

    keys = [
        _claim(str(h) if h is not None else f"column_{i}")
        for i, h in enumerate(header_row)
    ]

    records: list[dict[str, Any]] = []
    for row in row_iter:
        # Rows wider than the header get positional fallback keys.
        while len(keys) < len(row):
            keys.append(_claim(f"column_{len(keys)}"))

        record: dict[str, Any] = {"_sheet": sheet_name}
        has_content = False
        for key, value in zip(keys, row):
            if value is not None:
                has_content = True
                # Keep JSON-native scalars as-is; stringify everything else.
                if isinstance(value, (int, float, bool)):
                    record[key] = value
                else:
                    record[key] = str(value)
            elif self._include_empty_cells:
                record[key] = None

        if has_content or self._include_empty_cells:
            records.append(record)

    return records
|
||||||
|
|
||||||
def parse(self, file_path: str | Path) -> ParseResult:
|
def parse(self, file_path: str | Path) -> ParseResult:
|
||||||
"""
|
"""
|
||||||
Parse an Excel document and extract text content.
|
Parse an Excel document and extract text content as JSON.
|
||||||
[AC-AISVC-35] Converts spreadsheet data to structured text.
|
[AC-AISVC-35] Converts spreadsheet data to JSON format.
|
||||||
"""
|
"""
|
||||||
path = Path(file_path)
|
path = Path(file_path)
|
||||||
|
|
||||||
|
|
@ -74,53 +115,33 @@ class ExcelParser(DocumentParser):
|
||||||
try:
|
try:
|
||||||
workbook = openpyxl.load_workbook(path, read_only=True, data_only=True)
|
workbook = openpyxl.load_workbook(path, read_only=True, data_only=True)
|
||||||
|
|
||||||
text_parts = []
|
all_records: list[dict[str, Any]] = []
|
||||||
sheet_count = len(workbook.sheetnames)
|
sheet_count = len(workbook.sheetnames)
|
||||||
total_rows = 0
|
total_rows = 0
|
||||||
|
|
||||||
for sheet_name in workbook.sheetnames:
|
for sheet_name in workbook.sheetnames:
|
||||||
sheet = workbook[sheet_name]
|
sheet = workbook[sheet_name]
|
||||||
sheet_text_parts = []
|
records = self._sheet_to_records(sheet, sheet_name)
|
||||||
row_count = 0
|
all_records.extend(records)
|
||||||
|
total_rows += len(records)
|
||||||
for row in sheet.iter_rows(max_row=self._max_rows_per_sheet):
|
|
||||||
row_values = []
|
|
||||||
has_content = False
|
|
||||||
|
|
||||||
for cell in row:
|
|
||||||
value = cell.value
|
|
||||||
if value is not None:
|
|
||||||
has_content = True
|
|
||||||
row_values.append(str(value))
|
|
||||||
elif self._include_empty_cells:
|
|
||||||
row_values.append("")
|
|
||||||
else:
|
|
||||||
row_values.append("")
|
|
||||||
|
|
||||||
if has_content or self._include_empty_cells:
|
|
||||||
sheet_text_parts.append(" | ".join(row_values))
|
|
||||||
row_count += 1
|
|
||||||
|
|
||||||
if sheet_text_parts:
|
|
||||||
text_parts.append(f"[Sheet: {sheet_name}]\n" + "\n".join(sheet_text_parts))
|
|
||||||
total_rows += row_count
|
|
||||||
|
|
||||||
workbook.close()
|
workbook.close()
|
||||||
|
|
||||||
full_text = "\n\n".join(text_parts)
|
json_str = json.dumps(all_records, ensure_ascii=False, indent=2)
|
||||||
file_size = path.stat().st_size
|
file_size = path.stat().st_size
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Parsed Excel: {path.name}, sheets={sheet_count}, "
|
f"Parsed Excel (JSON): {path.name}, sheets={sheet_count}, "
|
||||||
f"rows={total_rows}, chars={len(full_text)}, size={file_size}"
|
f"rows={total_rows}, chars={len(json_str)}, size={file_size}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return ParseResult(
|
return ParseResult(
|
||||||
text=full_text,
|
text=json_str,
|
||||||
source_path=str(path),
|
source_path=str(path),
|
||||||
file_size=file_size,
|
file_size=file_size,
|
||||||
metadata={
|
metadata={
|
||||||
"format": "xlsx",
|
"format": "xlsx",
|
||||||
|
"output_format": "json",
|
||||||
"sheet_count": sheet_count,
|
"sheet_count": sheet_count,
|
||||||
"total_rows": total_rows,
|
"total_rows": total_rows,
|
||||||
}
|
}
|
||||||
|
|
@ -145,6 +166,7 @@ class CSVParser(DocumentParser):
|
||||||
"""
|
"""
|
||||||
Parser for CSV files.
|
Parser for CSV files.
|
||||||
[AC-AISVC-35] Uses Python's built-in csv module.
|
[AC-AISVC-35] Uses Python's built-in csv module.
|
||||||
|
Converts CSV data to JSON format to preserve structure.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, delimiter: str = ",", encoding: str = "utf-8", **kwargs: Any):
|
def __init__(self, delimiter: str = ",", encoding: str = "utf-8", **kwargs: Any):
|
||||||
|
|
@ -152,13 +174,46 @@ class CSVParser(DocumentParser):
|
||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._extra_config = kwargs
|
self._extra_config = kwargs
|
||||||
|
|
||||||
def parse(self, file_path: str | Path) -> ParseResult:
|
def _parse_csv_to_records(self, path: Path, encoding: str) -> list[dict[str, Any]]:
    """
    Parse a CSV file and return a list of record dictionaries.

    The first row is treated as the header. Column keys are made unique:
    an empty header cell becomes ``column_<index>`` and a repeated name
    gets a numeric suffix (``name_2``, ``name_3``, ...) so that no cell
    silently overwrites another one. Empty cells are omitted, and rows
    with no non-empty cell are skipped.

    Args:
        path: Location of the CSV file.
        encoding: Text encoding used to open the file.

    Returns:
        One dict per data row that has content; values are strings.

    Raises:
        UnicodeDecodeError: Propagated from decoding when the file does
            not match ``encoding``.
    """
    import csv

    taken: set[str] = set()

    def _claim(base: str) -> str:
        """Return ``base``, or ``base_<n>`` if ``base`` is already used."""
        candidate = base
        suffix = 2
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        return candidate

    records: list[dict[str, Any]] = []

    with open(path, "r", encoding=encoding, newline="") as f:
        reader = csv.reader(f, delimiter=self._delimiter)

        header_row = next(reader, None)
        if header_row is None:
            return records

        keys = [
            _claim(str(h) if h else f"column_{i}")
            for i, h in enumerate(header_row)
        ]

        # Stream the remaining rows instead of loading the whole file.
        for row in reader:
            # Rows wider than the header get positional fallback keys.
            while len(keys) < len(row):
                keys.append(_claim(f"column_{len(keys)}"))

            record = {key: value for key, value in zip(keys, row) if value}
            if record:
                records.append(record)

    return records
|
||||||
|
|
||||||
|
def parse(self, file_path: str | Path) -> ParseResult:
|
||||||
|
"""
|
||||||
|
Parse a CSV file and extract text content as JSON.
|
||||||
|
[AC-AISVC-35] Converts CSV data to JSON format.
|
||||||
|
"""
|
||||||
path = Path(file_path)
|
path = Path(file_path)
|
||||||
|
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
|
|
@ -169,56 +224,14 @@ class CSVParser(DocumentParser):
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text_parts = []
|
records = self._parse_csv_to_records(path, self._encoding)
|
||||||
row_count = 0
|
row_count = len(records)
|
||||||
|
used_encoding = self._encoding
|
||||||
with open(path, "r", encoding=self._encoding, newline="") as f:
|
|
||||||
reader = csv.reader(f, delimiter=self._delimiter)
|
|
||||||
for row in reader:
|
|
||||||
text_parts.append(" | ".join(row))
|
|
||||||
row_count += 1
|
|
||||||
|
|
||||||
full_text = "\n".join(text_parts)
|
|
||||||
file_size = path.stat().st_size
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Parsed CSV: {path.name}, rows={row_count}, "
|
|
||||||
f"chars={len(full_text)}, size={file_size}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return ParseResult(
|
|
||||||
text=full_text,
|
|
||||||
source_path=str(path),
|
|
||||||
file_size=file_size,
|
|
||||||
metadata={
|
|
||||||
"format": "csv",
|
|
||||||
"row_count": row_count,
|
|
||||||
"delimiter": self._delimiter,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
try:
|
try:
|
||||||
with open(path, "r", encoding="gbk", newline="") as f:
|
records = self._parse_csv_to_records(path, "gbk")
|
||||||
reader = csv.reader(f, delimiter=self._delimiter)
|
row_count = len(records)
|
||||||
for row in reader:
|
used_encoding = "gbk"
|
||||||
text_parts.append(" | ".join(row))
|
|
||||||
row_count += 1
|
|
||||||
|
|
||||||
full_text = "\n".join(text_parts)
|
|
||||||
file_size = path.stat().st_size
|
|
||||||
|
|
||||||
return ParseResult(
|
|
||||||
text=full_text,
|
|
||||||
source_path=str(path),
|
|
||||||
file_size=file_size,
|
|
||||||
metadata={
|
|
||||||
"format": "csv",
|
|
||||||
"row_count": row_count,
|
|
||||||
"delimiter": self._delimiter,
|
|
||||||
"encoding": "gbk",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise DocumentParseException(
|
raise DocumentParseException(
|
||||||
f"Failed to parse CSV with encoding fallback: {e}",
|
f"Failed to parse CSV with encoding fallback: {e}",
|
||||||
|
|
@ -234,6 +247,27 @@ class CSVParser(DocumentParser):
|
||||||
details={"error": str(e)}
|
details={"error": str(e)}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
json_str = json.dumps(records, ensure_ascii=False, indent=2)
|
||||||
|
file_size = path.stat().st_size
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Parsed CSV (JSON): {path.name}, rows={row_count}, "
|
||||||
|
f"chars={len(json_str)}, size={file_size}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return ParseResult(
|
||||||
|
text=json_str,
|
||||||
|
source_path=str(path),
|
||||||
|
file_size=file_size,
|
||||||
|
metadata={
|
||||||
|
"format": "csv",
|
||||||
|
"output_format": "json",
|
||||||
|
"row_count": row_count,
|
||||||
|
"delimiter": self._delimiter,
|
||||||
|
"encoding": used_encoding,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
def get_supported_extensions(self) -> list[str]:
|
def get_supported_extensions(self) -> list[str]:
|
||||||
"""Get supported file extensions."""
|
"""Get supported file extensions."""
|
||||||
return [".csv"]
|
return [".csv"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue