"""
Segment Humanizer for Mid Platform.

[AC-MARH-10] 分段策略组件(语义/长度切分)。
[AC-MARH-11] delay 策略租户化配置。

将文本按语义/长度切分为 segments,并生成拟人化 delay。
"""
|
||
|
||
import logging
|
||
import re
|
||
import uuid
|
||
from dataclasses import dataclass, field
|
||
from typing import Any
|
||
|
||
from app.models.mid.schemas import Segment, SegmentStats
|
||
|
||
logger = logging.getLogger(__name__)

# Default humanization parameters (milliseconds / characters).
DEFAULT_MIN_DELAY_MS = 50
DEFAULT_MAX_DELAY_MS = 500
DEFAULT_SEGMENT_MIN_LENGTH = 10
DEFAULT_SEGMENT_MAX_LENGTH = 200


@dataclass
class HumanizeConfig:
    """Humanization settings: delay bounds and segment-length limits."""

    enabled: bool = True
    min_delay_ms: int = DEFAULT_MIN_DELAY_MS
    max_delay_ms: int = DEFAULT_MAX_DELAY_MS
    length_bucket_strategy: str = "simple"
    segment_min_length: int = DEFAULT_SEGMENT_MIN_LENGTH
    segment_max_length: int = DEFAULT_SEGMENT_MAX_LENGTH

    # Field names serialized by to_dict / consumed by from_dict, in order.
    _FIELD_NAMES = (
        "enabled",
        "min_delay_ms",
        "max_delay_ms",
        "length_bucket_strategy",
        "segment_min_length",
        "segment_max_length",
    )

    def to_dict(self) -> dict[str, Any]:
        """Serialize this config to a plain dict (stable key order)."""
        return {name: getattr(self, name) for name in self._FIELD_NAMES}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HumanizeConfig":
        """Build a config from a dict; missing keys fall back to the defaults.

        Extra keys in *data* are ignored.
        """
        base = cls()  # carries the canonical default for every field
        return cls(
            enabled=data.get("enabled", base.enabled),
            min_delay_ms=data.get("min_delay_ms", base.min_delay_ms),
            max_delay_ms=data.get("max_delay_ms", base.max_delay_ms),
            length_bucket_strategy=data.get(
                "length_bucket_strategy", base.length_bucket_strategy
            ),
            segment_min_length=data.get(
                "segment_min_length", base.segment_min_length
            ),
            segment_max_length=data.get(
                "segment_max_length", base.segment_max_length
            ),
        )
|
||
|
||
|
||
@dataclass
class LengthBucket:
    """A text-length interval [min_length, max_length) mapped to a delay."""

    min_length: int  # inclusive lower bound (chars)
    max_length: int  # exclusive upper bound (chars)
    delay_ms: int    # delay applied to segments whose length falls in range


# Default length -> delay table: longer segments get longer delays.
DEFAULT_LENGTH_BUCKETS = [
    LengthBucket(min_length=lo, max_length=hi, delay_ms=ms)
    for lo, hi, ms in (
        (0, 20, 100),
        (20, 50, 200),
        (50, 100, 300),
        (100, 200, 400),
        (200, 10000, 500),
    )
]
|
||
|
||
|
||
class SegmentHumanizer:
    """
    [AC-MARH-10, AC-MARH-11] Segment humanization component.

    Features:
    - Split text into segments by semantic or length boundaries
    - Generate human-like delays between segments
    - Support per-tenant configuration overrides
    - Emit segment_stats for observability
    """

    def __init__(
        self,
        config: HumanizeConfig | None = None,
        length_buckets: list[LengthBucket] | None = None,
    ):
        """
        Args:
            config: Base humanization config; defaults to ``HumanizeConfig()``.
            length_buckets: Length-to-delay table; defaults to the shared
                module-level ``DEFAULT_LENGTH_BUCKETS`` (callers must not
                mutate that list).
        """
        self._config = config or HumanizeConfig()
        self._length_buckets = length_buckets or DEFAULT_LENGTH_BUCKETS

    def humanize(
        self,
        text: str,
        override_config: HumanizeConfig | None = None,
    ) -> tuple[list[Segment], SegmentStats]:
        """
        [AC-MARH-10] Convert text into humanized segments.

        Args:
            text: Input text.
            override_config: Tenant-level config override; replaces (does not
                merge with) the base config for this call.

        Returns:
            Tuple of (segments, segment_stats).
        """
        config = override_config or self._config

        if not config.enabled:
            # Humanization disabled: emit the whole text as one segment with
            # no trailing delay, and mark the strategy as "disabled".
            segments = [Segment(
                segment_id=str(uuid.uuid4()),
                text=text,
                delay_after=0,
            )]
            stats = SegmentStats(
                segment_count=1,
                avg_segment_length=len(text),
                humanize_strategy="disabled",
            )
            return segments, stats

        raw_segments = self._split_text(text, config)
        segments = []

        last_index = len(raw_segments) - 1
        for i, seg_text in enumerate(raw_segments):
            # The final segment carries no delay: nothing follows it.
            delay_after = 0 if i == last_index else self._calculate_delay(seg_text, config)

            segments.append(Segment(
                segment_id=str(uuid.uuid4()),
                text=seg_text,
                delay_after=delay_after,
            ))

        total_length = sum(len(s.text) for s in segments)
        avg_length = total_length / len(segments) if segments else 0.0

        stats = SegmentStats(
            segment_count=len(segments),
            avg_segment_length=avg_length,
            humanize_strategy=config.length_bucket_strategy,
        )

        # Lazy %-args: formatting is skipped entirely when INFO is disabled.
        logger.info(
            "[AC-MARH-10] Humanized text: segments=%d, avg_length=%.1f, strategy=%s",
            len(segments),
            avg_length,
            config.length_bucket_strategy,
        )

        return segments, stats

    def _split_text(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Dispatch to the configured splitting strategy.

        Any strategy value other than "semantic" falls back to simple
        paragraph splitting.
        """
        if config.length_bucket_strategy == "semantic":
            return self._split_semantic(text, config)
        return self._split_simple(text, config)

    def _split_simple(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Simple split: paragraph boundaries, then hard length caps.

        Paragraphs longer than ``segment_max_length`` are further split by
        ``_split_by_length``. Always returns at least one non-empty segment
        for non-blank input.
        """
        paragraphs = re.split(r'\n\s*\n', text.strip())
        segments = []

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            if len(para) <= config.segment_max_length:
                segments.append(para)
            else:
                sub_segments = self._split_by_length(para, config.segment_max_length)
                segments.extend(sub_segments)

        if not segments:
            segments = [text.strip()]

        return [s for s in segments if s.strip()]

    def _split_semantic(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Semantic split: sentence boundaries packed up to segment_max_length."""
        sentence_endings = re.compile(r'([。!?.!?]+)')
        parts = sentence_endings.split(text.strip())

        # re.split with a capture group alternates text / terminator parts;
        # glue each terminator run back onto the sentence it ends.
        sentences = []
        current = ""
        for part in parts:  # (was enumerate with an unused index)
            current += part
            if sentence_endings.match(part):
                sentences.append(current.strip())
                current = ""

        if current.strip():
            sentences.append(current.strip())

        if not sentences:
            sentences = [text.strip()]

        # Greedily pack consecutive sentences into segments under the cap.
        # A single sentence longer than the cap still becomes one segment.
        segments = []
        current_segment = ""

        for sentence in sentences:
            if len(current_segment) + len(sentence) <= config.segment_max_length:
                current_segment += sentence
            else:
                if current_segment:
                    segments.append(current_segment)
                current_segment = sentence

        if current_segment:
            segments.append(current_segment)

        return [s for s in segments if s.strip()]

    def _split_by_length(
        self,
        text: str,
        max_length: int,
    ) -> list[str]:
        """Hard-split text into chunks of at most ``max_length`` characters.

        Prefers a punctuation/space boundary within the last 20 characters
        before the cap; otherwise cuts exactly at the cap.

        Raises:
            ValueError: If ``max_length`` is not positive (previously this
                case looped forever, since ``remaining`` never shrank).
        """
        if max_length <= 0:
            raise ValueError("max_length must be positive")

        segments = []
        remaining = text

        while remaining:
            if len(remaining) <= max_length:
                segments.append(remaining.strip())
                break

            # Look back up to 20 chars for a soft break point.
            split_pos = max_length
            for i in range(max_length - 1, max(0, max_length - 20), -1):
                if remaining[i] in ',,;;:: ':
                    split_pos = i + 1
                    break

            segments.append(remaining[:split_pos].strip())
            remaining = remaining[split_pos:]

        return [s for s in segments if s.strip()]

    def _calculate_delay(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> int:
        """[AC-MARH-11] Map segment length to a delay, clamped to config bounds.

        NOTE(review): text longer than the largest bucket's max_length falls
        through to ``min_delay_ms`` — confirm that is intended (it gives very
        long segments the *shortest* delay).
        """
        text_length = len(text)

        for bucket in self._length_buckets:
            if bucket.min_length <= text_length < bucket.max_length:
                return max(config.min_delay_ms, min(bucket.delay_ms, config.max_delay_ms))

        return config.min_delay_ms

    def get_config(self) -> HumanizeConfig:
        """Return the base (non-overridden) config."""
        return self._config
|
||
|
||
|
||
# Process-wide singleton, created lazily by get_segment_humanizer().
_segment_humanizer: SegmentHumanizer | None = None


def get_segment_humanizer(
    config: HumanizeConfig | None = None,
) -> SegmentHumanizer:
    """Return the shared SegmentHumanizer, creating it on first use.

    Note: ``config`` is only honored on the call that creates the instance;
    subsequent calls return the existing singleton unchanged.
    """
    global _segment_humanizer
    instance = _segment_humanizer
    if instance is None:
        instance = SegmentHumanizer(config=config)
        _segment_humanizer = instance
    return instance
|