@@ -0,0 +1,213 @@
+"""
|
|
|
+中文文本处理工具
|
|
|
+
|
|
|
+提供中文文本标准化、分词、权重计算和同义词扩展功能。
|
|
|
+"""
|
|
|
+
|
|
|
+import re
|
|
|
+import string
|
|
|
+from typing import List, Dict, Tuple, Any
|
|
|
+import jieba
|
|
|
+from src.common.logging_config import get_logger
|
|
|
+
|
|
|
+logger = get_logger(__name__)
|
|
|
+
|
|
|
+
|
|
|
+class ChineseTextProcessor:
+    """
+    Chinese text processor.
+
+    Provides the following features:
+    1. Text normalization
+    2. Chinese word segmentation
+    3. Stopword filtering
+    4. Term-weight calculation
+    5. Synonym expansion
+    """
+
+    def __init__(self):
+        """Initialize the Chinese text processor."""
+        # Load the stopword set once at construction
+        self.stopwords = self._load_stopwords()
+
+    def _load_stopwords(self) -> set:
+        """
+        Load the stopword list.
+
+        Returns:
+            set: Set of stopwords.
+        """
+        # Basic Chinese stopwords
+        stopwords = {
+            '的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也',
+            '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '个',
+            '得', '地', '天', '子', '日', '月', '年', '时', '分', '秒', '中', '国', '民',
+            '大', '小', '多', '少', '下', '左', '右', '前', '后', '里', '外', '内',
+            '高', '低', '长', '短', '宽', '窄', '厚', '薄', '远', '近', '早', '晚', '今', '明',
+            '昨', '周', '春', '夏', '秋', '冬', '东', '南', '西', '北',
+            # English stopwords
+            'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 'which', 'this',
+            'that', 'these', 'those', 'then', 'just', 'so', 'than', 'such', 'both', 'through',
+            'about', 'for', 'is', 'of', 'while', 'during', 'to', 'from', 'in', 'on'
+        }
+        return stopwords
+
+    def normalize_text(self, text: str) -> str:
+        """
+        Normalize raw text.
+
+        Args:
+            text: Raw input text.
+
+        Returns:
+            str: Normalized text.
+        """
+        if not text:
+            return ""
+
+        # 1. Remove URLs (stop at whitespace or CJK characters so that
+        #    Chinese text immediately following a URL is preserved)
+        text = re.sub(r'https?://[^\s\u4e00-\u9fa5]+', '', text)
+        text = re.sub(r'www\.[^\s\u4e00-\u9fa5]+', '', text)
+
+        # 2. Convert full-width characters to half-width before any
+        #    character-class filtering, so full-width letters and digits
+        #    survive the cleanup below
+        def full_to_half(s):
+            result = []
+            for char in s:
+                code = ord(char)
+                if code == 12288:  # full-width space
+                    result.append(' ')
+                elif 65281 <= code <= 65374:  # other full-width characters
+                    result.append(chr(code - 65248))
+                else:
+                    result.append(char)
+            return ''.join(result)
+
+        text = full_to_half(text)
+
+        # 3. Insert a space between adjacent Chinese and Latin characters
+        text = re.sub(r'([a-zA-Z])([\u4e00-\u9fa5])', r'\1 \2', text)
+        text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z])', r'\1 \2', text)
+
+        # 4. Replace anything that is not CJK, Latin, digit, or whitespace
+        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', ' ', text)
+
+        # 5. Lowercase
+        text = text.lower()
+
+        # 6. Collapse runs of whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        return text
+
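+    # NOTE: hand-traced, illustrative example of normalize_text (the sample
+    # sentence is not from the original code):
+    #   normalize_text("学习Python很有趣！") -> "学习 python 很有趣"
+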
+    def split(self, text: str) -> List[str]:
+        """
+        Segment text into words.
+
+        Args:
+            text: Normalized text.
+
+        Returns:
+            List[str]: List of segmented words.
+        """
+        if not text:
+            return []
+
+        # Segment with jieba
+        words = jieba.cut(text)
+
+        # Filter out stopwords and meaningless tokens
+        filtered_words = []
+        for word in words:
+            # Skip stopwords
+            if word in self.stopwords:
+                continue
+            # Skip empty or whitespace-only tokens
+            if not word.strip():
+                continue
+            # Skip purely numeric tokens
+            if word.isdigit():
+                continue
+            filtered_words.append(word)
+
+        return filtered_words
+
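+    # NOTE: with jieba's default dictionary (exact segmentation can vary by
+    # version), split("学习 python 很有趣") would yield roughly
+    # ['学习', 'python', '有趣'], since '很' is a stopword and spaces are dropped.
+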
+    def weights(self, words: List[str]) -> Dict[str, float]:
+        """
+        Compute term weights.
+
+        Args:
+            words: Segmented words.
+
+        Returns:
+            Dict[str, float]: Mapping from word to weight.
+        """
+        if not words:
+            return {}
+
+        # Simple frequency-based weighting
+        word_freq = {}
+        for word in words:
+            word_freq[word] = word_freq.get(word, 0) + 1
+
+        # Normalize the weights
+        total_freq = sum(word_freq.values())
+        weights = {}
+        for word, freq in word_freq.items():
+            # Length factor: longer words get higher weight, capped at 1.0
+            length_factor = min(len(word) / 4, 1.0)
+            # Frequency factor: share of the total token count
+            freq_factor = freq / total_freq
+            # Combined weight: 70% frequency, 30% length
+            weight = (freq_factor * 0.7) + (length_factor * 0.3)
+            weights[word] = round(weight, 4)
+
+        return weights
+
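+    # NOTE: worked example for weights (hypothetical input): given
+    # words = ['数据', '分析', '数据'], the weight of '数据' is
+    # 0.7 * (2 / 3) + 0.3 * min(2 / 4, 1.0) = 0.6167 (rounded to 4 places).
+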
+    def lookup(self, word: str) -> List[str]:
+        """
+        Look up synonyms (currently disabled).
+
+        Args:
+            word: The source word.
+
+        Returns:
+            List[str]: Empty list (the synonym feature is disabled).
+        """
+        return []
+
+    def process_text(self, text: str) -> Dict[str, Any]:
+        """
+        Run the full text-processing pipeline.
+
+        Args:
+            text: Raw input text.
+
+        Returns:
+            Dict[str, Any]: Processing results.
+        """
+        # 1. Normalize the text
+        normalized_text = self.normalize_text(text)
+
+        # 2. Segment into words
+        words = self.split(normalized_text)
+
+        # 3. Compute term weights
+        word_weights = self.weights(words)
+
+        return {
+            'original_text': text,
+            'normalized_text': normalized_text,
+            'words': words,
+            'word_weights': word_weights
+        }
+
+
+# Module-level singleton instance
+chinese_processor = ChineseTextProcessor()
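+
+
+if __name__ == "__main__":
+    # Minimal smoke test; the sample sentence is illustrative, not from the
+    # original module.
+    result = chinese_processor.process_text("我们用Python处理中文文本，详见https://example.com ！")
+    print(result['normalized_text'])
+    print(result['words'])
+    print(result['word_weights'])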