Files
ChouJuGEO/modules/content_uniqueness.py
T
2026-04-30 18:37:46 +08:00

333 lines
12 KiB
Python

"""
内容独特性检测模块
检测批量生成内容的相似度,避免多篇文章说同一件事
"""
import re
import hashlib
from typing import Dict, List, Optional, Tuple
from collections import Counter
import math
class ContentUniquenessChecker:
    """Check how distinct the texts in a batch are from one another.

    The similarity of two texts is a weighted blend of three signals:

    * Jaccard overlap of their token sets,
    * similarity of coarse layout features (sizes, headers, lists),
    * overlap of "key information" (numbers, quoted phrases, Latin words).
    """

    # Weights of the three similarity components.
    _JACCARD_WEIGHT = 0.4
    _STRUCTURE_WEIGHT = 0.3
    _KEY_INFO_WEIGHT = 0.3

    # Maximum number of characters shown in a content preview.
    _PREVIEW_LENGTH = 100

    # Multi-character stop words. Single-character tokens never need to be
    # listed here because _tokenize drops every token of length <= 1.
    _STOP_WORDS = frozenset({
        '一个',
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
        'would', 'could', 'should', 'may', 'might', 'can', 'shall',
    })

    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    # Sentence terminators: ideographic full stop, full-width "!" and "?"
    # (written as escapes so they survive re-encoding) and the ASCII forms.
    _SENTENCE_SPLIT_RE = re.compile('[。\uff01\uff1f.!?]')
    _NUMBER_RE = re.compile(r'\d+')
    # Quoted phrases: ASCII and curly double quotes plus CJK corner brackets.
    _QUOTED_RE = re.compile(
        '["\u201c\u201d「」『』](.+?)["\u201c\u201d「」『』]'
    )
    _ENGLISH_RE = re.compile(r'[A-Za-z]+')
    _BULLET_RE = re.compile(r'^\s*[-*•]\s')
    _NUMBERED_RE = re.compile(r'^\s*\d+[.、]\s')

    def __init__(self, similarity_threshold: float = 0.7):
        """
        Args:
            similarity_threshold: Pairs whose similarity is strictly greater
                than this value are reported as too similar.
        """
        self.similarity_threshold = similarity_threshold

    def check_batch_uniqueness(self, contents: List[str]) -> Dict:
        """Compare every pair of texts and summarise how unique the batch is.

        Args:
            contents: The texts to compare.

        Returns:
            A dict with ``is_unique``, ``total_contents``,
            ``high_similarity_pairs``, ``avg_similarity``,
            ``max_similarity``, ``uniqueness_score`` (0-100) and
            ``suggestions``. With fewer than two texts the same keys are
            present (nothing to compare, so the batch counts as unique) plus
            an explanatory ``message``.
        """
        total = len(contents)
        if total < 2:
            # Same shape as the regular result so callers can read any key
            # without special-casing tiny batches.
            return {
                "is_unique": True,
                "message": "内容数量不足,无需检查",
                "total_contents": total,
                "high_similarity_pairs": [],
                "avg_similarity": 0.0,
                "max_similarity": 0.0,
                "uniqueness_score": 100.0,
                "suggestions": [],
            }

        similarities = []
        high_similarity_pairs = []
        for i, first in enumerate(contents):
            for j in range(i + 1, total):
                second = contents[j]
                similarity = self.calculate_similarity(first, second)
                similarities.append(similarity)
                if similarity > self.similarity_threshold:
                    high_similarity_pairs.append({
                        "content_index_1": i,
                        "content_index_2": j,
                        "similarity": similarity,
                        "preview_1": self._preview(first),
                        "preview_2": self._preview(second),
                    })

        # total >= 2 guarantees at least one pair, so no division by zero.
        avg_similarity = sum(similarities) / len(similarities)
        max_similarity = max(similarities)
        uniqueness_score = max(0.0, (1 - avg_similarity) * 100)

        return {
            "is_unique": not high_similarity_pairs,
            "total_contents": total,
            "high_similarity_pairs": high_similarity_pairs,
            "avg_similarity": avg_similarity,
            "max_similarity": max_similarity,
            "uniqueness_score": uniqueness_score,
            "suggestions": self._generate_suggestions(
                high_similarity_pairs, avg_similarity
            ),
        }

    @classmethod
    def _preview(cls, text: str) -> str:
        """Return the head of *text*, adding "..." only when it was cut."""
        if len(text) <= cls._PREVIEW_LENGTH:
            return text
        return text[:cls._PREVIEW_LENGTH] + "..."

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Compute the blended similarity of two texts.

        Args:
            text1: First text.
            text2: Second text.

        Returns:
            A score in the range 0-1. Returns 0.0 when either text has no
            usable tokens.
        """
        words1 = set(self._tokenize(text1))
        words2 = set(self._tokenize(text2))
        if not words1 or not words2:
            return 0.0

        jaccard_similarity = len(words1 & words2) / len(words1 | words2)
        structure_similarity = self._calculate_structure_similarity(text1, text2)
        key_info_similarity = self._key_info_overlap(text1, text2)

        components = [
            (self._JACCARD_WEIGHT, jaccard_similarity),
            (self._STRUCTURE_WEIGHT, structure_similarity),
        ]
        # When neither text contains numbers, quotes or Latin words there is
        # no key-info evidence at all. Counting that as "0 % overlap" would
        # cap the score of two identical texts below the default threshold,
        # so the component is left out and the weights are renormalised.
        if key_info_similarity is not None:
            components.append((self._KEY_INFO_WEIGHT, key_info_similarity))

        total_weight = sum(weight for weight, _ in components)
        blended = sum(weight * score for weight, score in components)
        return blended / total_weight

    def _tokenize(self, text: str) -> List[str]:
        """Split *text* into lower-cased tokens, dropping noise.

        Punctuation is replaced by spaces and the text is split on
        whitespace; stop words and tokens of length <= 1 are discarded.

        NOTE: text written without spaces (e.g. Chinese) is only split where
        punctuation or whitespace occurs, so tokens are clause-sized there.
        """
        cleaned = self._PUNCTUATION_RE.sub(' ', text)
        return [
            word for word in cleaned.lower().split()
            if len(word) > 1 and word not in self._STOP_WORDS
        ]

    def _calculate_structure_similarity(self, text1: str, text2: str) -> float:
        """Average the per-feature similarity of the two texts' layouts.

        Returns:
            A score in the range 0-1 (1.0 when every feature matches).
        """
        features1 = self._extract_structure_features(text1)
        features2 = self._extract_structure_features(text2)

        all_keys = features1.keys() | features2.keys()
        if not all_keys:
            return 0.0

        # A feature present on only one side contributes 0.
        score = sum(
            self._feature_similarity(features1[key], features2[key])
            for key in features1.keys() & features2.keys()
        )
        return score / len(all_keys)

    @staticmethod
    def _feature_similarity(value1, value2) -> float:
        """Return the 0-1 similarity of two values of a single feature."""
        # bool is a subclass of int, so it must be tested first: flags are
        # either the same or not.
        if isinstance(value1, bool) or isinstance(value2, bool):
            return 1.0 if value1 == value2 else 0.0
        if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
            largest = max(abs(value1), abs(value2))
            if largest == 0:
                # Both values are zero: identical, not "no information".
                return 1.0
            return 1 - abs(value1 - value2) / largest
        if isinstance(value1, list) and isinstance(value2, list):
            set1, set2 = set(value1), set(value2)
            union = set1 | set2
            if not union:
                return 1.0
            return len(set1 & set2) / len(union)
        return 0.0

    def _extract_structure_features(self, text: str) -> Dict:
        """Collect coarse layout features of *text* (sizes, headers, lists)."""
        lines = text.split('\n')
        header_count = sum(1 for line in lines if line.startswith('#'))
        blank_lines = sum(1 for line in lines if line.strip() == '')
        return {
            "total_chars": len(text),
            "total_lines": len(lines),
            # str.split always yields at least one element, so len(lines) >= 1.
            "avg_line_length": sum(len(line) for line in lines) / len(lines),
            "has_headers": header_count > 0,
            "has_list": any(self._BULLET_RE.match(line) for line in lines),
            "has_numbered_list": any(
                self._NUMBERED_RE.match(line) for line in lines
            ),
            "header_count": header_count,
            # Every blank line is treated as a paragraph separator.
            "paragraph_count": blank_lines + 1,
        }

    def _key_info_overlap(self, text1: str, text2: str) -> Optional[float]:
        """Return the mean Jaccard overlap of key-information categories.

        The categories are digit runs, quoted phrases and Latin-letter words.
        Only categories found in at least one of the texts are averaged.

        Returns:
            The mean overlap (0-1), or None when neither text contains any
            item of any category.
        """
        overlaps = []
        for pattern in (self._NUMBER_RE, self._QUOTED_RE, self._ENGLISH_RE):
            items1 = set(pattern.findall(text1))
            items2 = set(pattern.findall(text2))
            union = items1 | items2
            if union:
                overlaps.append(len(items1 & items2) / len(union))
        if not overlaps:
            return None
        return sum(overlaps) / len(overlaps)

    def _calculate_key_info_similarity(self, text1: str, text2: str) -> float:
        """Return the key-information overlap of two texts as a float.

        Same as ``_key_info_overlap`` but reports 0.0 when no key
        information exists in either text.
        """
        overlap = self._key_info_overlap(text1, text2)
        return 0.0 if overlap is None else overlap

    def _generate_suggestions(self, high_similarity_pairs: List[Dict],
                              avg_similarity: float) -> List[str]:
        """Build user-facing improvement hints from the batch results.

        Args:
            high_similarity_pairs: Pairs reported by check_batch_uniqueness.
            avg_similarity: Mean pairwise similarity of the batch.

        Returns:
            A non-empty list of hint strings.
        """
        suggestions = []
        if high_similarity_pairs:
            suggestions.append(
                f"发现 {len(high_similarity_pairs)} 对高度相似的内容,建议修改其中一篇"
            )
            # Spell out at most the first three pairs, using 1-based
            # numbers. The separator keeps the two numbers from running
            # together (1 and 2 must not read as "12").
            for pair in high_similarity_pairs[:3]:
                suggestions.append(
                    f"内容 {pair['content_index_1'] + 1} 和 "
                    f"{pair['content_index_2'] + 1} "
                    f"相似度为 {pair['similarity']:.0%},建议调整角度或添加独特案例"
                )
        if avg_similarity > 0.5:
            suggestions.append("整体相似度较高,建议:")
            suggestions.append("1. 为每篇内容选择不同的切入角度")
            suggestions.append("2. 添加独特的案例或数据")
            suggestions.append("3. 使用不同的表达方式和结构")
        if not suggestions:
            suggestions.append("内容独特性良好,无需修改")
        return suggestions

    def find_duplicate_sentences(self, contents: List[str],
                                 min_length: int = 20) -> List[Dict]:
        """Find sentences that occur verbatim in more than one text.

        Args:
            contents: The texts to scan.
            min_length: Sentences shorter than this (after stripping
                whitespace) are ignored.

        Returns:
            One dict per repeated sentence with ``sentence``, ``appears_in``
            (sorted indices into *contents*) and ``count``, ordered by
            ``count`` descending.
        """
        sentence_sources = {}  # sentence -> set of content indices
        for index, content in enumerate(contents):
            for raw_sentence in self._SENTENCE_SPLIT_RE.split(content):
                sentence = raw_sentence.strip()
                if len(sentence) >= min_length:
                    sentence_sources.setdefault(sentence, set()).add(index)

        duplicates = [
            {
                "sentence": sentence,
                "appears_in": sorted(sources),
                "count": len(sources),
            }
            for sentence, sources in sentence_sources.items()
            if len(sources) > 1  # present in more than one text
        ]
        duplicates.sort(key=lambda item: item["count"], reverse=True)
        return duplicates

    def generate_uniqueness_report(self, contents: List[str]) -> Dict:
        """Combine the batch check, repeated sentences and fingerprints.

        Args:
            contents: The texts to analyse.

        Returns:
            The result of check_batch_uniqueness extended with
            ``duplicate_sentences``, ``unique_fingerprints`` and
            ``fingerprint_uniqueness`` (distinct fingerprints / texts,
            0 for an empty batch).
        """
        batch_result = self.check_batch_uniqueness(contents)
        duplicate_sentences = self.find_duplicate_sentences(contents)
        fingerprints = {self._calculate_fingerprint(text) for text in contents}
        unique_fingerprints = len(fingerprints)
        return {
            **batch_result,
            "duplicate_sentences": duplicate_sentences,
            "unique_fingerprints": unique_fingerprints,
            "fingerprint_uniqueness": (
                unique_fingerprints / len(contents) if contents else 0
            ),
        }

    def _calculate_fingerprint(self, text: str) -> str:
        """Return a 16-hex-digit fingerprint of the first 100 tokens.

        MD5 is used only as a cheap content hash, not for security.
        """
        fingerprint_text = ' '.join(self._tokenize(text)[:100])
        return hashlib.md5(fingerprint_text.encode()).hexdigest()[:16]
def check_content_similarity(content1: str, content2: str) -> Dict:
    """Compare two texts and list the sentences they share verbatim.

    Args:
        content1: First text.
        content2: Second text.

    Returns:
        A dict with ``similarity`` (score from
        ContentUniquenessChecker.calculate_similarity), ``is_similar``
        (score above the checker's threshold), ``common_sentences``
        (shared sentences of at least 20 characters, each listed once, in
        order of first appearance in *content1*) and a ``suggestion``
        message.
    """
    min_sentence_length = 20
    # Sentence terminators: ideographic full stop, full-width "!" and "?"
    # (written as escapes so they survive re-encoding) and the ASCII forms.
    sentence_splitter = re.compile('[。\uff01\uff1f.!?]')

    checker = ContentUniquenessChecker()
    similarity = checker.calculate_similarity(content1, content2)
    # Use the checker's own threshold so the verdict agrees with the
    # batch check.
    is_similar = similarity > checker.similarity_threshold

    sentences2 = {part.strip() for part in sentence_splitter.split(content2)}

    # Walk content1 in reading order so the result is deterministic, and
    # remember what was already reported so no sentence is listed twice.
    common_sentences = []
    reported = set()
    for part in sentence_splitter.split(content1):
        sentence = part.strip()
        if (len(sentence) >= min_sentence_length
                and sentence in sentences2
                and sentence not in reported):
            reported.add(sentence)
            common_sentences.append(sentence)

    return {
        "similarity": similarity,
        "is_similar": is_similar,
        "common_sentences": common_sentences,
        "suggestion": "内容过于相似,建议修改" if is_similar else "内容独特性良好",
    }