Files
ChouJuGEO/modules/content_uniqueness.py
T

333 lines
12 KiB
Python
Raw Permalink Normal View History

"""
内容独特性检测模块
检测批量生成内容的相似度,避免多篇文章说同一件事
"""
import re
import hashlib
from typing import Dict, List, Optional, Tuple
from collections import Counter
import math
class ContentUniquenessChecker:
"""内容独特性检查器"""
def __init__(self, similarity_threshold: float = 0.7):
"""
Args:
similarity_threshold: 相似度阈值,超过此值认为内容过于相似
"""
self.similarity_threshold = similarity_threshold
def check_batch_uniqueness(self, contents: List[str]) -> Dict:
"""
批量检查内容独特性
Args:
contents: 内容列表
Returns:
检查结果
"""
if len(contents) < 2:
return {
"is_unique": True,
"message": "内容数量不足,无需检查"
}
# 计算两两相似度
similarity_matrix = []
high_similarity_pairs = []
for i in range(len(contents)):
for j in range(i + 1, len(contents)):
similarity = self.calculate_similarity(contents[i], contents[j])
similarity_matrix.append({
"pair": (i, j),
"similarity": similarity
})
if similarity > self.similarity_threshold:
high_similarity_pairs.append({
"content_index_1": i,
"content_index_2": j,
"similarity": similarity,
"preview_1": contents[i][:100] + "...",
"preview_2": contents[j][:100] + "..."
})
# 计算整体独特性分数
if similarity_matrix:
avg_similarity = sum(s["similarity"] for s in similarity_matrix) / len(similarity_matrix)
max_similarity = max(s["similarity"] for s in similarity_matrix)
else:
avg_similarity = 0
max_similarity = 0
# 计算独特性分数 (0-100)
uniqueness_score = max(0, (1 - avg_similarity) * 100)
return {
"is_unique": len(high_similarity_pairs) == 0,
"total_contents": len(contents),
"high_similarity_pairs": high_similarity_pairs,
"avg_similarity": avg_similarity,
"max_similarity": max_similarity,
"uniqueness_score": uniqueness_score,
"suggestions": self._generate_suggestions(high_similarity_pairs, avg_similarity)
}
def calculate_similarity(self, text1: str, text2: str) -> float:
"""
计算两段文本的相似度
Args:
text1: 文本1
text2: 文本2
Returns:
相似度分数 (0-1)
"""
# 使用多种方法综合计算
# 1. 词汇重叠度 (Jaccard 相似度)
words1 = set(self._tokenize(text1))
words2 = set(self._tokenize(text2))
if not words1 or not words2:
return 0
intersection = words1 & words2
union = words1 | words2
jaccard_similarity = len(intersection) / len(union)
# 2. 结构相似度 (基于句子结构)
structure_similarity = self._calculate_structure_similarity(text1, text2)
# 3. 关键信息重叠度
key_info_similarity = self._calculate_key_info_similarity(text1, text2)
# 综合相似度
total_similarity = (
jaccard_similarity * 0.4 +
structure_similarity * 0.3 +
key_info_similarity * 0.3
)
return total_similarity
def _tokenize(self, text: str) -> List[str]:
"""分词(简单实现)"""
# 移除标点符号,按空格分词
text = re.sub(r'[^\w\s]', ' ', text)
words = text.lower().split()
# 过滤停用词
stop_words = {'', '', '', '', '', '', '', '', '', '', '',
'', '一个', '', '', '', '', '', '', '', '',
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'can', 'shall'}
return [w for w in words if w not in stop_words and len(w) > 1]
def _calculate_structure_similarity(self, text1: str, text2: str) -> float:
"""计算结构相似度"""
# 提取结构特征
features1 = self._extract_structure_features(text1)
features2 = self._extract_structure_features(text2)
# 比较特征
similarity = 0
total_features = 0
for key in set(features1.keys()) | set(features2.keys()):
if key in features1 and key in features2:
# 数值型特征
if isinstance(features1[key], (int, float)):
max_val = max(abs(features1[key]), abs(features2[key]))
if max_val > 0:
similarity += 1 - abs(features1[key] - features2[key]) / max_val
# 列表型特征
elif isinstance(features1[key], list):
set1 = set(features1[key])
set2 = set(features2[key])
if set1 or set2:
similarity += len(set1 & set2) / len(set1 | set2)
total_features += 1
return similarity / total_features if total_features > 0 else 0
def _extract_structure_features(self, text: str) -> Dict:
"""提取文本结构特征"""
lines = text.split('\n')
return {
"total_chars": len(text),
"total_lines": len(lines),
"avg_line_length": sum(len(line) for line in lines) / len(lines) if lines else 0,
"has_headers": any(line.startswith('#') for line in lines),
"has_list": any(re.match(r'^\s*[-*•]\s', line) for line in lines),
"has_numbered_list": any(re.match(r'^\s*\d+[.、]\s', line) for line in lines),
"header_count": sum(1 for line in lines if line.startswith('#')),
"paragraph_count": sum(1 for line in lines if line.strip() == '') + 1
}
def _calculate_key_info_similarity(self, text1: str, text2: str) -> float:
"""计算关键信息重叠度"""
# 提取数字
numbers1 = set(re.findall(r'\d+', text1))
numbers2 = set(re.findall(r'\d+', text2))
# 提取引号内容
quotes1 = set(re.findall(r'[""「」『』](.+?)[""「」『』]', text1))
quotes2 = set(re.findall(r'[""「」『』](.+?)[""「」『』]', text2))
# 提取英文单词(可能是专业术语)
english1 = set(re.findall(r'[A-Za-z]+', text1))
english2 = set(re.findall(r'[A-Za-z]+', text2))
# 计算重叠度
number_overlap = len(numbers1 & numbers2) / max(len(numbers1 | numbers2), 1)
quote_overlap = len(quotes1 & quotes2) / max(len(quotes1 | quotes2), 1)
english_overlap = len(english1 & english2) / max(len(english1 | english2), 1)
return (number_overlap + quote_overlap + english_overlap) / 3
def _generate_suggestions(self, high_similarity_pairs: List[Dict],
avg_similarity: float) -> List[str]:
"""生成改进建议"""
suggestions = []
if high_similarity_pairs:
suggestions.append(f"发现 {len(high_similarity_pairs)} 对高度相似的内容,建议修改其中一篇")
# 给出具体建议
for pair in high_similarity_pairs[:3]:
suggestions.append(
f"内容 {pair['content_index_1']+1}{pair['content_index_2']+1} "
f"相似度为 {pair['similarity']:.0%},建议调整角度或添加独特案例"
)
if avg_similarity > 0.5:
suggestions.append("整体相似度较高,建议:")
suggestions.append("1. 为每篇内容选择不同的切入角度")
suggestions.append("2. 添加独特的案例或数据")
suggestions.append("3. 使用不同的表达方式和结构")
if not suggestions:
suggestions.append("内容独特性良好,无需修改")
return suggestions
def find_duplicate_sentences(self, contents: List[str],
min_length: int = 20) -> List[Dict]:
"""
查找重复句子
Args:
contents: 内容列表
min_length: 最小句子长度
Returns:
重复句子列表
"""
# 提取所有句子
sentence_sources = {} # sentence -> [content_index]
for i, content in enumerate(contents):
sentences = re.split(r'[。!?.!?]', content)
for sentence in sentences:
sentence = sentence.strip()
if len(sentence) >= min_length:
if sentence not in sentence_sources:
sentence_sources[sentence] = []
sentence_sources[sentence].append(i)
# 找出重复句子
duplicates = []
for sentence, sources in sentence_sources.items():
if len(set(sources)) > 1: # 出现在多篇内容中
duplicates.append({
"sentence": sentence,
"appears_in": list(set(sources)),
"count": len(set(sources))
})
# 按出现次数排序
duplicates.sort(key=lambda x: x["count"], reverse=True)
return duplicates
def generate_uniqueness_report(self, contents: List[str]) -> Dict:
"""
生成独特性报告
Args:
contents: 内容列表
Returns:
报告数据
"""
# 批量检查
batch_result = self.check_batch_uniqueness(contents)
# 查找重复句子
duplicate_sentences = self.find_duplicate_sentences(contents)
# 计算内容指纹
fingerprints = [self._calculate_fingerprint(content) for content in contents]
unique_fingerprints = len(set(fingerprints))
return {
**batch_result,
"duplicate_sentences": duplicate_sentences,
"unique_fingerprints": unique_fingerprints,
"fingerprint_uniqueness": unique_fingerprints / len(contents) if contents else 0
}
def _calculate_fingerprint(self, text: str) -> str:
"""计算文本指纹"""
# 提取关键特征
words = self._tokenize(text)
# 取前100个词的哈希作为指纹
fingerprint_text = ' '.join(words[:100])
return hashlib.md5(fingerprint_text.encode()).hexdigest()[:16]
def check_content_similarity(content1: str, content2: str) -> Dict:
"""
检查两段内容的相似度
Args:
content1: 内容1
content2: 内容2
Returns:
相似度分析结果
"""
checker = ContentUniquenessChecker()
similarity = checker.calculate_similarity(content1, content2)
# 找出共同句子
sentences1 = set(re.split(r'[。!?.!?]', content1))
sentences2 = set(re.split(r'[。!?.!?]', content2))
common_sentences = []
for s1 in sentences1:
s1 = s1.strip()
if len(s1) >= 20:
for s2 in sentences2:
s2 = s2.strip()
if s1 == s2:
common_sentences.append(s1)
return {
"similarity": similarity,
"is_similar": similarity > 0.7,
"common_sentences": common_sentences,
"suggestion": "内容过于相似,建议修改" if similarity > 0.7 else "内容独特性良好"
}