Files

395 lines
14 KiB
Python
Raw Permalink Normal View History

"""
内容质量指标分析模块
计算 Trust Density、Citation Share、Authority Score、Engagement Potential 等指标
"""
import re
from typing import Dict, List, Optional, Tuple
from collections import Counter
class ContentMetricsAnalyzer:
"""内容质量指标分析器"""
def __init__(self):
# 信任信号模式(来源占位、数据、案例等)
self.trust_signal_patterns = [
# 来源占位模式
r'根据[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
r'参考[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
r'来自[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析)',
r'据[^,。;:\n]{2,30}(?:显示|表明|统计|调查|分析)',
r'[^,。;:\n]{2,20}(?:报告|研究|数据|统计|调查)显示',
r'[^,。;:\n]{2,20}(?:报告|研究|数据|统计|调查)表明',
# 数据点模式
r'\d+%', # 百分比
r'\d+\.\d+%', # 小数百分比
r'\d+%', # 约XX%
r'超过\d+%', # 超过XX%
r'达到\d+%', # 达到XX%
r'\d+倍', # XX倍
r'\d+个', # XX个
r'\d+项', # XX项
r'\d+次', # XX次
r'\d+年', # XX年(时间数据)
r'\d+月', # XX月
# 案例模式
r'案例[:][^,。;\n]{5,100}',
r'例如[^,。;\n]{5,100}',
r'以[^,。;\n]{2,30}为例',
r'某[^,。;\n]{2,20}(?:企业|公司|用户|项目|团队)',
r'实际[^,。;\n]{2,30}(?:测试|应用|使用|经验)',
r'使用[^,。;\n]{2,30}(?:发现|表明|显示)',
]
# 结构化元素模式
self.structure_patterns = [
r'^#{1,6}\s+.+', # Markdown 标题
r'^\d+[\.、]\s+.+', # 编号列表
r'^[-*+]\s+.+', # 无序列表
r'^\s*[-*+]\s+.+', # 缩进列表
r'```[\s\S]*?```', # 代码块
r'`[^`]+`', # 行内代码
r'^\s*[Qq][:].*', # FAQ 问题
r'^\s*[Aa][:].*', # FAQ 答案
r'\|.*\|', # 表格
r'^>.*', # 引用块
]
def count_trust_signals(self, content: str) -> int:
"""
统计信任信号数量
Args:
content: 内容文本
Returns:
信任信号数量
"""
# 去重:如果同一个位置匹配多个模式,只算一次
# 简化处理:使用集合去重匹配位置附近的一小段文本
unique_matches = set()
for pattern in self.trust_signal_patterns:
for match in re.finditer(pattern, content, re.MULTILINE | re.IGNORECASE):
pos_key = content[max(0, match.start() - 10): match.end() + 10]
unique_matches.add(pos_key)
return len(unique_matches)
def count_citations(self, content: str) -> int:
"""
统计来源占位数量(Citation)
Args:
content: 内容文本
Returns:
来源占位数量
"""
citation_patterns = [
r'根据[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
r'参考[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
r'来自[^,。;:\n]{2,30}(?:报告|研究|数据|统计|调查|分析)',
r'据[^,。;:\n]{2,30}(?:显示|表明|统计|调查|分析)',
r'[^,。;:\n]{2,20}(?:报告|研究|数据|统计|调查)(?:显示|表明)',
]
citations = set()
for pattern in citation_patterns:
for match in re.finditer(pattern, content, re.MULTILINE | re.IGNORECASE):
# 使用匹配位置作为唯一标识
pos_key = (match.start(), match.end())
citations.add(pos_key)
return len(citations)
def count_brand_mentions(self, content: str, brand: str) -> int:
"""
统计品牌提及次数
Args:
content: 内容文本
brand: 品牌名称
Returns:
品牌提及次数
"""
if not brand:
return 0
# 使用单词边界匹配,避免部分匹配
pattern = r'\b' + re.escape(brand) + r'\b'
matches = re.findall(pattern, content, re.IGNORECASE)
return len(matches)
def count_structure_elements(self, content: str) -> Dict[str, int]:
"""
统计结构化元素数量
Args:
content: 内容文本
Returns:
结构化元素统计字典
"""
lines = content.split('\n')
structure_count = {
'headings': 0, # 标题
'lists': 0, # 列表
'code_blocks': 0, # 代码块
'faq_pairs': 0, # FAQ 对
'tables': 0, # 表格
'quotes': 0, # 引用
}
# 统计标题
for line in lines:
if re.match(r'^#{1,6}\s+.+', line):
structure_count['headings'] += 1
elif re.match(r'^\d+[\.、]\s+.+', line) or re.match(r'^[-*+]\s+.+', line):
structure_count['lists'] += 1
elif re.match(r'^\s*[Qq][:].*', line):
structure_count['faq_pairs'] += 1
elif re.match(r'^\s*\|.*\|', line):
structure_count['tables'] += 1
elif re.match(r'^>.*', line):
structure_count['quotes'] += 1
# 统计代码块
code_blocks = re.findall(r'```[\s\S]*?```', content)
structure_count['code_blocks'] = len(code_blocks)
return structure_count
def calculate_trust_density(self, content: str) -> float:
"""
计算 Trust Density(每100字信任信号数)
Args:
content: 内容文本
Returns:
Trust Density 值
"""
if not content:
return 0.0
# 计算实际文本长度(去除空白字符)
text_length = len(re.sub(r'\s+', '', content))
if text_length == 0:
return 0.0
trust_signals = self.count_trust_signals(content)
# 每100字信任信号数
trust_density = (trust_signals / text_length) * 100
return round(trust_density, 2)
def calculate_citation_share(self, content: str, brand: str) -> float:
"""
计算 Citation Share(品牌引用比例)
Args:
content: 内容文本
brand: 品牌名称
Returns:
Citation Share 值(0-100
"""
if not content or not brand:
return 0.0
brand_mentions = self.count_brand_mentions(content, brand)
# 统计所有可能的提及(品牌、竞品、通用术语等)
# 简化处理:统计所有可能的品牌/产品提及
# 使用常见品牌提及模式
all_mentions_pattern = r'\b[A-Z][a-zA-Z0-9]{2,20}\b' # 大写开头的单词(可能是品牌)
all_mentions = len(re.findall(all_mentions_pattern, content))
# 如果总提及数太少,使用品牌提及次数作为分母
if all_mentions < brand_mentions * 2:
all_mentions = brand_mentions * 2
if all_mentions == 0:
return 0.0
citation_share = (brand_mentions / all_mentions) * 100
return round(min(citation_share, 100.0), 2)
def calculate_authority_score(self, content: str) -> float:
"""
计算 Authority Score(权威性得分,0-100
基于来源占位数量、数据密度等
Args:
content: 内容文本
Returns:
Authority Score 值(0-100
"""
if not content:
return 0.0
citations = self.count_citations(content)
trust_signals = self.count_trust_signals(content)
text_length = len(re.sub(r'\s+', '', content))
if text_length == 0:
return 0.0
# 计算各项得分
# 来源占位得分(最多30分)
citation_score = min(citations * 5, 30)
# 信任信号密度得分(最多40分)
trust_density = (trust_signals / text_length) * 1000 # 每1000字信任信号数
trust_score = min(trust_density * 4, 40)
# 数据点得分(最多30分)
data_points = len(re.findall(r'\d+%', content)) + len(re.findall(r'\d+\.\d+%', content))
data_score = min(data_points * 2, 30)
authority_score = citation_score + trust_score + data_score
return round(min(authority_score, 100.0), 2)
def calculate_engagement_potential(self, content: str) -> float:
"""
计算 Engagement Potential(参与度潜力,0-100
基于结构化程度、互动元素等
Args:
content: 内容文本
Returns:
Engagement Potential 值(0-100
"""
if not content:
return 0.0
structure = self.count_structure_elements(content)
text_length = len(re.sub(r'\s+', '', content))
if text_length == 0:
return 0.0
# 计算各项得分
# 标题得分(最多20分)
heading_score = min(structure['headings'] * 2, 20)
# 列表得分(最多25分)
list_score = min(structure['lists'] * 1.5, 25)
# FAQ 得分(最多25分)
faq_score = min(structure['faq_pairs'] * 3, 25)
# 代码块得分(最多15分)
code_score = min(structure['code_blocks'] * 5, 15)
# 表格得分(最多10分)
table_score = min(structure['tables'] * 2, 10)
# 引用得分(最多5分)
quote_score = min(structure['quotes'] * 1, 5)
engagement_score = heading_score + list_score + faq_score + code_score + table_score + quote_score
return round(min(engagement_score, 100.0), 2)
def analyze_content(self, content: str, brand: str) -> Dict[str, any]:
"""
综合分析内容,返回所有指标
Args:
content: 内容文本
brand: 品牌名称
Returns:
包含所有指标的字典
"""
if not content:
return {
'trust_density': 0.0,
'citation_share': 0.0,
'authority_score': 0.0,
'engagement_potential': 0.0,
'trust_signals': 0,
'citations': 0,
'brand_mentions': 0,
'structure_elements': {},
'text_length': 0,
}
text_length = len(re.sub(r'\s+', '', content))
trust_signals = self.count_trust_signals(content)
citations = self.count_citations(content)
brand_mentions = self.count_brand_mentions(content, brand)
structure = self.count_structure_elements(content)
return {
'trust_density': self.calculate_trust_density(content),
'citation_share': self.calculate_citation_share(content, brand),
'authority_score': self.calculate_authority_score(content),
'engagement_potential': self.calculate_engagement_potential(content),
'trust_signals': trust_signals,
'citations': citations,
'brand_mentions': brand_mentions,
'structure_elements': structure,
'text_length': text_length,
}
def analyze_batch(self, contents: List[Dict[str, str]], brand: str) -> List[Dict[str, any]]:
"""
批量分析内容
Args:
contents: 内容列表,每个元素包含 'content' 字段
brand: 品牌名称
Returns:
分析结果列表
"""
results = []
for item in contents:
content = item.get('content', '')
metrics = self.analyze_content(content, brand)
# 保留原始数据
metrics['keyword'] = item.get('keyword', '')
metrics['platform'] = item.get('platform', '')
results.append(metrics)
return results
def get_metrics_summary(self, results: List[Dict[str, any]]) -> Dict[str, any]:
"""
获取指标汇总统计
Args:
results: 分析结果列表
Returns:
汇总统计字典
"""
if not results:
return {
'avg_trust_density': 0.0,
'avg_citation_share': 0.0,
'avg_authority_score': 0.0,
'avg_engagement_potential': 0.0,
'total_trust_signals': 0,
'total_citations': 0,
'total_brand_mentions': 0,
'count': 0,
}
return {
'avg_trust_density': round(sum(r.get('trust_density', 0) for r in results) / len(results), 2),
'avg_citation_share': round(sum(r.get('citation_share', 0) for r in results) / len(results), 2),
'avg_authority_score': round(sum(r.get('authority_score', 0) for r in results) / len(results), 2),
'avg_engagement_potential': round(sum(r.get('engagement_potential', 0) for r in results) / len(results), 2),
'total_trust_signals': sum(r.get('trust_signals', 0) for r in results),
'total_citations': sum(r.get('citations', 0) for r in results),
'total_brand_mentions': sum(r.get('brand_mentions', 0) for r in results),
'count': len(results),
}