feat: 重构项目结构并添加平台同步基础架构

- 重构项目目录结构，将功能模块移至 modules/ 目录
- 创建平台同步基础架构，包括发布器基类和 GitHub 发布器
- 新增 UI 状态管理模块 (modules/ui/state.py)，统一管理会话状态
- 更新依赖配置，添加平台同步所需依赖 (httpx, pyperclip)
- 整理文档结构，将所有文档分类移至 docs/ 目录
- 添加 .cursorrules 文件定义项目开发规范
- 清理根目录重复文件，保持项目结构整洁
2026-01-30 10:21:29 +08:00
parent 77d5ec70f8
commit 8f7f082c3d
102 changed files with 33742 additions and 1526 deletions
@@ -0,0 +1,394 @@
+"""
+内容质量指标分析模块
+计算 Trust Density、Citation Share、Authority Score、Engagement Potential 等指标
+"""
+import re
+from collections import Counter
+from typing import Any, Dict, List, Optional, Tuple
+
+
+class ContentMetricsAnalyzer:
+    """Regex-based content quality analyzer.
+
+    Scans text (the patterns target Chinese prose with Markdown markup) for
+    "trust signals" -- source attributions, numeric data points and case
+    references -- and for structural elements, then derives four scores from
+    the counts: Trust Density, Citation Share, Authority Score and
+    Engagement Potential.
+    """
+
+    # Flags applied to every trust-signal / citation scan.
+    _SCAN_FLAGS = re.MULTILINE | re.IGNORECASE
+
+    # Source-attribution ("citation") phrases, e.g. "根据 ... 报告".
+    _CITATION_PATTERNS = (
+        r'根据[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
+        r'参考[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
+        r'来自[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析)',
+        r'据[^，。；：\n]{2,30}(?:显示|表明|统计|调查|分析)',
+        r'[^，。；：\n]{2,20}(?:报告|研究|数据|统计|调查)(?:显示|表明)',
+    )
+
+    # ASCII-only token boundaries.  Python's ``\b`` treats CJK characters as
+    # word characters, so ``\bBrand\b`` never matches a brand that is embedded
+    # in Chinese text; these lookarounds only reject neighbouring ASCII
+    # letters, digits and underscores.
+    _ASCII_BOUNDARY_BEFORE = r'(?<![A-Za-z0-9_])'
+    _ASCII_BOUNDARY_AFTER = r'(?![A-Za-z0-9_])'
+
+    # A closed Markdown fenced code block.
+    _CODE_FENCE_PATTERN = r'```[\s\S]*?```'
+
+    def __init__(self):
+        # Trust-signal patterns: source attributions, data points, cases.
+        self.trust_signal_patterns = [
+            # Source-attribution patterns
+            r'根据[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
+            r'参考[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析|标准|规范|文档|指南)',
+            r'来自[^，。；：\n]{2,30}(?:报告|研究|数据|统计|调查|分析)',
+            r'据[^，。；：\n]{2,30}(?:显示|表明|统计|调查|分析)',
+            r'[^，。；：\n]{2,20}(?:报告|研究|数据|统计|调查)显示',
+            r'[^，。；：\n]{2,20}(?:报告|研究|数据|统计|调查)表明',
+            # Data-point patterns
+            r'\d+%',  # percentage
+            r'\d+\.\d+%',  # decimal percentage
+            r'约\d+%',  # "about N%"
+            r'超过\d+%',  # "more than N%"
+            r'达到\d+%',  # "reaches N%"
+            r'\d+倍',  # "N times"
+            r'\d+个',  # "N items"
+            r'\d+项',  # "N entries"
+            r'\d+次',  # "N occurrences"
+            r'\d+年',  # "N year(s)" (time data)
+            r'\d+月',  # "N month(s)"
+            # Case / example patterns
+            r'案例[：:][^，。；\n]{5,100}',
+            r'例如[^，。；\n]{5,100}',
+            r'以[^，。；\n]{2,30}为例',
+            r'某[^，。；\n]{2,20}(?:企业|公司|用户|项目|团队)',
+            r'实际[^，。；\n]{2,30}(?:测试|应用|使用|经验)',
+            r'使用[^，。；\n]{2,30}(?:发现|表明|显示)',
+        ]
+
+        # Structural-element patterns.
+        self.structure_patterns = [
+            r'^#{1,6}\s+.+',  # Markdown heading
+            r'^\d+[\.、]\s+.+',  # numbered list item
+            r'^[-*+]\s+.+',  # bullet list item
+            r'^\s*[-*+]\s+.+',  # indented bullet list item
+            r'```[\s\S]*?```',  # fenced code block
+            r'`[^`]+`',  # inline code
+            r'^\s*[Qq][：:].*',  # FAQ question
+            r'^\s*[Aa][：:].*',  # FAQ answer
+            r'\|.*\|',  # table row
+            r'^>.*',  # block quote
+        ]
+
+    @staticmethod
+    def _visible_length(content: Optional[str]) -> int:
+        """Return the number of non-whitespace characters in ``content``."""
+        if not content:
+            return 0
+        return len(re.sub(r'\s+', '', content))
+
+    @classmethod
+    def _count_distinct_regions(cls, patterns, content: Optional[str]) -> int:
+        """Count text regions matched by at least one of ``patterns``.
+
+        Matches whose spans overlap are merged into a single region, so a
+        phrase hit by several patterns (``约50%`` and ``50%``) counts once,
+        while identical phrases at different positions count separately.
+        Spans that merely touch are treated as separate regions.
+        """
+        if not content:
+            return 0
+
+        spans = sorted(
+            match.span()
+            for pattern in patterns
+            for match in re.finditer(pattern, content, cls._SCAN_FLAGS)
+        )
+
+        regions = 0
+        covered_until = -1
+        for start, end in spans:
+            if start >= covered_until:
+                regions += 1
+            covered_until = max(covered_until, end)
+        return regions
+
+    def count_trust_signals(self, content: str) -> int:
+        """Count trust signals in ``content``.
+
+        Every pattern in ``self.trust_signal_patterns`` is scanned and
+        overlapping matches are merged, so one location is counted once no
+        matter how many patterns hit it.  A long case/example match therefore
+        also absorbs any data points that lie inside it.
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Number of distinct trust-signal regions.
+        """
+        return self._count_distinct_regions(self.trust_signal_patterns, content)
+
+    def count_citations(self, content: str) -> int:
+        """Count source attributions (citations) in ``content``.
+
+        Overlapping matches are merged: a phrase such as
+        ``根据某某研究数据显示`` is matched by three of the citation patterns
+        but is reported as a single citation.
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Number of distinct citation regions.
+        """
+        return self._count_distinct_regions(self._CITATION_PATTERNS, content)
+
+    def count_brand_mentions(self, content: str, brand: str) -> int:
+        """Count case-insensitive mentions of ``brand`` in ``content``.
+
+        A mention must not be directly adjacent to an ASCII letter, digit or
+        underscore, so ``GEO`` is not found inside ``GEOMETRY``.  Adjacent
+        CJK characters are allowed, which lets brands embedded in Chinese
+        sentences be found.
+
+        Args:
+            content: Text to analyze.
+            brand: Brand name; surrounding whitespace is ignored.
+
+        Returns:
+            Number of brand mentions (0 when either argument is empty).
+        """
+        brand = brand.strip() if brand else ''
+        if not brand or not content:
+            return 0
+
+        pattern = (
+            self._ASCII_BOUNDARY_BEFORE
+            + re.escape(brand)
+            + self._ASCII_BOUNDARY_AFTER
+        )
+        return len(re.findall(pattern, content, re.IGNORECASE))
+
+    def count_structure_elements(self, content: str) -> Dict[str, int]:
+        """Count structural elements in ``content``.
+
+        Closed fenced code blocks are counted first and then removed, so
+        lines inside them (for example ``# comment``) are not mistaken for
+        headings or list items.  The remaining lines are classified one by
+        one; each line contributes to at most one category.
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Dict with the keys ``headings``, ``lists`` (list-item lines,
+            numbered or bulleted, indented or not), ``code_blocks``,
+            ``faq_pairs`` (lines starting with ``Q:``/``Q：``), ``tables``
+            (table-row lines) and ``quotes`` (block-quote lines).
+        """
+        structure_count = {
+            'headings': 0,
+            'lists': 0,
+            'code_blocks': 0,
+            'faq_pairs': 0,
+            'tables': 0,
+            'quotes': 0,
+        }
+        if not content:
+            return structure_count
+
+        structure_count['code_blocks'] = len(
+            re.findall(self._CODE_FENCE_PATTERN, content)
+        )
+        prose = re.sub(self._CODE_FENCE_PATTERN, '', content)
+
+        for line in prose.split('\n'):
+            if re.match(r'^#{1,6}\s+.+', line):
+                structure_count['headings'] += 1
+            elif re.match(r'^\s*(?:\d+[\.、]|[-*+])\s+.+', line):
+                structure_count['lists'] += 1
+            elif re.match(r'^\s*[Qq][：:].*', line):
+                structure_count['faq_pairs'] += 1
+            elif re.match(r'^\s*\|.*\|', line):
+                structure_count['tables'] += 1
+            elif re.match(r'^>.*', line):
+                structure_count['quotes'] += 1
+
+        return structure_count
+
+    def calculate_trust_density(self, content: str) -> float:
+        """Compute Trust Density: trust signals per 100 non-whitespace chars.
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Trust Density rounded to two decimals; 0.0 for empty or
+            whitespace-only content.
+        """
+        text_length = self._visible_length(content)
+        if text_length == 0:
+            return 0.0
+
+        trust_signals = self.count_trust_signals(content)
+        return round((trust_signals / text_length) * 100, 2)
+
+    def calculate_citation_share(self, content: str, brand: str) -> float:
+        """Compute Citation Share: the brand's share of brand-like mentions.
+
+        The denominator is the number of capitalized ASCII tokens of 3-21
+        characters (a rough proxy for brand/product names), but never less
+        than twice the brand's own mention count, which caps the result at
+        50.
+
+        Args:
+            content: Text to analyze.
+            brand: Brand name.
+
+        Returns:
+            Citation Share as a percentage rounded to two decimals; 0.0 when
+            the content or brand is empty or the brand is never mentioned.
+        """
+        if not content or not brand:
+            return 0.0
+
+        brand_mentions = self.count_brand_mentions(content, brand)
+
+        # Capitalized ASCII tokens; ASCII-only boundaries so that tokens
+        # embedded in Chinese text are found as well.
+        all_mentions_pattern = (
+            self._ASCII_BOUNDARY_BEFORE
+            + r'[A-Z][a-zA-Z0-9]{2,20}'
+            + self._ASCII_BOUNDARY_AFTER
+        )
+        all_mentions = len(re.findall(all_mentions_pattern, content))
+
+        # With too few candidate tokens, fall back to twice the brand count.
+        all_mentions = max(all_mentions, brand_mentions * 2)
+        if all_mentions == 0:
+            return 0.0
+
+        citation_share = (brand_mentions / all_mentions) * 100
+        return round(min(citation_share, 100.0), 2)
+
+    def calculate_authority_score(self, content: str) -> float:
+        """Compute the Authority Score (0-100).
+
+        The score is the sum of three capped components:
+        citations (5 points each, max 30), trust-signal density (4 points
+        per signal per 1000 non-whitespace characters, max 40) and
+        percentage data points (2 points each, max 30).
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Authority Score rounded to two decimals; 0.0 for empty or
+            whitespace-only content.
+        """
+        text_length = self._visible_length(content)
+        if text_length == 0:
+            return 0.0
+
+        citations = self.count_citations(content)
+        trust_signals = self.count_trust_signals(content)
+
+        citation_score = min(citations * 5, 30)
+
+        signals_per_thousand = (trust_signals / text_length) * 1000
+        trust_score = min(signals_per_thousand * 4, 40)
+
+        # One pattern for integer and decimal percentages so that "12.5%"
+        # is a single data point rather than two.
+        data_points = len(re.findall(r'\d+(?:\.\d+)?%', content))
+        data_score = min(data_points * 2, 30)
+
+        authority_score = citation_score + trust_score + data_score
+        return round(min(authority_score, 100.0), 2)
+
+    def calculate_engagement_potential(self, content: str) -> float:
+        """Compute the Engagement Potential (0-100) from structural elements.
+
+        Capped components: headings (2 each, max 20), list items (1.5 each,
+        max 25), FAQ questions (3 each, max 25), code blocks (5 each,
+        max 15), table rows (2 each, max 10) and quote lines (1 each,
+        max 5).
+
+        Args:
+            content: Text to analyze.
+
+        Returns:
+            Engagement Potential rounded to two decimals; 0.0 for empty or
+            whitespace-only content.
+        """
+        if self._visible_length(content) == 0:
+            return 0.0
+
+        structure = self.count_structure_elements(content)
+
+        heading_score = min(structure['headings'] * 2, 20)
+        list_score = min(structure['lists'] * 1.5, 25)
+        faq_score = min(structure['faq_pairs'] * 3, 25)
+        code_score = min(structure['code_blocks'] * 5, 15)
+        table_score = min(structure['tables'] * 2, 10)
+        quote_score = min(structure['quotes'] * 1, 5)
+
+        engagement_score = (
+            heading_score
+            + list_score
+            + faq_score
+            + code_score
+            + table_score
+            + quote_score
+        )
+        return round(min(engagement_score, 100.0), 2)
+
+    def analyze_content(self, content: str, brand: str) -> Dict[str, Any]:
+        """Run every metric on ``content`` and return them in one dict.
+
+        Empty (or ``None``) content yields the same keys as any other input,
+        with all counts and scores at zero.
+
+        Args:
+            content: Text to analyze.
+            brand: Brand name.
+
+        Returns:
+            Dict with the four scores (``trust_density``, ``citation_share``,
+            ``authority_score``, ``engagement_potential``), the raw counts
+            (``trust_signals``, ``citations``, ``brand_mentions``), the
+            ``structure_elements`` dict and ``text_length`` (non-whitespace
+            character count).
+        """
+        content = content or ''
+
+        return {
+            'trust_density': self.calculate_trust_density(content),
+            'citation_share': self.calculate_citation_share(content, brand),
+            'authority_score': self.calculate_authority_score(content),
+            'engagement_potential': self.calculate_engagement_potential(content),
+            'trust_signals': self.count_trust_signals(content),
+            'citations': self.count_citations(content),
+            'brand_mentions': self.count_brand_mentions(content, brand),
+            'structure_elements': self.count_structure_elements(content),
+            'text_length': self._visible_length(content),
+        }
+
+    def analyze_batch(self, contents: List[Dict[str, str]], brand: str) -> List[Dict[str, Any]]:
+        """Analyze a list of content items.
+
+        Args:
+            contents: Items to analyze; each is a dict whose ``content`` key
+                holds the text.  The optional ``keyword`` and ``platform``
+                keys are copied into the result (default ``''``).
+            brand: Brand name.
+
+        Returns:
+            One metrics dict per item, in input order.
+        """
+        results = []
+        for item in contents:
+            metrics = self.analyze_content(item.get('content', ''), brand)
+            # Carry the identifying fields of the source item along.
+            metrics['keyword'] = item.get('keyword', '')
+            metrics['platform'] = item.get('platform', '')
+            results.append(metrics)
+
+        return results
+
+    def get_metrics_summary(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Aggregate per-item metrics into averages and totals.
+
+        Args:
+            results: Metrics dicts; a missing key counts as 0.
+
+        Returns:
+            Dict with the averages of the four scores (rounded to two
+            decimals), the totals of the three raw counts, and ``count``.
+            All values are zero for an empty list.
+        """
+        count = len(results)
+
+        def average(key: str) -> float:
+            if count == 0:
+                return 0.0
+            return round(sum(r.get(key, 0) for r in results) / count, 2)
+
+        def total(key: str) -> int:
+            return sum(r.get(key, 0) for r in results)
+
+        return {
+            'avg_trust_density': average('trust_density'),
+            'avg_citation_share': average('citation_share'),
+            'avg_authority_score': average('authority_score'),
+            'avg_engagement_potential': average('engagement_potential'),
+            'total_trust_signals': total('trust_signals'),
+            'total_citations': total('citations'),
+            'total_brand_mentions': total('brand_mentions'),
+            'count': count,
+        }