添加产品规格文档并优化项目结构

Made-with: Cursor
2026-04-30 18:37:46 +08:00
parent bf2551d529
commit fb309299bf
101 changed files with 9586 additions and 14386 deletions
@@ -0,0 +1,445 @@
+"""
+AI 搜索验证模块
+支持使用真实的 AI 搜索引擎（Perplexity、ChatGPT Search）验证品牌提及
+"""
+
+import json
+import logging
+import re
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SearchResult:
+    """Search result record: one query, the answer text and its brand-mention analysis."""
+    # The search query text.
+    query: str
+    # The answer text returned for the query.
+    response: str
+    # Source entries as string-to-string mappings.
+    # NOTE(review): the exact keys are not visible here -- confirm against the producer.
+    sources: List[Dict[str, str]]
+    # Whether the brand was mentioned.
+    brand_mentioned: bool
+    # Number of brand mentions.
+    mention_count: int
+    # One label per mention (presumably the position buckets used elsewhere
+    # in this module -- TODO confirm).
+    mention_positions: List[str]
+    sentiment: str  # positive, neutral, negative
+
+
+class AISearchVerifier:
+    """Check whether a brand is mentioned in AI search answers.
+
+    Queries are sent to the Perplexity chat-completions endpoint. When no API
+    key is configured, or ``httpx`` cannot be imported, a mock result flagged
+    with ``"is_mock": True`` is returned instead of calling the API.
+    """
+
+    # Request settings, gathered here so they are easy to review and override.
+    API_URL = "https://api.perplexity.ai/chat/completions"
+    # NOTE(review): confirm this model id is still served by Perplexity.
+    DEFAULT_MODEL = "llama-3.1-sonar-small-128k-online"
+    REQUEST_TIMEOUT = 30.0
+
+    # Number of characters inspected on each side of a brand occurrence
+    # when scoring sentiment.
+    _CONTEXT_RADIUS = 50
+    # One polarity must exceed the other by this factor to win; otherwise
+    # the sentiment is reported as neutral.
+    _SENTIMENT_MARGIN = 1.5
+
+    _POSITIVE_WORDS = (
+        "优秀", "出色", "领先", "推荐", "首选", "最佳", "强大", "高效",
+        "创新", "专业", "可靠", "稳定", "卓越", "突出", "显著",
+        "excellent", "outstanding", "leading", "recommended", "best",
+        "powerful", "efficient", "innovative", "professional", "reliable",
+    )
+    _NEGATIVE_WORDS = (
+        "问题", "缺陷", "不足", "失败", "风险", "警告", "谨慎", "避免",
+        "差", "慢", "贵", "复杂", "困难", "不稳定",
+        "issue", "problem", "defect", "risk", "warning", "avoid",
+        "poor", "slow", "expensive", "complex", "difficult", "unstable",
+    )
+
+    def __init__(self, perplexity_api_key: Optional[str] = None,
+                 model: Optional[str] = None):
+        """
+        Args:
+            perplexity_api_key: Perplexity API key. Without it every
+                verification returns a mock result.
+            model: Model id sent to the API; defaults to ``DEFAULT_MODEL``.
+        """
+        self.perplexity_api_key = perplexity_api_key
+        self.model = model or self.DEFAULT_MODEL
+
+    def verify_with_perplexity(self, query: str, brand: str) -> Dict:
+        """
+        Ask Perplexity the query and analyse brand mentions in the answer.
+
+        Args:
+            query: Search query sent as the user message.
+            brand: Brand name to look for in the answer.
+
+        Returns:
+            On success, a dict with ``success``, ``query``, ``brand``,
+            ``response``, ``sources``, ``mention_count``,
+            ``mention_positions``, ``mentioned`` and ``sentiment``.
+            On failure, a dict with ``success`` set to False plus ``query``,
+            ``brand`` and ``error``. A mock result is returned when no API
+            key is set or ``httpx`` is not installed.
+        """
+        if not self.perplexity_api_key:
+            return self._mock_verification(query, brand)
+
+        try:
+            import httpx
+        except ImportError:
+            logger.warning("httpx 未安装，无法调用 Perplexity API")
+            return self._mock_verification(query, brand)
+
+        headers = {
+            "Authorization": f"Bearer {self.perplexity_api_key}",
+            "Content-Type": "application/json"
+        }
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant. Answer the user's question based on real-time search results. Be factual and cite your sources."
+                },
+                {
+                    "role": "user",
+                    "content": query
+                }
+            ],
+            "max_tokens": 1000,
+            "temperature": 0.1,
+            "return_citations": True,
+            "search_recency_filter": "month"
+        }
+
+        # Deliberately broad: this is the boundary to an external service and
+        # callers expect an error dict rather than an exception.
+        try:
+            response = httpx.post(
+                self.API_URL,
+                json=payload,
+                headers=headers,
+                timeout=self.REQUEST_TIMEOUT
+            )
+            if response.status_code != 200:
+                logger.error("Perplexity API 错误: %s %s",
+                             response.status_code, response.text)
+                return self._failure_result(
+                    query, brand, f"API 错误: {response.status_code}")
+
+            result = response.json()
+            content = result["choices"][0]["message"]["content"]
+            citations = result.get("citations", [])
+            mention_analysis = self._analyze_mention(content, brand)
+        except Exception as e:
+            logger.error("Perplexity 验证失败: %s", e)
+            return self._failure_result(query, brand, str(e))
+
+        return {
+            "success": True,
+            "query": query,
+            "brand": brand,
+            "response": content,
+            "sources": citations,
+            "mention_count": mention_analysis["count"],
+            "mention_positions": mention_analysis["positions"],
+            "mentioned": mention_analysis["count"] > 0,
+            "sentiment": mention_analysis["sentiment"]
+        }
+
+    @staticmethod
+    def _failure_result(query: str, brand: str, error: str) -> Dict:
+        """Build a failure dict that still carries the query and brand,
+        so report generation can handle it like any other result."""
+        return {
+            "success": False,
+            "query": query,
+            "brand": brand,
+            "error": error
+        }
+
+    def _mock_verification(self, query: str, brand: str) -> Dict:
+        """Return a placeholder result (used when the API is unavailable)."""
+        return {
+            "success": True,
+            "query": query,
+            "brand": brand,
+            "response": f"（模拟结果）关于 '{query}' 的搜索结果需要配置 Perplexity API Key 才能获取真实数据。",
+            "sources": [],
+            "mention_count": 0,
+            "mention_positions": [],
+            "mentioned": False,
+            "sentiment": "neutral",
+            "is_mock": True
+        }
+
+    def _analyze_mention(self, text: str, brand: str) -> Dict:
+        """
+        Count case-insensitive brand occurrences and bucket their positions.
+
+        Args:
+            text: Text to scan.
+            brand: Brand name to search for.
+
+        Returns:
+            Dict with ``count``, ``positions`` (one of the three "thirds"
+            labels per occurrence) and ``sentiment``.
+        """
+        text_lower = text.lower()
+        brand_lower = brand.lower()
+
+        positions = []
+        # An empty brand would match at every offset (and divide by zero on
+        # empty text), so it is treated as "never mentioned".
+        if brand_lower.strip():
+            # Offsets come from the lower-cased text, so the ratio uses its length.
+            total_len = len(text_lower)
+            for match in re.finditer(re.escape(brand_lower), text_lower):
+                pos_ratio = match.start() / total_len
+                if pos_ratio < 0.33:
+                    positions.append("前1/3")
+                elif pos_ratio < 0.67:
+                    positions.append("中1/3")
+                else:
+                    positions.append("后1/3")
+
+        return {
+            "count": len(positions),
+            "positions": positions,
+            "sentiment": self._analyze_sentiment(text, brand)
+        }
+
+    def _analyze_sentiment(self, text: str, brand: str) -> str:
+        """
+        Classify sentiment around brand mentions with a keyword heuristic.
+
+        For every occurrence of the brand, a window of ``_CONTEXT_RADIUS``
+        characters on each side is checked against the positive and negative
+        word lists; each word scores at most once per window.
+
+        Args:
+            text: Text to scan.
+            brand: Brand name to search for.
+
+        Returns:
+            "positive", "neutral" or "negative".
+        """
+        text_lower = text.lower()
+        brand_lower = brand.lower()
+
+        # No brand means no context to score.
+        if not brand_lower.strip():
+            return "neutral"
+
+        radius = self._CONTEXT_RADIUS
+        contexts = [
+            text_lower[max(0, match.start() - radius):match.end() + radius]
+            for match in re.finditer(re.escape(brand_lower), text_lower)
+        ]
+        if not contexts:
+            return "neutral"
+
+        positive_score = sum(
+            word in context
+            for context in contexts
+            for word in self._POSITIVE_WORDS
+        )
+        negative_score = sum(
+            word in context
+            for context in contexts
+            for word in self._NEGATIVE_WORDS
+        )
+
+        if positive_score > negative_score * self._SENTIMENT_MARGIN:
+            return "positive"
+        if negative_score > positive_score * self._SENTIMENT_MARGIN:
+            return "negative"
+        return "neutral"
+
+    def batch_verify(self, queries: List[str], brand: str,
+                     api_key: Optional[str] = None) -> List[Dict]:
+        """
+        Verify several queries one after another.
+
+        Args:
+            queries: Queries to verify.
+            brand: Brand name to look for.
+            api_key: Optional API key. When given it replaces the key stored
+                on this instance, and stays in effect for later calls.
+
+        Returns:
+            One result dict per query, in input order.
+        """
+        if api_key:
+            self.perplexity_api_key = api_key
+
+        return [self.verify_with_perplexity(query, brand) for query in queries]
+
+    def generate_verification_report(self, results: List[Dict]) -> Dict:
+        """
+        Aggregate verification results into summary statistics.
+
+        Results whose ``success`` is falsy (failed API calls) are counted in
+        ``total_queries`` and ``failed_count`` but excluded from the mention
+        and sentiment statistics, so a failed call is not reported as
+        "brand not mentioned".
+
+        Args:
+            results: Result dicts as returned by ``verify_with_perplexity``.
+
+        Returns:
+            Dict with ``total_queries``, ``failed_count``,
+            ``mentioned_count``, ``mention_rate``,
+            ``avg_mentions_per_query``, ``sentiment_distribution`` and up to
+            five queries each in ``top_mentioned_queries`` and
+            ``not_mentioned_queries`` (input order).
+        """
+        total = len(results)
+        # A result without a "success" key is treated as successful.
+        evaluated = [r for r in results if r.get("success", True)]
+        evaluated_count = len(evaluated)
+
+        with_mention = [r for r in evaluated if r.get("mentioned", False)]
+        without_mention = [r for r in evaluated if not r.get("mentioned", False)]
+
+        sentiment_counts = {"positive": 0, "neutral": 0, "negative": 0}
+        for r in evaluated:
+            sentiment = r.get("sentiment", "neutral")
+            sentiment_counts[sentiment] = sentiment_counts.get(sentiment, 0) + 1
+
+        total_mentions = sum(r.get("mention_count", 0) for r in evaluated)
+
+        return {
+            "total_queries": total,
+            "failed_count": total - evaluated_count,
+            "mentioned_count": len(with_mention),
+            "mention_rate": len(with_mention) / evaluated_count if evaluated_count else 0,
+            "avg_mentions_per_query": total_mentions / evaluated_count if evaluated_count else 0,
+            "sentiment_distribution": sentiment_counts,
+            # .get() keeps hand-built results without a "query" key from raising.
+            "top_mentioned_queries": [r.get("query", "") for r in with_mention][:5],
+            "not_mentioned_queries": [r.get("query", "") for r in without_mention][:5]
+        }
+
+
+class SemanticMentionDetector:
+    """Detect brand mentions by name or by registered alias.
+
+    Matching is case-insensitive plain substring matching. Note that an alias
+    which contains the brand name (e.g. brand "acme", alias "acme corp") is
+    counted both as a direct mention and as an alias mention.
+    """
+
+    # Characters of surrounding text kept on each side of a mention.
+    _CONTEXT_RADIUS = 100
+
+    def __init__(self):
+        # Lower-cased brand name -> list of lower-cased aliases.
+        self.brand_aliases: Dict[str, List[str]] = {}
+
+    def add_brand_aliases(self, brand: str, aliases: List[str]):
+        """
+        Register the aliases of a brand.
+
+        Any aliases previously registered for the same brand are replaced.
+
+        Args:
+            brand: Brand name (stored lower-cased).
+            aliases: Alias strings (stored lower-cased).
+        """
+        self.brand_aliases[brand.lower()] = [a.lower() for a in aliases]
+
+    def detect_mention(self, text: str, brand: str) -> Dict:
+        """
+        Count mentions of a brand and its aliases and collect their contexts.
+
+        Empty brand or alias strings are ignored, since an empty pattern
+        would match at every offset. Duplicate aliases and an alias equal
+        to the brand itself are counted only once.
+
+        Args:
+            text: Text to scan.
+            brand: Brand name.
+
+        Returns:
+            Dict with ``brand``, ``direct_count``, ``alias_counts``,
+            ``total_count``, ``contexts`` and ``is_mentioned``.
+        """
+        text_lower = text.lower()
+        brand_lower = brand.lower()
+
+        direct_count = text_lower.count(brand_lower) if brand_lower else 0
+
+        # dict.fromkeys de-duplicates while keeping registration order.
+        aliases = [
+            alias
+            for alias in dict.fromkeys(self.brand_aliases.get(brand_lower, []))
+            if alias and alias != brand_lower
+        ]
+        alias_counts = {}
+        for alias in aliases:
+            count = text_lower.count(alias)
+            if count > 0:
+                alias_counts[alias] = count
+
+        total_count = direct_count + sum(alias_counts.values())
+
+        return {
+            "brand": brand,
+            "direct_count": direct_count,
+            "alias_counts": alias_counts,
+            "total_count": total_count,
+            "contexts": self._extract_contexts(text, brand_lower, aliases),
+            "is_mentioned": total_count > 0
+        }
+
+    def _extract_contexts(self, text: str, brand_lower: str,
+                          aliases: List[str]) -> List[Dict]:
+        """
+        Collect the text surrounding every brand or alias occurrence.
+
+        Args:
+            text: Original text.
+            brand_lower: Lower-cased brand name.
+            aliases: Lower-cased aliases.
+
+        Returns:
+            One dict per occurrence with ``pattern``, ``context`` and
+            ``position`` (offset of the match in the lower-cased text),
+            grouped by pattern with the brand first.
+        """
+        text_lower = text.lower()
+        # Offsets are found in the lower-cased text. Lower-casing may change
+        # the length of some strings; in that case the offsets no longer line
+        # up with the original, so the snippet is cut from the lower-cased text.
+        snippet_source = text if len(text_lower) == len(text) else text_lower
+        radius = self._CONTEXT_RADIUS
+
+        contexts = []
+        for pattern in [brand_lower] + list(aliases):
+            if not pattern:
+                # An empty pattern would yield a context for every character.
+                continue
+            for match in re.finditer(re.escape(pattern), text_lower):
+                start = max(0, match.start() - radius)
+                end = match.end() + radius
+                contexts.append({
+                    "pattern": pattern,
+                    "context": snippet_source[start:end],
+                    "position": match.start()
+                })
+
+        return contexts
+
+
+def verify_content_quality(content: str, brand: str, 
+                          knowledge_base=None) -> Dict:
+    """
+    Score a piece of content on source quality, brand mentions and structure.
+
+    The total is the sum of three parts: source quality (capped at 40),
+    brand mentions (5 per mention, capped at 30) and structure (10 each
+    for a "##" heading, a numbered-list marker and a question mark).
+
+    Args:
+        content: Content text.
+        brand: Brand name.
+        knowledge_base: Knowledge base instance (optional; not used by
+            this function).
+
+    Returns:
+        Dict with the total score, the maximum score, the source-quality
+        and mention analyses, the structure score and improvement suggestions.
+    """
+    from modules.knowledge_base import SourceVerifier
+
+    source_quality = SourceVerifier().assess_source_quality(content)
+    mention_result = SemanticMentionDetector().detect_mention(content, brand)
+
+    # Source quality contributes 40% of its score, capped at 40 points.
+    source_points = min(40, source_quality["quality_score"] * 0.4)
+
+    # Five points per mention, capped at 30; nothing when never mentioned.
+    if mention_result["is_mentioned"]:
+        mention_points = min(30, mention_result["total_count"] * 5)
+    else:
+        mention_points = 0
+
+    # Ten points for each structural signal that is present.
+    structure_signals = (
+        "##" in content,                                # heading
+        re.search(r'\d+[.、]', content) is not None,    # numbered list
+        "?" in content or "？" in content,              # question / FAQ
+    )
+    structure_score = 10 * sum(structure_signals)
+
+    total_score = sum((source_points, mention_points, structure_score))
+    suggestions = _generate_suggestions(
+        source_quality, mention_result, structure_score
+    )
+
+    return {
+        "total_score": total_score,
+        "max_score": 100,
+        "source_quality": source_quality,
+        "mention_analysis": mention_result,
+        "structure_score": structure_score,
+        "suggestions": suggestions
+    }
+
+
+def _generate_suggestions(source_quality: Dict, mention_result: Dict, 
+                          structure_score: int) -> List[str]:
+    """Turn the three quality signals into a list of improvement hints.
+
+    Args:
+        source_quality: Dict holding a numeric ``quality_score``.
+        mention_result: Dict holding ``is_mentioned`` and ``total_count``.
+        structure_score: Structure score of the content.
+
+    Returns:
+        Suggestion strings in the order: source quality, brand mentions,
+        structure. Empty when nothing needs improving.
+    """
+    hints: List[str] = []
+
+    # Weak sources.
+    if source_quality["quality_score"] < 50:
+        hints.append("来源质量较低，建议添加真实的行业报告或数据来源")
+
+    # Brand missing entirely, or present fewer than twice.
+    if mention_result["is_mentioned"]:
+        if mention_result["total_count"] < 2:
+            hints.append("品牌提及次数较少，建议增加到 2-3 次")
+    else:
+        hints.append("内容中未提及品牌，建议在合适位置自然植入品牌信息")
+
+    # Fewer than two structural signals present.
+    if structure_score < 20:
+        hints.append("内容结构不够清晰，建议添加标题、列表或 FAQ")
+
+    return hints