# ChouJuGEO/modules/ai_search_verifier.py

"""
AI 搜索验证模块
支持使用真实的 AI 搜索引擎（Perplexity、ChatGPT Search）验证品牌提及
"""

import json
import logging
import re
from typing import Dict, List, Optional, Any
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class SearchResult:
    """Search result.

    NOTE(review): nothing in the visible code constructs this class; the
    verifier methods return plain dicts with similar keys. The field notes
    below are inferred from those dicts -- confirm against callers.
    """
    query: str  # search query text
    response: str  # answer text returned for the query
    sources: List[Dict[str, str]]  # source/citation entries (exact keys not shown here; confirm)
    brand_mentioned: bool  # whether the brand appears in the response
    mention_count: int  # number of brand mentions
    mention_positions: List[str]  # presumably one position label per mention; confirm
    sentiment: str  # positive, neutral, negative


class AISearchVerifier:
    """AI 搜索验证器"""

    def __init__(self, perplexity_api_key: Optional[str] = None):
        """
        Create a verifier.

        Args:
            perplexity_api_key: Perplexity API key. When it is missing or
                empty, verify_with_perplexity returns a mock result instead
                of calling the API.
        """
        self.perplexity_api_key = perplexity_api_key

    def verify_with_perplexity(self, query: str, brand: str) -> Dict:
        """
        Ask Perplexity the query and analyze how the answer mentions the brand.

        Falls back to a mock result (``is_mock: True``) when no API key is
        configured or when ``httpx`` is not installed.

        Args:
            query: Search query, sent as the user message.
            brand: Brand name to look for in the answer.

        Returns:
            On success, a dict with ``success`` (True), ``query``, ``brand``,
            ``response``, ``sources``, ``mention_count``,
            ``mention_positions``, ``mentioned`` and ``sentiment``.
            On failure, a dict with ``success`` (False), ``query``, ``brand``
            and ``error``. API, network and response-parsing errors are
            reported through the failure dict rather than raised.
        """
        if not self.perplexity_api_key:
            return self._mock_verification(query, brand)

        # Only the import itself decides whether we fall back to the mock.
        try:
            import httpx
        except ImportError:
            logger.warning("httpx 未安装，无法调用 Perplexity API")
            return self._mock_verification(query, brand)

        def failure(message: str) -> Dict:
            # Failure results carry query/brand as well, so batch consumers
            # can tell which query failed and can read r["query"] safely.
            return {
                "success": False,
                "query": query,
                "brand": brand,
                "error": message,
            }

        headers = {
            "Authorization": f"Bearer {self.perplexity_api_key}",
            "Content-Type": "application/json"
        }

        # NOTE(review): the model name and the return_citations /
        # search_recency_filter options are kept as they were; confirm they
        # are still accepted by the current Perplexity API.
        payload = {
            "model": "llama-3.1-sonar-small-128k-online",
            "messages": [
                {
                    "role": "system",
                    "content": "You are a helpful assistant. Answer the user's question based on real-time search results. Be factual and cite your sources."
                },
                {
                    "role": "user",
                    "content": query
                }
            ],
            "max_tokens": 1000,
            "temperature": 0.1,
            "return_citations": True,
            "search_recency_filter": "month"
        }

        try:
            response = httpx.post(
                "https://api.perplexity.ai/chat/completions",
                json=payload,
                headers=headers,
                timeout=30.0
            )

            if response.status_code != 200:
                logger.error(
                    "Perplexity API 错误: %s %s",
                    response.status_code,
                    response.text,
                )
                return failure(f"API 错误: {response.status_code}")

            result = response.json()
            content = result["choices"][0]["message"]["content"]
            citations = result.get("citations", [])

            # Analyze how the answer mentions the brand.
            mention_analysis = self._analyze_mention(content, brand)
        except Exception as e:
            # Boundary handler: any transport or parsing problem becomes a
            # failure result so one bad query does not abort a batch.
            logger.error("Perplexity 验证失败: %s", e)
            return failure(str(e))

        return {
            "success": True,
            "query": query,
            "brand": brand,
            "response": content,
            "sources": citations,
            "mention_count": mention_analysis["count"],
            "mention_positions": mention_analysis["positions"],
            "mentioned": mention_analysis["count"] > 0,
            "sentiment": mention_analysis["sentiment"]
        }

    def _mock_verification(self, query: str, brand: str) -> Dict:
        """Build the placeholder result used when the real API cannot be called.

        The result has the same keys as a successful verification, reports no
        brand mention, and is flagged with ``is_mock: True``.
        """
        placeholder = f"（模拟结果）关于 '{query}' 的搜索结果需要配置 Perplexity API Key 才能获取真实数据。"
        mock_result: Dict = dict(
            success=True,
            query=query,
            brand=brand,
            response=placeholder,
            sources=[],
            mention_count=0,
            mention_positions=[],
            mentioned=False,
            sentiment="neutral",
            is_mock=True,
        )
        return mock_result

    def _analyze_mention(self, text: str, brand: str) -> Dict:
        """
        Count case-insensitive brand mentions in the text and locate them.

        Args:
            text: Text to scan.
            brand: Brand name. An empty or whitespace-only brand is treated
                as "not mentioned".

        Returns:
            A dict with ``count`` (number of non-overlapping matches),
            ``positions`` (one label per match: "前1/3", "中1/3" or "后1/3",
            i.e. first / middle / last third of the text) and ``sentiment``
            (the label returned by ``_analyze_sentiment``).
        """
        text_lower = text.lower()
        brand_lower = brand.lower()

        # An empty brand would match at every index (and divide by zero on an
        # empty text), so report it as no mention at all.
        if not brand_lower.strip():
            return {"count": 0, "positions": [], "sentiment": "neutral"}

        # Match offsets refer to the lowercased text, so measure that same
        # string; lower() can change the length for a few Unicode characters.
        total_len = len(text_lower)

        positions = []
        for match in re.finditer(re.escape(brand_lower), text_lower):
            # total_len is non-zero here: a non-empty brand was found.
            pos_ratio = match.start() / total_len
            if pos_ratio < 0.33:
                positions.append("前1/3")
            elif pos_ratio < 0.67:
                positions.append("中1/3")
            else:
                positions.append("后1/3")

        # Sentiment analysis (simple keyword rules).
        sentiment = self._analyze_sentiment(text, brand)

        return {
            # One label is recorded per match, so this is the mention count.
            "count": len(positions),
            "positions": positions,
            "sentiment": sentiment
        }

    def _analyze_sentiment(self, text: str, brand: str) -> str:
        """
        Classify the sentiment around brand mentions with keyword rules.

        For every case-insensitive brand match, the surrounding window
        (50 characters on each side) is checked for positive and negative
        keywords by substring test. Each keyword found in a window scores
        one point; overlapping windows are scored independently.

        Args:
            text: Text to scan.
            brand: Brand name. An empty or whitespace-only brand yields
                "neutral".

        Returns:
            "positive" if the positive score exceeds 1.5x the negative score,
            "negative" in the mirrored case, otherwise "neutral" (also when
            the brand is not found).
        """
        text_lower = text.lower()
        brand_lower = brand.lower()

        # An empty pattern matches at every index, which would attribute the
        # sentiment of the whole text to a brand that was never named.
        if not brand_lower.strip():
            return "neutral"

        # Positive keywords
        positive_words = (
            "优秀", "出色", "领先", "推荐", "首选", "最佳", "强大", "高效",
            "创新", "专业", "可靠", "稳定", "卓越", "突出", "显著",
            "excellent", "outstanding", "leading", "recommended", "best",
            "powerful", "efficient", "innovative", "professional", "reliable",
        )

        # Negative keywords
        negative_words = (
            "问题", "缺陷", "不足", "失败", "风险", "警告", "谨慎", "避免",
            "差", "慢", "贵", "复杂", "困难", "不稳定",
            "issue", "problem", "defect", "risk", "warning", "avoid",
            "poor", "slow", "expensive", "complex", "difficult", "unstable",
        )

        # Context window around each mention. Offsets and slicing both use
        # the lowercased text so they always refer to the same string.
        window = 50
        text_len = len(text_lower)
        contexts = [
            text_lower[max(0, m.start() - window):min(text_len, m.end() + window)]
            for m in re.finditer(re.escape(brand_lower), text_lower)
        ]

        if not contexts:
            return "neutral"

        positive_score = sum(
            word in context for context in contexts for word in positive_words
        )
        negative_score = sum(
            word in context for context in contexts for word in negative_words
        )

        if positive_score > negative_score * 1.5:
            return "positive"
        if negative_score > positive_score * 1.5:
            return "negative"
        return "neutral"

    def batch_verify(self, queries: List[str], brand: str,
                     api_key: Optional[str] = None) -> List[Dict]:
        """
        Verify several queries one after another.

        Args:
            queries: Queries to verify, processed in order.
            brand: Brand name to look for in every answer.
            api_key: Optional API key. A non-empty value replaces the key
                stored on this instance and stays in effect afterwards.

        Returns:
            One result dict per query, in the same order as ``queries``.
        """
        if api_key:
            self.perplexity_api_key = api_key

        return [self.verify_with_perplexity(item, brand) for item in queries]

    def generate_verification_report(self, results: List[Dict]) -> Dict:
        """
        Summarize a list of verification results.

        Results whose ``success`` is falsy (failed API calls) carry no
        analysis data. They are counted in ``total_queries`` and
        ``failed_count`` but excluded from every other statistic. Results
        without a ``success`` key are treated as successful.

        Args:
            results: Result dicts as returned by ``verify_with_perplexity``.

        Returns:
            A dict with:
            ``total_queries``: number of results passed in;
            ``failed_count``: number of failed results;
            ``mentioned_count``: evaluated results that mention the brand;
            ``mention_rate``: mentioned / evaluated (0 when none evaluated);
            ``avg_mentions_per_query``: mean ``mention_count`` over evaluated
            results (0 when none evaluated);
            ``sentiment_distribution``: count per sentiment label;
            ``top_mentioned_queries`` / ``not_mentioned_queries``: up to five
            queries of each kind, in input order.
        """
        # A failed result looks like {"success": False, "error": ...}: it has
        # no mention data, so it must not count as "not mentioned".
        evaluated = [r for r in results if r.get("success", True)]
        evaluated_count = len(evaluated)
        failed_count = len(results) - evaluated_count

        mentioned_results = [r for r in evaluated if r.get("mentioned", False)]
        not_mentioned_results = [
            r for r in evaluated if not r.get("mentioned", False)
        ]
        mentioned = len(mentioned_results)

        mention_rate = mentioned / evaluated_count if evaluated_count > 0 else 0

        # Sentiment distribution; unknown labels get their own bucket.
        sentiment_counts = {"positive": 0, "neutral": 0, "negative": 0}
        for r in evaluated:
            sentiment = r.get("sentiment", "neutral")
            sentiment_counts[sentiment] = sentiment_counts.get(sentiment, 0) + 1

        # Average number of mentions per evaluated query.
        total_mentions = sum(r.get("mention_count", 0) for r in evaluated)
        avg_mentions = total_mentions / evaluated_count if evaluated_count > 0 else 0

        return {
            "total_queries": len(results),
            "failed_count": failed_count,
            "mentioned_count": mentioned,
            "mention_rate": mention_rate,
            "avg_mentions_per_query": avg_mentions,
            "sentiment_distribution": sentiment_counts,
            # .get() keeps a result without a "query" key from raising.
            "top_mentioned_queries": [
                r.get("query", "") for r in mentioned_results[:5]
            ],
            "not_mentioned_queries": [
                r.get("query", "") for r in not_mentioned_results[:5]
            ]
        }


class SemanticMentionDetector:
    """语义级提及检测器"""

    def __init__(self):
        """Create a detector with no aliases registered."""
        # Brand alias/synonym mapping. add_brand_aliases stores both the
        # brand key and its aliases lowercased.
        self.brand_aliases: Dict[str, List[str]] = {}

    def add_brand_aliases(self, brand: str, aliases: List[str]):
        """
        Register the aliases of a brand.

        The brand and its aliases are stored lowercased. Calling this again
        for the same brand replaces the previously registered aliases.
        Aliases that would distort mention counts are dropped: empty or
        whitespace-only strings, duplicates, and aliases equal to the brand
        name itself.

        Args:
            brand: Brand name.
            aliases: Alias list.
        """
        brand_key = brand.lower()

        cleaned: List[str] = []
        for alias in aliases:
            alias_lower = alias.lower()
            # An empty alias matches at every index; a duplicate or the brand
            # itself would count the same mention twice.
            if not alias_lower.strip():
                continue
            if alias_lower == brand_key or alias_lower in cleaned:
                continue
            cleaned.append(alias_lower)

        self.brand_aliases[brand_key] = cleaned

    def detect_mention(self, text: str, brand: str) -> Dict:
        """
        Detect mentions of a brand or any of its registered aliases.

        Matching is case-insensitive substring counting. An empty or
        whitespace-only brand is reported as not mentioned, and empty
        aliases are ignored.

        NOTE: an alias that contains the brand name (or the reverse) is
        counted both as a direct and as an alias mention, because the two
        counts are computed independently.

        Args:
            text: Text to scan.
            brand: Brand name.

        Returns:
            A dict with ``brand`` (as passed in), ``direct_count``,
            ``alias_counts`` (alias -> count, only aliases that occur),
            ``total_count``, ``contexts`` (see ``_extract_contexts``) and
            ``is_mentioned``.
        """
        text_lower = text.lower()
        brand_lower = brand.lower()

        # str.count("") returns len(text) + 1, which would report a brand
        # that was never named as heavily mentioned.
        if not brand_lower.strip():
            return {
                "brand": brand,
                "direct_count": 0,
                "alias_counts": {},
                "total_count": 0,
                "contexts": [],
                "is_mentioned": False
            }

        # Direct mentions
        direct_count = text_lower.count(brand_lower)

        # Alias mentions (empty aliases skipped for the same reason as above)
        aliases = [
            alias for alias in self.brand_aliases.get(brand_lower, [])
            if alias.strip()
        ]
        alias_counts = {}
        for alias in aliases:
            count = text_lower.count(alias)
            if count > 0:
                alias_counts[alias] = count

        # Total number of mentions
        total_count = direct_count + sum(alias_counts.values())

        # Surrounding text for every mention, from the same filtered aliases
        # so contexts stay consistent with the counts.
        contexts = self._extract_contexts(text, brand_lower, aliases)

        return {
            "brand": brand,
            "direct_count": direct_count,
            "alias_counts": alias_counts,
            "total_count": total_count,
            "contexts": contexts,
            "is_mentioned": total_count > 0
        }

    def _extract_contexts(self, text: str, brand_lower: str,
                          aliases: List[str]) -> List[Dict]:
        """
        Collect the surrounding text of every brand/alias occurrence.

        Args:
            text: Original text; context snippets are cut from it so they
                keep the original casing.
            brand_lower: Lowercased brand name.
            aliases: Lowercased aliases.

        Returns:
            One dict per match with ``pattern`` (the brand or alias that
            matched), ``context`` (up to 100 characters on each side of the
            match) and ``position`` (match start offset). Entries are
            grouped by pattern, brand first, then aliases in the given
            order. Empty patterns are skipped.
        """
        contexts = []
        text_lower = text.lower()
        text_len = len(text)

        for pattern in [brand_lower, *aliases]:
            # An empty pattern produces a zero-width match at every index,
            # i.e. one meaningless context per character.
            if not pattern:
                continue

            for match in re.finditer(re.escape(pattern), text_lower):
                start = max(0, match.start() - 100)
                end = min(text_len, match.end() + 100)

                contexts.append({
                    "pattern": pattern,
                    "context": text[start:end],
                    "position": match.start()
                })

        return contexts


def verify_content_quality(content: str, brand: str,
                          knowledge_base=None) -> Dict:
    """
    Score a piece of content on source quality, brand mentions and structure.

    The total is the sum of three parts:
    source quality (``quality_score * 0.4``, capped at 40), brand mentions
    (5 per mention, capped at 30, only when the brand is mentioned) and
    structure (10 each for a "##" heading marker, a numbered-list marker,
    and a question mark).

    Args:
        content: Content text.
        brand: Brand name.
        knowledge_base: Knowledge base instance (optional). Not used by this
            function.

    Returns:
        A dict with ``total_score``, ``max_score`` (100), ``source_quality``,
        ``mention_analysis``, ``structure_score`` and ``suggestions``.
    """
    from modules.knowledge_base import SourceVerifier

    # Source quality assessment, delegated to the knowledge-base module.
    source_quality = SourceVerifier().assess_source_quality(content)

    # Brand mention analysis.
    mention_result = SemanticMentionDetector().detect_mention(content, brand)

    # Source quality: up to 40 points.
    source_points = min(40, source_quality["quality_score"] * 0.4)

    # Brand mentions: up to 30 points, nothing when the brand is absent.
    if mention_result["is_mentioned"]:
        mention_points = min(30, mention_result["total_count"] * 5)
    else:
        mention_points = 0

    # Structure: 10 points per satisfied check, up to 30.
    structure_checks = (
        "##" in content,  # has a heading
        re.search(r'\d+[.、]', content) is not None,  # has a numbered list
        "?" in content or "？" in content,  # has a question
    )
    structure_score = 10 * sum(structure_checks)

    total = source_points + mention_points + structure_score

    return {
        "total_score": total,
        "max_score": 100,
        "source_quality": source_quality,
        "mention_analysis": mention_result,
        "structure_score": structure_score,
        "suggestions": _generate_suggestions(source_quality, mention_result, structure_score)
    }


def _generate_suggestions(source_quality: Dict, mention_result: Dict,
                          structure_score: int) -> List[str]:
    """生成改进建议"""
    suggestions = []

    if source_quality["quality_score"] < 50:
        suggestions.append("来源质量较低，建议添加真实的行业报告或数据来源")

    if not mention_result["is_mentioned"]:
        suggestions.append("内容中未提及品牌，建议在合适位置自然植入品牌信息")
    elif mention_result["total_count"] < 2:
        suggestions.append("品牌提及次数较少，建议增加到 2-3 次")

    if structure_score < 20:
        suggestions.append("内容结构不够清晰，建议添加标题、列表或 FAQ")

    return suggestions