# modules/topic_cluster.py

"""
话题集群生成模块
基于关键词进行语义聚类，生成话题集群，分析话题关联，提供内容规划建议
"""
import json
import logging
import math
import re
from collections import defaultdict
from difflib import SequenceMatcher
from typing import List, Dict, Set, Optional, Tuple

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


class TopicCluster:
    """话题集群生成器"""
    
    def __init__(self):
        """Define the two prompt templates used by the LLM-backed methods.

        Both templates are plain strings with ``{placeholder}`` fields; the
        literal JSON braces in the output-format sections are doubled
        (``{{`` / ``}}``) so they survive template formatting.

        - ``clustering_prompt_template`` expects ``keywords``, ``brand``,
          ``advantages`` and ``cluster_count``.
        - ``content_planning_prompt_template`` expects ``clusters``, ``brand``
          and ``advantages``.
        """
        # Prompt for topic clustering
        self.clustering_prompt_template = """
你是话题聚类专家，专门将关键词聚类为话题集群，帮助用户系统化规划内容策略。

【关键词列表】
{keywords}

【品牌】{brand}
【优势】{advantages}
【聚类数量】{cluster_count}（建议范围：3-10个话题集群）

【话题聚类要求】

1. **语义相似性**
   - 将语义相似的关键词归为同一话题集群
   - 每个话题集群应该围绕一个核心主题
   - 话题之间应该有明显的区分度

2. **话题命名**
   - 为每个话题集群生成一个简洁、有代表性的名称（2-8字）
   - 话题名称应该能概括该集群的核心主题
   - 使用用户容易理解的语言

3. **话题描述**
   - 为每个话题集群生成一段描述（20-50字）
   - 说明该话题的核心内容和价值

4. **关键词分配**
   - 每个关键词应该只属于一个话题集群
   - 如果关键词可以属于多个话题，选择最相关的一个
   - 确保所有关键词都被分配

5. **话题关联**
   - 识别话题之间的关联关系
   - 标记强关联（直接相关）和弱关联（间接相关）

【输出格式】
请严格按照以下 JSON 格式输出，不要添加任何其他内容：

{{
  "clusters": [
    {{
      "id": 1,
      "name": "<话题名称>",
      "description": "<话题描述>",
      "keywords": ["<关键词1>", "<关键词2>", ...],
      "keyword_count": <关键词数量>,
      "priority": "<优先级：高/中/低>"
    }},
    ...
  ],
  "relationships": [
    {{
      "from": <话题ID>,
      "to": <话题ID>,
      "strength": "<关联强度：强/弱>",
      "type": "<关联类型：功能相关/场景相关/用户相关等>"
    }},
    ...
  ],
  "cluster_stats": {{
    "total_clusters": <话题总数>,
    "total_keywords": <关键词总数>,
    "avg_keywords_per_cluster": <平均每个话题的关键词数量>,
    "max_keywords": <最大话题的关键词数量>,
    "min_keywords": <最小话题的关键词数量>
  }}
}}

【开始聚类】
"""

        # Prompt for content planning
        self.content_planning_prompt_template = """
你是内容策略专家，基于话题集群生成内容规划建议。

【话题集群】
{clusters}

【品牌】{brand}
【优势】{advantages}

【内容规划要求】

1. **内容盲区分析**
   - 识别哪些话题集群缺少内容
   - 分析话题覆盖的完整性
   - 发现内容空白点

2. **内容优先级**
   - 根据话题的重要性和覆盖度，给出内容创作优先级
   - 优先覆盖高价值、低覆盖的话题

3. **内容建议**
   - 为每个话题集群提供内容创作建议
   - 包括：内容类型、发布平台、关键词策略等

4. **内容矩阵**
   - 建议话题之间的内容关联策略
   - 如何通过内容矩阵提升整体覆盖面

【输出格式】
请严格按照以下 JSON 格式输出，不要添加任何其他内容：

{{
  "content_gaps": [
    {{
      "cluster_id": <话题ID>,
      "cluster_name": "<话题名称>",
      "gap_type": "<盲区类型：完全空白/内容不足/关联缺失>",
      "description": "<盲区描述>",
      "priority": "<优先级：高/中/低>"
    }},
    ...
  ],
  "content_priorities": [
    {{
      "cluster_id": <话题ID>,
      "cluster_name": "<话题名称>",
      "priority": "<优先级：高/中/低>",
      "reason": "<优先级原因>",
      "recommended_content_count": <建议内容数量>
    }},
    ...
  ],
  "content_suggestions": [
    {{
      "cluster_id": <话题ID>,
      "cluster_name": "<话题名称>",
      "content_types": ["<内容类型1>", "<内容类型2>", ...],
      "platforms": ["<平台1>", "<平台2>", ...],
      "keyword_strategy": "<关键词策略>",
      "content_ideas": ["<内容创意1>", "<内容创意2>", ...]
    }},
    ...
  ],
  "content_matrix": {{
    "strategy": "<内容矩阵策略描述>",
    "cross_cluster_opportunities": [
      {{
        "clusters": ["<话题1>", "<话题2>"],
        "opportunity": "<关联机会描述>",
        "content_type": "<建议内容类型>"
      }},
      ...
    ]
  }}
}}

【开始规划】
"""
    
    def cluster_keywords(
        self,
        keywords: List[str],
        brand: str,
        advantages: str,
        cluster_count: int,
        llm_chain
    ) -> Dict:
        """
        将关键词聚类为话题集群
        
        Args:
            keywords: 关键词列表
            brand: 品牌名称
            advantages: 品牌优势
            cluster_count: 期望的话题集群数量（3-10）
            llm_chain: LangChain 链对象
            
        Returns:
            包含话题集群、关联关系和统计信息的字典
        """
        if not keywords:
            return {
                "clusters": [],
                "relationships": [],
                "cluster_stats": {
                    "total_clusters": 0,
                    "total_keywords": 0,
                    "avg_keywords_per_cluster": 0,
                    "max_keywords": 0,
                    "min_keywords": 0
                }
            }
        
        # 限制关键词数量，避免 Prompt 过长
        keywords_to_cluster = keywords[:100]  # 最多处理100个关键词
        
        # 限制聚类数量在合理范围
        cluster_count = max(3, min(10, cluster_count))
        
        try:
            prompt = PromptTemplate.from_template(self.clustering_prompt_template)
            chain = prompt | llm_chain | StrOutputParser()
            
            result = chain.invoke({
                "keywords": json.dumps(keywords_to_cluster, ensure_ascii=False, indent=2),
                "brand": brand,
                "advantages": advantages,
                "cluster_count": cluster_count
            })
            
            # 解析结果
            cluster_data = self._parse_clustering_result(result, keywords_to_cluster)
            
            return cluster_data
            
        except Exception as e:
            # 如果聚类失败，返回基于规则的简单聚类
            return self._rule_based_clustering(keywords_to_cluster, cluster_count)
    
    def _parse_clustering_result(self, result: str, original_keywords: List[str]) -> Dict:
        """解析聚类结果"""
        # 尝试提取 JSON
        json_match = re.search(r'\{.*\}', result, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group())
                # 验证数据结构
                if "clusters" in data:
                    # 验证和清理数据
                    data = self._validate_cluster_data(data, original_keywords)
                    return data
            except json.JSONDecodeError:
                pass
        
        # 如果无法解析 JSON，使用基于规则的聚类
        return self._rule_based_clustering(original_keywords, min(5, len(original_keywords) // 5))
    
    def _validate_cluster_data(self, data: Dict, original_keywords: List[str]) -> Dict:
        """验证和清理聚类数据"""
        if "clusters" not in data:
            return self._rule_based_clustering(original_keywords, 5)
        
        clusters = data.get("clusters", [])
        validated_clusters = []
        assigned_keywords = set()
        
        # 验证每个集群
        for cluster in clusters:
            if not isinstance(cluster, dict):
                continue
            
            cluster_id = cluster.get("id")
            name = cluster.get("name", "").strip()
            keywords = cluster.get("keywords", [])
            
            if not name or not keywords:
                continue
            
            # 过滤无效关键词
            valid_keywords = []
            for kw in keywords:
                if isinstance(kw, str) and kw.strip() and kw.strip() in original_keywords:
                    kw_clean = kw.strip()
                    if kw_clean not in assigned_keywords:
                        valid_keywords.append(kw_clean)
                        assigned_keywords.add(kw_clean)
            
            if valid_keywords:
                validated_clusters.append({
                    "id": cluster_id if cluster_id else len(validated_clusters) + 1,
                    "name": name,
                    "description": cluster.get("description", ""),
                    "keywords": valid_keywords,
                    "keyword_count": len(valid_keywords),
                    "priority": cluster.get("priority", "中")
                })
        
        # 分配未分配的关键词到最近的集群
        unassigned = [kw for kw in original_keywords if kw not in assigned_keywords]
        if unassigned and validated_clusters:
            for kw in unassigned:
                # 找到最相似的集群
                best_cluster = None
                best_similarity = 0
                for cluster in validated_clusters:
                    # 计算与集群关键词的平均相似度
                    similarities = [
                        SequenceMatcher(None, kw.lower(), ckw.lower()).ratio()
                        for ckw in cluster["keywords"][:5]  # 只比较前5个
                    ]
                    avg_sim = sum(similarities) / len(similarities) if similarities else 0
                    if avg_sim > best_similarity:
                        best_similarity = avg_sim
                        best_cluster = cluster
                
                if best_cluster and best_similarity > 0.3:
                    best_cluster["keywords"].append(kw)
                    best_cluster["keyword_count"] = len(best_cluster["keywords"])
        
        # 更新统计信息
        total_keywords = sum(c["keyword_count"] for c in validated_clusters)
        cluster_counts = [c["keyword_count"] for c in validated_clusters]
        
        data["clusters"] = validated_clusters
        data["cluster_stats"] = {
            "total_clusters": len(validated_clusters),
            "total_keywords": total_keywords,
            "avg_keywords_per_cluster": total_keywords / len(validated_clusters) if validated_clusters else 0,
            "max_keywords": max(cluster_counts) if cluster_counts else 0,
            "min_keywords": min(cluster_counts) if cluster_counts else 0
        }
        
        # 验证关联关系
        if "relationships" in data:
            relationships = []
            cluster_ids = {c["id"] for c in validated_clusters}
            for rel in data["relationships"]:
                if isinstance(rel, dict):
                    from_id = rel.get("from")
                    to_id = rel.get("to")
                    if from_id in cluster_ids and to_id in cluster_ids and from_id != to_id:
                        relationships.append(rel)
            data["relationships"] = relationships
        
        return data
    
    def _rule_based_clustering(
        self,
        keywords: List[str],
        target_clusters: int
    ) -> Dict:
        """
        基于规则的简单聚类（备用方案，不依赖 LLM）
        
        Args:
            keywords: 关键词列表
            target_clusters: 目标集群数量
            
        Returns:
            聚类结果字典
        """
        if not keywords:
            return {
                "clusters": [],
                "relationships": [],
                "cluster_stats": {
                    "total_clusters": 0,
                    "total_keywords": 0,
                    "avg_keywords_per_cluster": 0,
                    "max_keywords": 0,
                    "min_keywords": 0
                }
            }
        
        # 简单的基于关键词相似度的聚类
        clusters = []
        remaining_keywords = keywords.copy()
        
        # 计算关键词之间的相似度矩阵
        similarity_matrix = {}
        for i, kw1 in enumerate(keywords):
            for j, kw2 in enumerate(keywords[i+1:], i+1):
                sim = SequenceMatcher(None, kw1.lower(), kw2.lower()).ratio()
                similarity_matrix[(i, j)] = sim
        
        # 简单的聚类算法：找到相似度高的关键词组
        used_indices = set()
        cluster_id = 1
        
        # 按相似度排序
        sorted_pairs = sorted(similarity_matrix.items(), key=lambda x: x[1], reverse=True)
        
        for (i, j), sim in sorted_pairs:
            if i in used_indices or j in used_indices:
                continue
            
            if sim > 0.5:  # 相似度阈值
                # 创建新集群
                cluster_keywords = [keywords[i], keywords[j]]
                used_indices.add(i)
                used_indices.add(j)
                
                # 尝试添加其他相似的关键词
                for k, kw in enumerate(keywords):
                    if k in used_indices or k == i or k == j:
                        continue
                    
                    # 计算与集群的平均相似度
                    avg_sim = (sim + SequenceMatcher(None, kw.lower(), keywords[i].lower()).ratio() + 
                              SequenceMatcher(None, kw.lower(), keywords[j].lower()).ratio()) / 3
                    
                    if avg_sim > 0.4:
                        cluster_keywords.append(kw)
                        used_indices.add(k)
                
                # 生成集群名称（使用第一个关键词的主要部分）
                cluster_name = self._extract_topic_name(cluster_keywords[0])
                
                clusters.append({
                    "id": cluster_id,
                    "name": cluster_name,
                    "description": f"包含 {len(cluster_keywords)} 个相关关键词",
                    "keywords": cluster_keywords,
                    "keyword_count": len(cluster_keywords),
                    "priority": "中"
                })
                cluster_id += 1
                
                if len(clusters) >= target_clusters:
                    break
        
        # 分配剩余关键词到最近的集群
        for i, kw in enumerate(keywords):
            if i not in used_indices:
                if clusters:
                    # 找到最相似的集群
                    best_cluster = None
                    best_sim = 0
                    for cluster in clusters:
                        avg_sim = sum(
                            SequenceMatcher(None, kw.lower(), ckw.lower()).ratio()
                            for ckw in cluster["keywords"][:3]
                        ) / min(3, len(cluster["keywords"]))
                        if avg_sim > best_sim:
                            best_sim = avg_sim
                            best_cluster = cluster
                    
                    if best_cluster and best_sim > 0.2:
                        best_cluster["keywords"].append(kw)
                        best_cluster["keyword_count"] = len(best_cluster["keywords"])
                    else:
                        # 创建新集群
                        clusters.append({
                            "id": cluster_id,
                            "name": self._extract_topic_name(kw),
                            "description": f"包含 1 个关键词",
                            "keywords": [kw],
                            "keyword_count": 1,
                            "priority": "低"
                        })
                        cluster_id += 1
                else:
                    # 创建第一个集群
                    clusters.append({
                        "id": cluster_id,
                        "name": self._extract_topic_name(kw),
                        "description": f"包含 1 个关键词",
                        "keywords": [kw],
                        "keyword_count": 1,
                        "priority": "中"
                    })
                    cluster_id += 1
        
        # 生成简单的关联关系
        relationships = []
        for i, cluster1 in enumerate(clusters):
            for j, cluster2 in enumerate(clusters[i+1:], i+1):
                # 计算集群之间的相似度
                similarities = [
                    SequenceMatcher(None, kw1.lower(), kw2.lower()).ratio()
                    for kw1 in cluster1["keywords"][:3]
                    for kw2 in cluster2["keywords"][:3]
                ]
                avg_sim = sum(similarities) / len(similarities) if similarities else 0
                
                if avg_sim > 0.3:
                    relationships.append({
                        "from": cluster1["id"],
                        "to": cluster2["id"],
                        "strength": "强" if avg_sim > 0.5 else "弱",
                        "type": "语义相关"
                    })
        
        # 计算统计信息
        total_keywords = sum(c["keyword_count"] for c in clusters)
        cluster_counts = [c["keyword_count"] for c in clusters]
        
        return {
            "clusters": clusters,
            "relationships": relationships,
            "cluster_stats": {
                "total_clusters": len(clusters),
                "total_keywords": total_keywords,
                "avg_keywords_per_cluster": total_keywords / len(clusters) if clusters else 0,
                "max_keywords": max(cluster_counts) if cluster_counts else 0,
                "min_keywords": min(cluster_counts) if cluster_counts else 0
            }
        }
    
    def _extract_topic_name(self, keyword: str) -> str:
        """从关键词中提取话题名称"""
        # 简单的提取逻辑：取关键词的前几个字或核心词
        if len(keyword) <= 6:
            return keyword
        
        # 尝试提取核心词（去除常见修饰词）
        common_modifiers = ["的", "和", "与", "或", "及", "等", "如何", "怎么", "什么", "哪个", "哪家"]
        words = keyword
        for mod in common_modifiers:
            words = words.replace(mod, " ")
        
        words = words.split()
        if words:
            return words[0][:8] if len(words[0]) > 8 else words[0]
        
        return keyword[:8]
    
    def generate_content_planning(
        self,
        clusters: List[Dict],
        brand: str,
        advantages: str,
        llm_chain
    ) -> Dict:
        """
        基于话题集群生成内容规划建议
        
        Args:
            clusters: 话题集群列表
            brand: 品牌名称
            advantages: 品牌优势
            llm_chain: LangChain 链对象
            
        Returns:
            内容规划建议字典
        """
        if not clusters:
            return {
                "content_gaps": [],
                "content_priorities": [],
                "content_suggestions": [],
                "content_matrix": {
                    "strategy": "",
                    "cross_cluster_opportunities": []
                }
            }
        
        try:
            prompt = PromptTemplate.from_template(self.content_planning_prompt_template)
            chain = prompt | llm_chain | StrOutputParser()
            
            result = chain.invoke({
                "clusters": json.dumps(clusters, ensure_ascii=False, indent=2),
                "brand": brand,
                "advantages": advantages
            })
            
            # 解析结果
            planning_data = self._parse_planning_result(result)
            
            return planning_data
            
        except Exception as e:
            # 如果规划失败，返回基于规则的简单规划
            return self._rule_based_planning(clusters)
    
    def _parse_planning_result(self, result: str) -> Dict:
        """解析内容规划结果"""
        json_match = re.search(r'\{.*\}', result, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group())
                # 验证数据结构
                if "content_gaps" in data or "content_priorities" in data:
                    return data
            except json.JSONDecodeError:
                pass
        
        # 如果无法解析，返回空结果
        return {
            "content_gaps": [],
            "content_priorities": [],
            "content_suggestions": [],
            "content_matrix": {
                "strategy": "",
                "cross_cluster_opportunities": []
            }
        }
    
    def _rule_based_planning(self, clusters: List[Dict]) -> Dict:
        """基于规则的简单内容规划（备用方案）"""
        content_gaps = []
        content_priorities = []
        content_suggestions = []
        
        for cluster in clusters:
            cluster_id = cluster.get("id")
            cluster_name = cluster.get("name", "")
            keyword_count = cluster.get("keyword_count", 0)
            
            # 根据关键词数量判断优先级
            if keyword_count >= 10:
                priority = "高"
            elif keyword_count >= 5:
                priority = "中"
            else:
                priority = "低"
            
            content_priorities.append({
                "cluster_id": cluster_id,
                "cluster_name": cluster_name,
                "priority": priority,
                "reason": f"包含 {keyword_count} 个关键词",
                "recommended_content_count": max(1, keyword_count // 3)
            })
            
            # 生成简单的内容建议
            content_suggestions.append({
                "cluster_id": cluster_id,
                "cluster_name": cluster_name,
                "content_types": ["文章", "指南", "案例"],
                "platforms": ["博客", "知乎", "小红书"],
                "keyword_strategy": f"围绕 {cluster_name} 主题创作内容",
                "content_ideas": [
                    f"{cluster_name} 完整指南",
                    f"{cluster_name} 最佳实践",
                    f"{cluster_name} 案例分析"
                ]
            })
        
        return {
            "content_gaps": content_gaps,
            "content_priorities": content_priorities,
            "content_suggestions": content_suggestions,
            "content_matrix": {
                "strategy": "建议围绕各话题集群系统化创作内容，建立完整的内容矩阵",
                "cross_cluster_opportunities": []
            }
        }
    
    def analyze_cluster_coverage(
        self,
        clusters: List[Dict],
        historical_keywords: List[str]
    ) -> Dict:
        """
        分析话题集群的覆盖情况
        
        Args:
            clusters: 话题集群列表
            historical_keywords: 历史关键词列表（用于分析覆盖度）
            
        Returns:
            覆盖分析结果
        """
        if not clusters:
            return {
                "coverage_ratio": 0.0,
                "cluster_distribution": {},
                "gaps": []
            }
        
        # 统计每个集群的关键词数量
        cluster_distribution = {
            cluster["name"]: cluster["keyword_count"]
            for cluster in clusters
        }
        
        # 计算覆盖比例（如果有历史关键词）
        coverage_ratio = 0.0
        if historical_keywords:
            cluster_keywords = set()
            for cluster in clusters:
                cluster_keywords.update(cluster.get("keywords", []))
            
            covered = len(cluster_keywords & set(historical_keywords))
            coverage_ratio = covered / len(historical_keywords) if historical_keywords else 0.0
        
        # 识别覆盖盲区（关键词数量少的集群）
        gaps = [
            {
                "cluster_name": cluster["name"],
                "keyword_count": cluster["keyword_count"],
                "priority": "高" if cluster["keyword_count"] < 3 else "中"
            }
            for cluster in clusters
            if cluster["keyword_count"] < 5
        ]
        
        return {
            "coverage_ratio": coverage_ratio,
            "cluster_distribution": cluster_distribution,
            "gaps": gaps
        }
    
    def get_visualization_data(
        self,
        clusters: List[Dict],
        relationships: List[Dict]
    ) -> Dict:
        """
        生成可视化数据（用于网络图和树状图）
        
        Args:
            clusters: 话题集群列表
            relationships: 关联关系列表
            
        Returns:
            可视化数据字典
        """
        # 节点数据（话题集群）
        nodes = [
            {
                "id": cluster["id"],
                "name": cluster["name"],
                "size": cluster["keyword_count"],
                "keywords": cluster["keywords"],
                "description": cluster.get("description", "")
            }
            for cluster in clusters
        ]
        
        # 边数据（关联关系）
        edges = [
            {
                "source": rel["from"],
                "target": rel["to"],
                "strength": rel.get("strength", "弱"),
                "type": rel.get("type", "相关")
            }
            for rel in relationships
        ]
        
        return {
            "nodes": nodes,
            "edges": edges
        }
feat: 重构项目结构并添加平台同步基础架构 2026-01-30 10:21:29 +08:00			`"""`
			`话题集群生成模块`
			`基于关键词进行语义聚类，生成话题集群，分析话题关联，提供内容规划建议`
			`"""`
			`from typing import List, Dict, Set, Optional, Tuple`
			`from langchain_core.prompts import PromptTemplate`
			`from langchain_core.output_parsers import StrOutputParser`
			`import json`
			`import re`
			`from collections import defaultdict`
			`from difflib import SequenceMatcher`
			`import math`


			`class TopicCluster:`
			`"""话题集群生成器"""`

			`def __init__(self):`
			`# 话题聚类 Prompt`
			`self.clustering_prompt_template = """`
			`你是话题聚类专家，专门将关键词聚类为话题集群，帮助用户系统化规划内容策略。`

			`【关键词列表】`
			`{keywords}`

			`【品牌】{brand}`
			`【优势】{advantages}`
			`【聚类数量】{cluster_count}（建议范围：3-10个话题集群）`

			`【话题聚类要求】`

			`1. 语义相似性`
			`- 将语义相似的关键词归为同一话题集群`
			`- 每个话题集群应该围绕一个核心主题`
			`- 话题之间应该有明显的区分度`

			`2. 话题命名`
			`- 为每个话题集群生成一个简洁、有代表性的名称（2-8字）`
			`- 话题名称应该能概括该集群的核心主题`
			`- 使用用户容易理解的语言`

			`3. 话题描述`
			`- 为每个话题集群生成一段描述（20-50字）`
			`- 说明该话题的核心内容和价值`

			`4. 关键词分配`
			`- 每个关键词应该只属于一个话题集群`
			`- 如果关键词可以属于多个话题，选择最相关的一个`
			`- 确保所有关键词都被分配`

			`5. 话题关联`
			`- 识别话题之间的关联关系`
			`- 标记强关联（直接相关）和弱关联（间接相关）`

			`【输出格式】`
			`请严格按照以下 JSON 格式输出，不要添加任何其他内容：`

			`{{`
			`"clusters": [`
			`{{`
			`"id": 1,`
			`"name": "<话题名称>",`
			`"description": "<话题描述>",`
			`"keywords": ["<关键词1>", "<关键词2>", ...],`
			`"keyword_count": <关键词数量>,`
			`"priority": "<优先级：高/中/低>"`
			`}},`
			`...`
			`],`
			`"relationships": [`
			`{{`
			`"from": <话题ID>,`
			`"to": <话题ID>,`
			`"strength": "<关联强度：强/弱>",`
			`"type": "<关联类型：功能相关/场景相关/用户相关等>"`
			`}},`
			`...`
			`],`
			`"cluster_stats": {{`
			`"total_clusters": <话题总数>,`
			`"total_keywords": <关键词总数>,`
			`"avg_keywords_per_cluster": <平均每个话题的关键词数量>,`
			`"max_keywords": <最大话题的关键词数量>,`
			`"min_keywords": <最小话题的关键词数量>`
			`}}`
			`}}`

			`【开始聚类】`
			`"""`

			`# 内容规划 Prompt`
			`self.content_planning_prompt_template = """`
			`你是内容策略专家，基于话题集群生成内容规划建议。`

			`【话题集群】`
			`{clusters}`

			`【品牌】{brand}`
			`【优势】{advantages}`

			`【内容规划要求】`

			`1. 内容盲区分析`
			`- 识别哪些话题集群缺少内容`
			`- 分析话题覆盖的完整性`
			`- 发现内容空白点`

			`2. 内容优先级`
			`- 根据话题的重要性和覆盖度，给出内容创作优先级`
			`- 优先覆盖高价值、低覆盖的话题`

			`3. 内容建议`
			`- 为每个话题集群提供内容创作建议`
			`- 包括：内容类型、发布平台、关键词策略等`

			`4. 内容矩阵`
			`- 建议话题之间的内容关联策略`
			`- 如何通过内容矩阵提升整体覆盖面`

			`【输出格式】`
			`请严格按照以下 JSON 格式输出，不要添加任何其他内容：`

			`{{`
			`"content_gaps": [`
			`{{`
			`"cluster_id": <话题ID>,`
			`"cluster_name": "<话题名称>",`
			`"gap_type": "<盲区类型：完全空白/内容不足/关联缺失>",`
			`"description": "<盲区描述>",`
			`"priority": "<优先级：高/中/低>"`
			`}},`
			`...`
			`],`
			`"content_priorities": [`
			`{{`
			`"cluster_id": <话题ID>,`
			`"cluster_name": "<话题名称>",`
			`"priority": "<优先级：高/中/低>",`
			`"reason": "<优先级原因>",`
			`"recommended_content_count": <建议内容数量>`
			`}},`
			`...`
			`],`
			`"content_suggestions": [`
			`{{`
			`"cluster_id": <话题ID>,`
			`"cluster_name": "<话题名称>",`
			`"content_types": ["<内容类型1>", "<内容类型2>", ...],`
			`"platforms": ["<平台1>", "<平台2>", ...],`
			`"keyword_strategy": "<关键词策略>",`
			`"content_ideas": ["<内容创意1>", "<内容创意2>", ...]`
			`}},`
			`...`
			`],`
			`"content_matrix": {{`
			`"strategy": "<内容矩阵策略描述>",`
			`"cross_cluster_opportunities": [`
			`{{`
			`"clusters": ["<话题1>", "<话题2>"],`
			`"opportunity": "<关联机会描述>",`
			`"content_type": "<建议内容类型>"`
			`}},`
			`...`
			`]`
			`}}`
			`}}`

			`【开始规划】`
			`"""`

			`def cluster_keywords(`
			`self,`
			`keywords: List[str],`
			`brand: str,`
			`advantages: str,`
			`cluster_count: int,`
			`llm_chain`
			`) -> Dict:`
			`"""`
			`将关键词聚类为话题集群`

			`Args:`
			`keywords: 关键词列表`
			`brand: 品牌名称`
			`advantages: 品牌优势`
			`cluster_count: 期望的话题集群数量（3-10）`
			`llm_chain: LangChain 链对象`

			`Returns:`
			`包含话题集群、关联关系和统计信息的字典`
			`"""`
			`if not keywords:`
			`return {`
			`"clusters": [],`
			`"relationships": [],`
			`"cluster_stats": {`
			`"total_clusters": 0,`
			`"total_keywords": 0,`
			`"avg_keywords_per_cluster": 0,`
			`"max_keywords": 0,`
			`"min_keywords": 0`
			`}`
			`}`

			`# 限制关键词数量，避免 Prompt 过长`
			`keywords_to_cluster = keywords[:100] # 最多处理100个关键词`

			`# 限制聚类数量在合理范围`
			`cluster_count = max(3, min(10, cluster_count))`

			`try:`
			`prompt = PromptTemplate.from_template(self.clustering_prompt_template)`
			`chain = prompt \| llm_chain \| StrOutputParser()`

			`result = chain.invoke({`
			`"keywords": json.dumps(keywords_to_cluster, ensure_ascii=False, indent=2),`
			`"brand": brand,`
			`"advantages": advantages,`
			`"cluster_count": cluster_count`
			`})`

			`# 解析结果`
			`cluster_data = self._parse_clustering_result(result, keywords_to_cluster)`

			`return cluster_data`

			`except Exception as e:`
			`# 如果聚类失败，返回基于规则的简单聚类`
			`return self._rule_based_clustering(keywords_to_cluster, cluster_count)`

			`def _parse_clustering_result(self, result: str, original_keywords: List[str]) -> Dict:`
			`"""解析聚类结果"""`
			`# 尝试提取 JSON`
			`json_match = re.search(r'\{.*\}', result, re.DOTALL)`
			`if json_match:`
			`try:`
			`data = json.loads(json_match.group())`
			`# 验证数据结构`
			`if "clusters" in data:`
			`# 验证和清理数据`
			`data = self._validate_cluster_data(data, original_keywords)`
			`return data`
			`except json.JSONDecodeError:`
			`pass`

			`# 如果无法解析 JSON，使用基于规则的聚类`
			`return self._rule_based_clustering(original_keywords, min(5, len(original_keywords) // 5))`

			`def _validate_cluster_data(self, data: Dict, original_keywords: List[str]) -> Dict:`
			`"""验证和清理聚类数据"""`
			`if "clusters" not in data:`
			`return self._rule_based_clustering(original_keywords, 5)`

			`clusters = data.get("clusters", [])`
			`validated_clusters = []`
			`assigned_keywords = set()`

			`# 验证每个集群`
			`for cluster in clusters:`
			`if not isinstance(cluster, dict):`
			`continue`

			`cluster_id = cluster.get("id")`
			`name = cluster.get("name", "").strip()`
			`keywords = cluster.get("keywords", [])`

			`if not name or not keywords:`
			`continue`

			`# 过滤无效关键词`
			`valid_keywords = []`
			`for kw in keywords:`
			`if isinstance(kw, str) and kw.strip() and kw.strip() in original_keywords:`
			`kw_clean = kw.strip()`
			`if kw_clean not in assigned_keywords:`
			`valid_keywords.append(kw_clean)`
			`assigned_keywords.add(kw_clean)`

			`if valid_keywords:`
			`validated_clusters.append({`
			`"id": cluster_id if cluster_id else len(validated_clusters) + 1,`
			`"name": name,`
			`"description": cluster.get("description", ""),`
			`"keywords": valid_keywords,`
			`"keyword_count": len(valid_keywords),`
			`"priority": cluster.get("priority", "中")`
			`})`

			`# 分配未分配的关键词到最近的集群`
			`unassigned = [kw for kw in original_keywords if kw not in assigned_keywords]`
			`if unassigned and validated_clusters:`
			`for kw in unassigned:`
			`# 找到最相似的集群`
			`best_cluster = None`
			`best_similarity = 0`
			`for cluster in validated_clusters:`
			`# 计算与集群关键词的平均相似度`
			`similarities = [`
			`SequenceMatcher(None, kw.lower(), ckw.lower()).ratio()`
			`for ckw in cluster["keywords"][:5] # 只比较前5个`
			`]`
			`avg_sim = sum(similarities) / len(similarities) if similarities else 0`
			`if avg_sim > best_similarity:`
			`best_similarity = avg_sim`
			`best_cluster = cluster`

			`if best_cluster and best_similarity > 0.3:`
			`best_cluster["keywords"].append(kw)`
			`best_cluster["keyword_count"] = len(best_cluster["keywords"])`

			`# 更新统计信息`
			`total_keywords = sum(c["keyword_count"] for c in validated_clusters)`
			`cluster_counts = [c["keyword_count"] for c in validated_clusters]`

			`data["clusters"] = validated_clusters`
			`data["cluster_stats"] = {`
			`"total_clusters": len(validated_clusters),`
			`"total_keywords": total_keywords,`
			`"avg_keywords_per_cluster": total_keywords / len(validated_clusters) if validated_clusters else 0,`
			`"max_keywords": max(cluster_counts) if cluster_counts else 0,`
			`"min_keywords": min(cluster_counts) if cluster_counts else 0`
			`}`

			`# 验证关联关系`
			`if "relationships" in data:`
			`relationships = []`
			`cluster_ids = {c["id"] for c in validated_clusters}`
			`for rel in data["relationships"]:`
			`if isinstance(rel, dict):`
			`from_id = rel.get("from")`
			`to_id = rel.get("to")`
			`if from_id in cluster_ids and to_id in cluster_ids and from_id != to_id:`
			`relationships.append(rel)`
			`data["relationships"] = relationships`

			`return data`

			`def _rule_based_clustering(`
			`self,`
			`keywords: List[str],`
			`target_clusters: int`
			`) -> Dict:`
			`"""`
			`基于规则的简单聚类（备用方案，不依赖 LLM）`

			`Args:`
			`keywords: 关键词列表`
			`target_clusters: 目标集群数量`

			`Returns:`
			`聚类结果字典`
			`"""`
			`if not keywords:`
			`return {`
			`"clusters": [],`
			`"relationships": [],`
			`"cluster_stats": {`
			`"total_clusters": 0,`
			`"total_keywords": 0,`
			`"avg_keywords_per_cluster": 0,`
			`"max_keywords": 0,`
			`"min_keywords": 0`
			`}`
			`}`

			`# 简单的基于关键词相似度的聚类`
			`clusters = []`
			`remaining_keywords = keywords.copy()`

			`# 计算关键词之间的相似度矩阵`
			`similarity_matrix = {}`
			`for i, kw1 in enumerate(keywords):`
			`for j, kw2 in enumerate(keywords[i+1:], i+1):`
			`sim = SequenceMatcher(None, kw1.lower(), kw2.lower()).ratio()`
			`similarity_matrix[(i, j)] = sim`

			`# 简单的聚类算法：找到相似度高的关键词组`
			`used_indices = set()`
			`cluster_id = 1`

			`# 按相似度排序`
			`sorted_pairs = sorted(similarity_matrix.items(), key=lambda x: x[1], reverse=True)`

			`for (i, j), sim in sorted_pairs:`
			`if i in used_indices or j in used_indices:`
			`continue`

			`if sim > 0.5: # 相似度阈值`
			`# 创建新集群`
			`cluster_keywords = [keywords[i], keywords[j]]`
			`used_indices.add(i)`
			`used_indices.add(j)`

			`# 尝试添加其他相似的关键词`
			`for k, kw in enumerate(keywords):`
			`if k in used_indices or k == i or k == j:`
			`continue`

			`# 计算与集群的平均相似度`
			`avg_sim = (sim + SequenceMatcher(None, kw.lower(), keywords[i].lower()).ratio() +`
			`SequenceMatcher(None, kw.lower(), keywords[j].lower()).ratio()) / 3`

			`if avg_sim > 0.4:`
			`cluster_keywords.append(kw)`
			`used_indices.add(k)`

			`# 生成集群名称（使用第一个关键词的主要部分）`
			`cluster_name = self._extract_topic_name(cluster_keywords[0])`

			`clusters.append({`
			`"id": cluster_id,`
			`"name": cluster_name,`
			`"description": f"包含 {len(cluster_keywords)} 个相关关键词",`
			`"keywords": cluster_keywords,`
			`"keyword_count": len(cluster_keywords),`
			`"priority": "中"`
			`})`
			`cluster_id += 1`

			`if len(clusters) >= target_clusters:`
			`break`

			`# 分配剩余关键词到最近的集群`
			`for i, kw in enumerate(keywords):`
			`if i not in used_indices:`
			`if clusters:`
			`# 找到最相似的集群`
			`best_cluster = None`
			`best_sim = 0`
			`for cluster in clusters:`
			`avg_sim = sum(`
			`SequenceMatcher(None, kw.lower(), ckw.lower()).ratio()`
			`for ckw in cluster["keywords"][:3]`
			`) / min(3, len(cluster["keywords"]))`
			`if avg_sim > best_sim:`
			`best_sim = avg_sim`
			`best_cluster = cluster`

			`if best_cluster and best_sim > 0.2:`
			`best_cluster["keywords"].append(kw)`
			`best_cluster["keyword_count"] = len(best_cluster["keywords"])`
			`else:`
			`# 创建新集群`
			`clusters.append({`
			`"id": cluster_id,`
			`"name": self._extract_topic_name(kw),`
			`"description": f"包含 1 个关键词",`
			`"keywords": [kw],`
			`"keyword_count": 1,`
			`"priority": "低"`
			`})`
			`cluster_id += 1`
			`else:`
			`# 创建第一个集群`
			`clusters.append({`
			`"id": cluster_id,`
			`"name": self._extract_topic_name(kw),`
			`"description": f"包含 1 个关键词",`
			`"keywords": [kw],`
			`"keyword_count": 1,`
			`"priority": "中"`
			`})`
			`cluster_id += 1`

			`# 生成简单的关联关系`
			`relationships = []`
			`for i, cluster1 in enumerate(clusters):`
			`for j, cluster2 in enumerate(clusters[i+1:], i+1):`
			`# 计算集群之间的相似度`
			`similarities = [`
			`SequenceMatcher(None, kw1.lower(), kw2.lower()).ratio()`
			`for kw1 in cluster1["keywords"][:3]`
			`for kw2 in cluster2["keywords"][:3]`
			`]`
			`avg_sim = sum(similarities) / len(similarities) if similarities else 0`

			`if avg_sim > 0.3:`
			`relationships.append({`
			`"from": cluster1["id"],`
			`"to": cluster2["id"],`
			`"strength": "强" if avg_sim > 0.5 else "弱",`
			`"type": "语义相关"`
			`})`

			`# 计算统计信息`
			`total_keywords = sum(c["keyword_count"] for c in clusters)`
			`cluster_counts = [c["keyword_count"] for c in clusters]`

			`return {`
			`"clusters": clusters,`
			`"relationships": relationships,`
			`"cluster_stats": {`
			`"total_clusters": len(clusters),`
			`"total_keywords": total_keywords,`
			`"avg_keywords_per_cluster": total_keywords / len(clusters) if clusters else 0,`
			`"max_keywords": max(cluster_counts) if cluster_counts else 0,`
			`"min_keywords": min(cluster_counts) if cluster_counts else 0`
			`}`
			`}`

			`def _extract_topic_name(self, keyword: str) -> str:`
			`"""从关键词中提取话题名称"""`
			`# 简单的提取逻辑：取关键词的前几个字或核心词`
			`if len(keyword) <= 6:`
			`return keyword`

			`# 尝试提取核心词（去除常见修饰词）`
			`common_modifiers = ["的", "和", "与", "或", "及", "等", "如何", "怎么", "什么", "哪个", "哪家"]`
			`words = keyword`
			`for mod in common_modifiers:`
			`words = words.replace(mod, " ")`

			`words = words.split()`
			`if words:`
			`return words[0][:8] if len(words[0]) > 8 else words[0]`

			`return keyword[:8]`

			def generate_content_planning(
			    self,
			    clusters: List[Dict],
			    brand: str,
			    advantages: str,
			    llm_chain
			) -> Dict:
			    """
			    Generate content planning suggestions for a set of topic clusters.

			    The clusters are serialized to JSON, sent through the content planning
			    prompt and the given LLM, and the reply is parsed with
			    ``_parse_planning_result``. If anything in that pipeline raises, the failure
			    is logged and the rule-based plan from ``_rule_based_planning`` is returned
			    instead.

			    Args:
			        clusters: Topic clusters to plan content for.
			        brand: Brand name inserted into the prompt.
			        advantages: Brand advantages inserted into the prompt.
			        llm_chain: LangChain runnable piped between the prompt and the parser.

			    Returns:
			        Content planning dict. With no clusters, an empty plan is returned
			        without calling the LLM.
			    """
			    if not clusters:
			        return {
			            "content_gaps": [],
			            "content_priorities": [],
			            "content_suggestions": [],
			            "content_matrix": {
			                "strategy": "",
			                "cross_cluster_opportunities": []
			            }
			        }

			    try:
			        prompt = PromptTemplate.from_template(self.content_planning_prompt_template)
			        chain = prompt | llm_chain | StrOutputParser()

			        raw_output = chain.invoke({
			            "clusters": json.dumps(clusters, ensure_ascii=False, indent=2),
			            "brand": brand,
			            "advantages": advantages
			        })

			        return self._parse_planning_result(raw_output)

			    except Exception:
			        # Best-effort boundary: planning must not crash the caller, but the
			        # failure is recorded so a broken LLM setup does not go unnoticed.
			        import logging

			        logging.getLogger(__name__).warning(
			            "LLM content planning failed; falling back to rule-based planning",
			            exc_info=True,
			        )
			        return self._rule_based_planning(clusters)

			`def _parse_planning_result(self, result: str) -> Dict:`
			`"""解析内容规划结果"""`
			`json_match = re.search(r'\{.*\}', result, re.DOTALL)`
			`if json_match:`
			`try:`
			`data = json.loads(json_match.group())`
			`# 验证数据结构`
			`if "content_gaps" in data or "content_priorities" in data:`
			`return data`
			`except json.JSONDecodeError:`
			`pass`

			`# 如果无法解析，返回空结果`
			`return {`
			`"content_gaps": [],`
			`"content_priorities": [],`
			`"content_suggestions": [],`
			`"content_matrix": {`
			`"strategy": "",`
			`"cross_cluster_opportunities": []`
			`}`
			`}`

			`def _rule_based_planning(self, clusters: List[Dict]) -> Dict:`
			`"""基于规则的简单内容规划（备用方案）"""`
			`content_gaps = []`
			`content_priorities = []`
			`content_suggestions = []`

			`for cluster in clusters:`
			`cluster_id = cluster.get("id")`
			`cluster_name = cluster.get("name", "")`
			`keyword_count = cluster.get("keyword_count", 0)`

			`# 根据关键词数量判断优先级`
			`if keyword_count >= 10:`
			`priority = "高"`
			`elif keyword_count >= 5:`
			`priority = "中"`
			`else:`
			`priority = "低"`

			`content_priorities.append({`
			`"cluster_id": cluster_id,`
			`"cluster_name": cluster_name,`
			`"priority": priority,`
			`"reason": f"包含 {keyword_count} 个关键词",`
			`"recommended_content_count": max(1, keyword_count // 3)`
			`})`

			`# 生成简单的内容建议`
			`content_suggestions.append({`
			`"cluster_id": cluster_id,`
			`"cluster_name": cluster_name,`
			`"content_types": ["文章", "指南", "案例"],`
			`"platforms": ["博客", "知乎", "小红书"],`
			`"keyword_strategy": f"围绕 {cluster_name} 主题创作内容",`
			`"content_ideas": [`
			`f"{cluster_name} 完整指南",`
			`f"{cluster_name} 最佳实践",`
			`f"{cluster_name} 案例分析"`
			`]`
			`})`

			`return {`
			`"content_gaps": content_gaps,`
			`"content_priorities": content_priorities,`
			`"content_suggestions": content_suggestions,`
			`"content_matrix": {`
			`"strategy": "建议围绕各话题集群系统化创作内容，建立完整的内容矩阵",`
			`"cross_cluster_opportunities": []`
			`}`
			`}`

			def analyze_cluster_coverage(
			    self,
			    clusters: List[Dict],
			    historical_keywords: List[str]
			) -> Dict:
			    """
			    Analyze how well the topic clusters cover a set of historical keywords.

			    Args:
			        clusters: Topic clusters; each must provide ``name`` and
			            ``keyword_count``, and may provide ``keywords``.
			        historical_keywords: Previously used keywords to measure coverage against.

			    Returns:
			        Dict with:
			        - ``coverage_ratio``: share of distinct historical keywords that appear
			          in any cluster (0.0 when there are no historical keywords).
			        - ``cluster_distribution``: mapping of cluster name to keyword count.
			        - ``gaps``: clusters with fewer than five keywords, marked high priority
			          below three keywords and medium otherwise.
			    """
			    if not clusters:
			        return {
			            "coverage_ratio": 0.0,
			            "cluster_distribution": {},
			            "gaps": []
			        }

			    # Keyword count per cluster, keyed by cluster name.
			    cluster_distribution = {
			        cluster["name"]: cluster["keyword_count"]
			        for cluster in clusters
			    }

			    coverage_ratio = 0.0
			    if historical_keywords:
			        # Compare distinct keywords on both sides so that duplicates in the
			        # historical list cannot deflate the ratio.
			        historical = set(historical_keywords)
			        clustered = set()
			        for cluster in clusters:
			            clustered.update(cluster.get("keywords", []))
			        coverage_ratio = len(clustered & historical) / len(historical)

			    # Thinly populated clusters are reported as coverage gaps.
			    gaps = [
			        {
			            "cluster_name": cluster["name"],
			            "keyword_count": cluster["keyword_count"],
			            "priority": "高" if cluster["keyword_count"] < 3 else "中"
			        }
			        for cluster in clusters
			        if cluster["keyword_count"] < 5
			    ]

			    return {
			        "coverage_ratio": coverage_ratio,
			        "cluster_distribution": cluster_distribution,
			        "gaps": gaps
			    }

			def get_visualization_data(
			    self,
			    clusters: List[Dict],
			    relationships: List[Dict]
			) -> Dict:
			    """
			    Convert clusters and relationships into node/edge data for visualization.

			    Args:
			        clusters: Topic clusters; each must provide ``id``, ``name``,
			            ``keyword_count`` and ``keywords``.
			        relationships: Cluster relationships; each must provide ``from`` and ``to``.

			    Returns:
			        Dict with a ``nodes`` list (one entry per cluster) and an ``edges`` list
			        (one entry per relationship).
			    """
			    # One node per topic cluster, sized by its keyword count.
			    nodes = []
			    for topic in clusters:
			        nodes.append({
			            "id": topic["id"],
			            "name": topic["name"],
			            "size": topic["keyword_count"],
			            "keywords": topic["keywords"],
			            "description": topic.get("description", "")
			        })

			    # One edge per relationship; strength and type fall back to defaults.
			    edges = []
			    for link in relationships:
			        edges.append({
			            "source": link["from"],
			            "target": link["to"],
			            "strength": link.get("strength", "弱"),
			            "type": link.get("type", "相关")
			        })

			    return {"nodes": nodes, "edges": edges}