""" JSON-LD Schema.org 结构化数据生成模块 生成符合 Schema.org 规范的 JSON-LD 代码,提升品牌在 AI 模型中的实体识别和权威性 """ from typing import Dict, List, Optional import json from datetime import datetime class SchemaGenerator: """Schema.org JSON-LD 生成器""" def __init__(self): # Schema.org 上下文 self.context = "https://schema.org" def generate_organization_schema( self, brand_name: str, description: str = "", url: str = "", logo: str = "", founding_date: str = "", contact_point: Dict = None ) -> Dict: """ 生成 Organization(组织)类型的 Schema Args: brand_name: 品牌/组织名称 description: 组织描述 url: 官网 URL logo: Logo URL founding_date: 成立日期(YYYY-MM-DD) contact_point: 联系方式(可选) Returns: JSON-LD Schema 字典 """ schema = { "@context": self.context, "@type": "Organization", "name": brand_name } if description: schema["description"] = description if url: schema["url"] = url if logo: schema["logo"] = logo if founding_date: schema["foundingDate"] = founding_date if contact_point: schema["contactPoint"] = { "@type": "ContactPoint", **contact_point } return schema def generate_software_application_schema( self, brand_name: str, application_name: str = "", description: str = "", url: str = "", application_category: str = "BusinessApplication", operating_system: str = "", offers: Dict = None, aggregate_rating: Dict = None, feature_list: List[str] = None ) -> Dict: """ 生成 SoftwareApplication(软件应用)类型的 Schema Args: brand_name: 品牌名称 application_name: 应用名称(默认使用品牌名称) description: 应用描述 url: 应用 URL application_category: 应用类别(如 BusinessApplication, WebApplication) operating_system: 操作系统(如 Windows, macOS, Linux, Web) offers: 价格信息(可选) aggregate_rating: 评分信息(可选) feature_list: 功能列表(可选) Returns: JSON-LD Schema 字典 """ schema = { "@context": self.context, "@type": "SoftwareApplication", "name": application_name or brand_name, "applicationCategory": application_category } if description: schema["description"] = description if url: schema["url"] = url if operating_system: schema["operatingSystem"] = operating_system # 添加发布者(组织) schema["publisher"] = { "@type": "Organization", "name": brand_name } if offers: schema["offers"] = { "@type": "Offer", **offers } if aggregate_rating: schema["aggregateRating"] = { "@type": "AggregateRating", **aggregate_rating } if feature_list: schema["featureList"] = feature_list return schema def generate_product_schema( self, brand_name: str, product_name: str = "", description: str = "", url: str = "", product_category: str = "", brand: Dict = None, offers: Dict = None, aggregate_rating: Dict = None ) -> Dict: """ 生成 Product(产品)类型的 Schema Args: brand_name: 品牌名称 product_name: 产品名称(默认使用品牌名称) description: 产品描述 url: 产品 URL product_category: 产品类别 brand: 品牌信息(可选) offers: 价格信息(可选) aggregate_rating: 评分信息(可选) Returns: JSON-LD Schema 字典 """ schema = { "@context": self.context, "@type": "Product", "name": product_name or brand_name } if description: schema["description"] = description if url: schema["url"] = url if product_category: schema["category"] = product_category if brand: schema["brand"] = { "@type": "Brand", **brand } else: schema["brand"] = { "@type": "Brand", "name": brand_name } if offers: schema["offers"] = { "@type": "Offer", **offers } if aggregate_rating: schema["aggregateRating"] = { "@type": "AggregateRating", **aggregate_rating } return schema def generate_service_schema( self, brand_name: str, service_name: str = "", description: str = "", url: str = "", service_type: str = "", provider: Dict = None, area_served: str = "", offers: Dict = None ) -> Dict: """ 生成 Service(服务)类型的 Schema Args: brand_name: 品牌名称 service_name: 服务名称(默认使用品牌名称) description: 服务描述 url: 服务 URL service_type: 服务类型 provider: 服务提供者信息(可选) area_served: 服务区域 offers: 价格信息(可选) Returns: JSON-LD Schema 字典 """ schema = { "@context": self.context, "@type": "Service", "name": service_name or brand_name } if description: schema["description"] = description if url: schema["url"] = url if service_type: schema["serviceType"] = service_type if provider: schema["provider"] = { "@type": "Organization", **provider } else: schema["provider"] = { "@type": "Organization", "name": brand_name } if area_served: schema["areaServed"] = { "@type": "Country", "name": area_served } if offers: schema["offers"] = { "@type": "Offer", **offers } return schema def generate_combined_schema( self, brand_name: str, advantages: str = "", schema_types: List[str] = None, **kwargs ) -> Dict: """ 生成组合 Schema(包含多个类型) Args: brand_name: 品牌名称 advantages: 品牌优势(用于描述) schema_types: Schema 类型列表(如 ["Organization", "SoftwareApplication"]) **kwargs: 其他参数 Returns: 组合的 JSON-LD Schema 字典 """ if schema_types is None: schema_types = ["Organization", "SoftwareApplication"] schemas = [] # 生成 Organization if "Organization" in schema_types: org_schema = self.generate_organization_schema( brand_name=brand_name, description=advantages or kwargs.get("description", ""), url=kwargs.get("url", ""), logo=kwargs.get("logo", ""), founding_date=kwargs.get("founding_date", ""), contact_point=kwargs.get("contact_point") ) schemas.append(org_schema) # 生成 SoftwareApplication if "SoftwareApplication" in schema_types: app_schema = self.generate_software_application_schema( brand_name=brand_name, application_name=kwargs.get("application_name", brand_name), description=advantages or kwargs.get("description", ""), url=kwargs.get("url", ""), application_category=kwargs.get("application_category", "BusinessApplication"), operating_system=kwargs.get("operating_system", ""), offers=kwargs.get("offers"), aggregate_rating=kwargs.get("aggregate_rating"), feature_list=kwargs.get("feature_list") ) schemas.append(app_schema) # 生成 Product if "Product" in schema_types: product_schema = self.generate_product_schema( brand_name=brand_name, product_name=kwargs.get("product_name", brand_name), description=advantages or kwargs.get("description", ""), url=kwargs.get("url", ""), product_category=kwargs.get("product_category", ""), brand=kwargs.get("brand"), offers=kwargs.get("offers"), aggregate_rating=kwargs.get("aggregate_rating") ) schemas.append(product_schema) # 生成 Service if "Service" in schema_types: service_schema = self.generate_service_schema( brand_name=brand_name, service_name=kwargs.get("service_name", brand_name), description=advantages or kwargs.get("description", ""), url=kwargs.get("url", ""), service_type=kwargs.get("service_type", ""), provider=kwargs.get("provider"), area_served=kwargs.get("area_served", ""), offers=kwargs.get("offers") ) schemas.append(service_schema) # 如果只有一个 Schema,直接返回 if len(schemas) == 1: return schemas[0] # 多个 Schema 时,返回数组格式 return schemas def format_json_ld(self, schema: Dict, indent: int = 2) -> str: """ 格式化 JSON-LD 为字符串(用于嵌入 HTML) Args: schema: Schema 字典 indent: 缩进空格数 Returns: 格式化的 JSON 字符串 """ return json.dumps(schema, ensure_ascii=False, indent=indent) def generate_html_script_tag(self, schema: Dict) -> str: """ 生成 HTML script 标签(可直接嵌入网页) Args: schema: Schema 字典 Returns: HTML script 标签字符串 """ json_str = self.format_json_ld(schema) return f'' def validate_schema(self, schema: Dict) -> tuple[bool, List[str]]: """ 验证 Schema 的基本有效性 Args: schema: Schema 字典 Returns: (是否有效, 错误列表) """ errors = [] # 检查必需字段 if "@context" not in schema: errors.append("缺少 @context 字段") if "@type" not in schema: errors.append("缺少 @type 字段") if "name" not in schema: errors.append("缺少 name 字段") # 检查 @context 值 if schema.get("@context") != self.context: errors.append(f"@context 应为 {self.context}") return len(errors) == 0, errors def get_schema_types_info(self) -> Dict[str, str]: """ 获取支持的 Schema 类型信息 Returns: Schema 类型字典(类型名 -> 描述) """ return { "Organization": "组织/公司(适合企业品牌)", "SoftwareApplication": "软件应用(适合 SaaS 产品、软件工具)", "Product": "产品(适合实体产品或数字产品)", "Service": "服务(适合服务类业务)" } def generate_for_github(self, brand_name: str, advantages: str = "", **kwargs) -> str: """ 为 GitHub 项目生成 JSON-LD Schema 通常使用 SoftwareApplication 类型 Args: brand_name: 品牌/项目名称 advantages: 项目优势/描述 **kwargs: 其他参数 Returns: 格式化的 JSON-LD 字符串 """ schema = self.generate_software_application_schema( brand_name=brand_name, application_name=kwargs.get("application_name", brand_name), description=advantages or kwargs.get("description", ""), url=kwargs.get("url", ""), application_category=kwargs.get("application_category", "WebApplication"), operating_system=kwargs.get("operating_system", "Web"), feature_list=kwargs.get("feature_list") ) return self.format_json_ld(schema) def generate_for_website(self, brand_name: str, advantages: str = "", **kwargs) -> str: """ 为官网生成 JSON-LD Schema 通常使用 Organization + SoftwareApplication/Product/Service 组合 Args: brand_name: 品牌名称 advantages: 品牌优势/描述 **kwargs: 其他参数 Returns: HTML script 标签字符串(可直接嵌入网页) """ schema_types = kwargs.get("schema_types", ["Organization", "SoftwareApplication"]) schema = self.generate_combined_schema( brand_name=brand_name, advantages=advantages, schema_types=schema_types, **kwargs ) return self.generate_html_script_tag(schema) def generate_faq_schema(self, faq_items: List[Dict[str, str]]) -> Dict: """ 生成 FAQPage 类型的 Schema Args: faq_items: FAQ 列表,每个元素包含 {"question": "...", "answer": "..."} Returns: JSON-LD Schema 字典 """ main_entity = [] for item in faq_items: main_entity.append({ "@type": "Question", "name": item["question"], "acceptedAnswer": { "@type": "Answer", "text": item["answer"] } }) return { "@context": self.context, "@type": "FAQPage", "mainEntity": main_entity } def generate_howto_schema(self, title: str, steps: List[Dict[str, str]], description: str = "") -> Dict: """ 生成 HowTo 类型的 Schema Args: title: 操作标题 steps: 步骤列表,每个元素包含 {"name": "...", "text": "..."} description: 操作描述 Returns: JSON-LD Schema 字典 """ howto_steps = [] for i, step in enumerate(steps, 1): howto_steps.append({ "@type": "HowToStep", "position": i, "name": step.get("name", f"步骤 {i}"), "text": step.get("text", "") }) schema = { "@context": self.context, "@type": "HowTo", "name": title, "step": howto_steps } if description: schema["description"] = description return schema def generate_article_schema(self, title: str, author: str, date_published: str, description: str = "", image: str = "", url: str = "") -> Dict: """ 生成 Article 类型的 Schema Args: title: 文章标题 author: 作者名称 date_published: 发布日期 (YYYY-MM-DD) description: 文章描述 image: 文章图片 URL url: 文章 URL Returns: JSON-LD Schema 字典 """ schema = { "@context": self.context, "@type": "Article", "headline": title, "author": { "@type": "Person", "name": author }, "datePublished": date_published } if description: schema["description"] = description if image: schema["image"] = image if url: schema["url"] = url return schema def generate_review_schema(self, item_name: str, review_body: str, rating_value: float, reviewer: str, item_type: str = "Product") -> Dict: """ 生成 Review 类型的 Schema Args: item_name: 被评价项目名称 review_body: 评价内容 rating_value: 评分 (1-5) reviewer: 评价者 item_type: 被评价项目类型 (Product, Service, etc.) Returns: JSON-LD Schema 字典 """ return { "@context": self.context, "@type": "Review", "itemReviewed": { "@type": item_type, "name": item_name }, "reviewBody": review_body, "reviewRating": { "@type": "Rating", "ratingValue": rating_value, "bestRating": 5 }, "author": { "@type": "Person", "name": reviewer } } def extract_qa_from_content(self, content: str) -> List[Dict[str, str]]: """ 从内容中自动提取 Q&A 对 Args: content: 文本内容 Returns: Q&A 对列表 """ import re qa_pairs = [] # 模式1: Q: ... A: ... pattern1 = r'[Qq][::]\s*(.+?)[\n\r]+[Aa][::]\s*(.+?)(?=[Qq][::]|\n\n|$)' matches1 = re.findall(pattern1, content, re.DOTALL) for q, a in matches1: qa_pairs.append({"question": q.strip(), "answer": a.strip()}) # 模式2: 问题:... 回答:... pattern2 = r'问题[::]\s*(.+?)[\n\r]+回答[::]\s*(.+?)(?=问题[::]|\n\n|$)' matches2 = re.findall(pattern2, content, re.DOTALL) for q, a in matches2: qa_pairs.append({"question": q.strip(), "answer": a.strip()}) # 模式3: ## 问题标题 (以问号结尾) + 后续段落作为回答 lines = content.split('\n') current_question = None current_answer = [] for line in lines: line = line.strip() if not line: if current_question and current_answer: qa_pairs.append({ "question": current_question, "answer": '\n'.join(current_answer) }) current_question = None current_answer = [] continue # 检测问题行(以?或?结尾的标题) if (line.startswith('#') or line.startswith('##')) and \ (line.endswith('?') or line.endswith('?')): if current_question and current_answer: qa_pairs.append({ "question": current_question, "answer": '\n'.join(current_answer) }) current_question = line.lstrip('#').strip() current_answer = [] elif current_question: current_answer.append(line) # 保存最后一个 Q&A 对 if current_question and current_answer: qa_pairs.append({ "question": current_question, "answer": '\n'.join(current_answer) }) return qa_pairs def auto_generate_faq_schema(self, content: str) -> Optional[Dict]: """ 从内容中自动提取 Q&A 并生成 FAQ Schema Args: content: 文本内容 Returns: FAQPage Schema 或 None(如果没有找到 Q&A) """ qa_pairs = self.extract_qa_from_content(content) if not qa_pairs: return None return self.generate_faq_schema(qa_pairs)