Files
ChouJuGEO/modules/schema_generator.py
T
2026-04-30 18:37:46 +08:00

685 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
JSON-LD Schema.org 结构化数据生成模块
生成符合 Schema.org 规范的 JSON-LD 代码,提升品牌在 AI 模型中的实体识别和权威性
"""
import json
import re
from datetime import datetime
from typing import Dict, List, Optional, Union
class SchemaGenerator:
    """Build Schema.org JSON-LD structured data for brands and content.

    Every ``generate_*_schema`` method returns a plain dictionary that can be
    serialised with :meth:`format_json_ld` or wrapped in an HTML ``<script>``
    element with :meth:`generate_html_script_tag`.
    """

    # Fullwidth punctuation is spelled with escapes (U+FF1A fullwidth colon,
    # U+FF1F fullwidth question mark) so that tools which strip or flag
    # "ambiguous" Unicode characters cannot silently break the matching logic.
    _QUESTION_ENDINGS = ('?', '\uff1f')

    # Inline Q&A layouts recognised by extract_qa_from_content, compiled once.
    _QA_PATTERNS = (
        # "Q: ... <newline> A: ..."
        re.compile(
            r'[Qq][:\uff1a]\s*(.+?)[\n\r]+[Aa][:\uff1a]\s*(.+?)'
            r'(?=[Qq][:\uff1a]|\n\n|$)',
            re.DOTALL,
        ),
        # Same layout with the Chinese labels for "question" / "answer".
        re.compile(
            r'问题[:\uff1a]\s*(.+?)[\n\r]+回答[:\uff1a]\s*(.+?)'
            r'(?=问题[:\uff1a]|\n\n|$)',
            re.DOTALL,
        ),
    )

    def __init__(self):
        # JSON-LD "@context" shared by every generated schema.
        self.context = "https://schema.org"

    def generate_organization_schema(
        self,
        brand_name: str,
        description: str = "",
        url: str = "",
        logo: str = "",
        founding_date: str = "",
        contact_point: Optional[Dict] = None
    ) -> Dict:
        """Build an ``Organization`` schema.

        Args:
            brand_name: Brand / organization name (always emitted as ``name``).
            description: Organization description.
            url: Official website URL.
            logo: Logo URL.
            founding_date: Founding date, documented as YYYY-MM-DD.
            contact_point: Optional ``ContactPoint`` properties; merged after
                the ``@type`` key, so its own keys take precedence.

        Returns:
            The JSON-LD schema dictionary. Empty/falsy optional arguments are
            omitted from the result.
        """
        schema = {
            "@context": self.context,
            "@type": "Organization",
            "name": brand_name,
        }
        if description:
            schema["description"] = description
        if url:
            schema["url"] = url
        if logo:
            schema["logo"] = logo
        if founding_date:
            schema["foundingDate"] = founding_date
        if contact_point:
            schema["contactPoint"] = {"@type": "ContactPoint", **contact_point}
        return schema

    def generate_software_application_schema(
        self,
        brand_name: str,
        application_name: str = "",
        description: str = "",
        url: str = "",
        application_category: str = "BusinessApplication",
        operating_system: str = "",
        offers: Optional[Dict] = None,
        aggregate_rating: Optional[Dict] = None,
        feature_list: Optional[List[str]] = None
    ) -> Dict:
        """Build a ``SoftwareApplication`` schema.

        Args:
            brand_name: Brand name; used as the publisher and as the fallback
                application name.
            application_name: Application name (falls back to ``brand_name``).
            description: Application description.
            url: Application URL.
            application_category: Category such as ``BusinessApplication`` or
                ``WebApplication``.
            operating_system: Operating system such as Windows, macOS, Linux
                or Web.
            offers: Optional ``Offer`` properties (pricing).
            aggregate_rating: Optional ``AggregateRating`` properties.
            feature_list: Optional list of feature descriptions.

        Returns:
            The JSON-LD schema dictionary. A ``publisher`` organization named
            after ``brand_name`` is always included.
        """
        schema = {
            "@context": self.context,
            "@type": "SoftwareApplication",
            "name": application_name or brand_name,
            "applicationCategory": application_category,
        }
        if description:
            schema["description"] = description
        if url:
            schema["url"] = url
        if operating_system:
            schema["operatingSystem"] = operating_system

        # The brand is always recorded as the publishing organization.
        schema["publisher"] = {"@type": "Organization", "name": brand_name}

        if offers:
            schema["offers"] = {"@type": "Offer", **offers}
        if aggregate_rating:
            schema["aggregateRating"] = {
                "@type": "AggregateRating",
                **aggregate_rating,
            }
        if feature_list:
            schema["featureList"] = feature_list
        return schema

    def generate_product_schema(
        self,
        brand_name: str,
        product_name: str = "",
        description: str = "",
        url: str = "",
        product_category: str = "",
        brand: Optional[Dict] = None,
        offers: Optional[Dict] = None,
        aggregate_rating: Optional[Dict] = None
    ) -> Dict:
        """Build a ``Product`` schema.

        Args:
            brand_name: Brand name; used as the fallback product name and as
                the fallback ``brand`` entry.
            product_name: Product name (falls back to ``brand_name``).
            description: Product description.
            url: Product URL.
            product_category: Product category.
            brand: Optional ``Brand`` properties; when omitted a brand named
                ``brand_name`` is emitted instead.
            offers: Optional ``Offer`` properties (pricing).
            aggregate_rating: Optional ``AggregateRating`` properties.

        Returns:
            The JSON-LD schema dictionary.
        """
        schema = {
            "@context": self.context,
            "@type": "Product",
            "name": product_name or brand_name,
        }
        if description:
            schema["description"] = description
        if url:
            schema["url"] = url
        if product_category:
            schema["category"] = product_category

        brand_properties = brand if brand else {"name": brand_name}
        schema["brand"] = {"@type": "Brand", **brand_properties}

        if offers:
            schema["offers"] = {"@type": "Offer", **offers}
        if aggregate_rating:
            schema["aggregateRating"] = {
                "@type": "AggregateRating",
                **aggregate_rating,
            }
        return schema

    def generate_service_schema(
        self,
        brand_name: str,
        service_name: str = "",
        description: str = "",
        url: str = "",
        service_type: str = "",
        provider: Optional[Dict] = None,
        area_served: str = "",
        offers: Optional[Dict] = None
    ) -> Dict:
        """Build a ``Service`` schema.

        Args:
            brand_name: Brand name; used as the fallback service name and as
                the fallback provider.
            service_name: Service name (falls back to ``brand_name``).
            description: Service description.
            url: Service URL.
            service_type: Service type.
            provider: Optional provider ``Organization`` properties; when
                omitted an organization named ``brand_name`` is emitted.
            area_served: Served area; emitted as a ``Country`` entity.
            offers: Optional ``Offer`` properties (pricing).

        Returns:
            The JSON-LD schema dictionary.
        """
        schema = {
            "@context": self.context,
            "@type": "Service",
            "name": service_name or brand_name,
        }
        if description:
            schema["description"] = description
        if url:
            schema["url"] = url
        if service_type:
            schema["serviceType"] = service_type

        provider_properties = provider if provider else {"name": brand_name}
        schema["provider"] = {"@type": "Organization", **provider_properties}

        if area_served:
            schema["areaServed"] = {"@type": "Country", "name": area_served}
        if offers:
            schema["offers"] = {"@type": "Offer", **offers}
        return schema

    def generate_combined_schema(
        self,
        brand_name: str,
        advantages: str = "",
        schema_types: Optional[List[str]] = None,
        **kwargs
    ) -> Union[Dict, List[Dict]]:
        """Build one schema per requested type and combine them.

        Args:
            brand_name: Brand name.
            advantages: Brand advantages, used as the description; when empty
                the ``description`` keyword argument is used instead.
            schema_types: Types to generate, any of ``"Organization"``,
                ``"SoftwareApplication"``, ``"Product"`` and ``"Service"``.
                Defaults to ``["Organization", "SoftwareApplication"]``.
                Unrecognised names are ignored.
            **kwargs: Optional values forwarded to the individual generators
                (e.g. ``url``, ``logo``, ``offers``, ``feature_list``).

        Returns:
            A single schema dictionary when exactly one schema was generated,
            otherwise a list of schema dictionaries (possibly empty). Schemas
            are always ordered Organization, SoftwareApplication, Product,
            Service regardless of the order in ``schema_types``.
        """
        if schema_types is None:
            schema_types = ["Organization", "SoftwareApplication"]

        # Values shared by every generated schema.
        description = advantages or kwargs.get("description", "")
        url = kwargs.get("url", "")

        schemas = []

        if "Organization" in schema_types:
            schemas.append(self.generate_organization_schema(
                brand_name=brand_name,
                description=description,
                url=url,
                logo=kwargs.get("logo", ""),
                founding_date=kwargs.get("founding_date", ""),
                contact_point=kwargs.get("contact_point"),
            ))

        if "SoftwareApplication" in schema_types:
            schemas.append(self.generate_software_application_schema(
                brand_name=brand_name,
                application_name=kwargs.get("application_name", brand_name),
                description=description,
                url=url,
                application_category=kwargs.get(
                    "application_category", "BusinessApplication"
                ),
                operating_system=kwargs.get("operating_system", ""),
                offers=kwargs.get("offers"),
                aggregate_rating=kwargs.get("aggregate_rating"),
                feature_list=kwargs.get("feature_list"),
            ))

        if "Product" in schema_types:
            schemas.append(self.generate_product_schema(
                brand_name=brand_name,
                product_name=kwargs.get("product_name", brand_name),
                description=description,
                url=url,
                product_category=kwargs.get("product_category", ""),
                brand=kwargs.get("brand"),
                offers=kwargs.get("offers"),
                aggregate_rating=kwargs.get("aggregate_rating"),
            ))

        if "Service" in schema_types:
            schemas.append(self.generate_service_schema(
                brand_name=brand_name,
                service_name=kwargs.get("service_name", brand_name),
                description=description,
                url=url,
                service_type=kwargs.get("service_type", ""),
                provider=kwargs.get("provider"),
                area_served=kwargs.get("area_served", ""),
                offers=kwargs.get("offers"),
            ))

        # A lone schema is returned bare; several are returned as an array.
        if len(schemas) == 1:
            return schemas[0]
        return schemas

    def format_json_ld(
        self, schema: Union[Dict, List[Dict]], indent: int = 2
    ) -> str:
        """Serialise a schema (or list of schemas) to a JSON string.

        Args:
            schema: Schema dictionary, or a list of them.
            indent: Number of spaces used for indentation.

        Returns:
            The formatted JSON string; non-ASCII characters are kept as-is.
        """
        return json.dumps(schema, ensure_ascii=False, indent=indent)

    def generate_html_script_tag(self, schema: Union[Dict, List[Dict]]) -> str:
        """Wrap a schema in an HTML ``<script type="application/ld+json">`` tag.

        Args:
            schema: Schema dictionary, or a list of them.

        Returns:
            The HTML script element as a string, ready to embed in a page.
        """
        json_str = self.format_json_ld(schema)
        # "<" only ever occurs inside JSON string values. Escaping it keeps the
        # JSON equivalent while preventing a value such as "</script>" or
        # "<!--" from terminating or corrupting the surrounding script element.
        json_str = json_str.replace("<", "\\u003c")
        return f'<script type="application/ld+json">\n{json_str}\n</script>'

    def validate_schema(
        self, schema: Union[Dict, List[Dict]]
    ) -> tuple[bool, List[str]]:
        """Run basic validity checks on a schema.

        Checks that ``@context``, ``@type`` and ``name`` are present and that
        ``@context`` equals this generator's context.

        Args:
            schema: Schema dictionary, or a list of them as returned by
                :meth:`generate_combined_schema`. For a list, every entry is
                checked and its errors are prefixed with ``[index]``.

        Returns:
            A ``(is_valid, errors)`` tuple.
        """
        if isinstance(schema, list):
            errors = []
            if not schema:
                errors.append("Schema 列表为空")
            for index, item in enumerate(schema):
                _, item_errors = self.validate_schema(item)
                errors.extend(f"[{index}] {message}" for message in item_errors)
            return not errors, errors

        if not isinstance(schema, dict):
            return False, ["Schema 必须是字典或字典列表"]

        errors = []
        # Required fields.
        if "@context" not in schema:
            errors.append("缺少 @context 字段")
        if "@type" not in schema:
            errors.append("缺少 @type 字段")
        if "name" not in schema:
            errors.append("缺少 name 字段")
        # The context value must match exactly (a missing context fails here
        # as well, in addition to the "missing field" error above).
        if schema.get("@context") != self.context:
            errors.append(f"@context 应为 {self.context}")
        return not errors, errors

    def get_schema_types_info(self) -> Dict[str, str]:
        """Describe the schema types supported by the combined generator.

        Returns:
            Mapping of schema type name to a human-readable description.
        """
        return {
            "Organization": "组织/公司(适合企业品牌)",
            "SoftwareApplication": "软件应用(适合 SaaS 产品、软件工具)",
            "Product": "产品(适合实体产品或数字产品)",
            "Service": "服务(适合服务类业务)"
        }

    def generate_for_github(
        self, brand_name: str, advantages: str = "", **kwargs
    ) -> str:
        """Generate a ``SoftwareApplication`` JSON-LD string for a GitHub project.

        Args:
            brand_name: Brand / project name.
            advantages: Project advantages, used as the description; when
                empty the ``description`` keyword argument is used instead.
            **kwargs: Optional ``application_name``, ``description``, ``url``,
                ``application_category`` (default ``"WebApplication"``),
                ``operating_system`` (default ``"Web"``) and ``feature_list``.

        Returns:
            The formatted JSON-LD string.
        """
        schema = self.generate_software_application_schema(
            brand_name=brand_name,
            application_name=kwargs.get("application_name", brand_name),
            description=advantages or kwargs.get("description", ""),
            url=kwargs.get("url", ""),
            application_category=kwargs.get(
                "application_category", "WebApplication"
            ),
            operating_system=kwargs.get("operating_system", "Web"),
            feature_list=kwargs.get("feature_list"),
        )
        return self.format_json_ld(schema)

    def generate_for_website(
        self, brand_name: str, advantages: str = "", **kwargs
    ) -> str:
        """Generate an embeddable JSON-LD script tag for an official website.

        Args:
            brand_name: Brand name.
            advantages: Brand advantages, used as the description.
            **kwargs: Optional ``schema_types`` (defaults to
                ``["Organization", "SoftwareApplication"]``) plus any keyword
                accepted by :meth:`generate_combined_schema`.

        Returns:
            An HTML script element string that can be embedded in a page.
        """
        # Pop rather than read: leaving "schema_types" inside kwargs would pass
        # it twice below and raise "got multiple values for keyword argument".
        schema_types = kwargs.pop(
            "schema_types", ["Organization", "SoftwareApplication"]
        )
        schema = self.generate_combined_schema(
            brand_name=brand_name,
            advantages=advantages,
            schema_types=schema_types,
            **kwargs
        )
        return self.generate_html_script_tag(schema)

    def generate_faq_schema(self, faq_items: List[Dict[str, str]]) -> Dict:
        """Build a ``FAQPage`` schema.

        Args:
            faq_items: FAQ entries, each a dict with ``"question"`` and
                ``"answer"`` keys.

        Returns:
            The JSON-LD schema dictionary.

        Raises:
            KeyError: If an entry lacks ``"question"`` or ``"answer"``.
        """
        main_entity = [
            {
                "@type": "Question",
                "name": item["question"],
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": item["answer"],
                },
            }
            for item in faq_items
        ]
        return {
            "@context": self.context,
            "@type": "FAQPage",
            "mainEntity": main_entity,
        }

    def generate_howto_schema(self, title: str, steps: List[Dict[str, str]],
                              description: str = "") -> Dict:
        """Build a ``HowTo`` schema.

        Args:
            title: Title of the procedure.
            steps: Steps in order, each a dict with optional ``"name"`` and
                ``"text"`` keys. A missing name defaults to a numbered label
                and a missing text to an empty string.
            description: Description of the procedure.

        Returns:
            The JSON-LD schema dictionary; step positions start at 1.
        """
        howto_steps = [
            {
                "@type": "HowToStep",
                "position": position,
                "name": step.get("name", f"步骤 {position}"),
                "text": step.get("text", ""),
            }
            for position, step in enumerate(steps, 1)
        ]
        schema = {
            "@context": self.context,
            "@type": "HowTo",
            "name": title,
            "step": howto_steps,
        }
        if description:
            schema["description"] = description
        return schema

    def generate_article_schema(self, title: str, author: str,
                                date_published: str, description: str = "",
                                image: str = "", url: str = "") -> Dict:
        """Build an ``Article`` schema.

        Args:
            title: Article title (emitted as ``headline``).
            author: Author name (emitted as a ``Person``).
            date_published: Publication date, documented as YYYY-MM-DD.
            description: Article description.
            image: Article image URL.
            url: Article URL.

        Returns:
            The JSON-LD schema dictionary.
        """
        schema = {
            "@context": self.context,
            "@type": "Article",
            "headline": title,
            "author": {
                "@type": "Person",
                "name": author,
            },
            "datePublished": date_published,
        }
        if description:
            schema["description"] = description
        if image:
            schema["image"] = image
        if url:
            schema["url"] = url
        return schema

    def generate_review_schema(self, item_name: str, review_body: str,
                               rating_value: float, reviewer: str,
                               item_type: str = "Product") -> Dict:
        """Build a ``Review`` schema.

        Args:
            item_name: Name of the reviewed item.
            review_body: Review text.
            rating_value: Rating value; ``bestRating`` is fixed at 5.
            reviewer: Reviewer name (emitted as a ``Person``).
            item_type: Schema type of the reviewed item (Product, Service...).

        Returns:
            The JSON-LD schema dictionary.
        """
        return {
            "@context": self.context,
            "@type": "Review",
            "itemReviewed": {
                "@type": item_type,
                "name": item_name,
            },
            "reviewBody": review_body,
            "reviewRating": {
                "@type": "Rating",
                "ratingValue": rating_value,
                "bestRating": 5,
            },
            "author": {
                "@type": "Person",
                "name": reviewer,
            },
        }

    def _extract_heading_qa(self, content: str) -> List[Dict[str, str]]:
        """Extract Q&A pairs from headings that end with a question mark.

        A line starting with ``#`` and ending with an ASCII or fullwidth
        question mark opens a question; the non-blank lines that follow form
        its answer.
        """
        pairs = []
        question = None
        answer_lines = []

        for raw_line in content.split('\n'):
            line = raw_line.strip()

            if not line:
                # A blank line closes the answer only once some answer text
                # exists, so a blank line directly after the heading does not
                # discard the pending question.
                if question and answer_lines:
                    pairs.append({
                        "question": question,
                        "answer": '\n'.join(answer_lines),
                    })
                    question = None
                    answer_lines = []
                continue

            if line.startswith('#') and line.endswith(self._QUESTION_ENDINGS):
                # A new question heading closes the previous pair, if any.
                if question and answer_lines:
                    pairs.append({
                        "question": question,
                        "answer": '\n'.join(answer_lines),
                    })
                question = line.lstrip('#').strip()
                answer_lines = []
            elif question:
                answer_lines.append(line)

        # Flush the pair still open at end of input.
        if question and answer_lines:
            pairs.append({
                "question": question,
                "answer": '\n'.join(answer_lines),
            })
        return pairs

    def extract_qa_from_content(self, content: str) -> List[Dict[str, str]]:
        """Extract question/answer pairs from free text.

        Three layouts are recognised, and results are returned in this order:

        1. ``Q: ...`` followed on a later line by ``A: ...``.
        2. The same layout using the Chinese "question" / "answer" labels.
        3. Markdown headings ending with a question mark, followed by the
           answer paragraph.

        ASCII and fullwidth colons / question marks are both accepted.

        Args:
            content: Text to scan.

        Returns:
            List of ``{"question": ..., "answer": ...}`` dictionaries; empty
            when nothing matched.
        """
        qa_pairs = []
        for pattern in self._QA_PATTERNS:
            for question, answer in pattern.findall(content):
                qa_pairs.append({
                    "question": question.strip(),
                    "answer": answer.strip(),
                })
        qa_pairs.extend(self._extract_heading_qa(content))
        return qa_pairs

    def auto_generate_faq_schema(self, content: str) -> Optional[Dict]:
        """Extract Q&A pairs from text and build a ``FAQPage`` schema.

        Args:
            content: Text to scan.

        Returns:
            The ``FAQPage`` schema, or ``None`` when no Q&A pair was found.
        """
        qa_pairs = self.extract_qa_from_content(content)
        if not qa_pairs:
            return None
        return self.generate_faq_schema(qa_pairs)