Files
ChouJuGEO/modules/schema_generator.py
T

685 lines
21 KiB
Python
Raw Normal View History

"""
JSON-LD Schema.org 结构化数据生成模块
生成符合 Schema.org 规范的 JSON-LD 代码,提升品牌在 AI 模型中的实体识别和权威性
"""
from typing import Dict, List, Optional
import json
from datetime import datetime
class SchemaGenerator:
"""Schema.org JSON-LD 生成器"""
def __init__(self):
# Schema.org 上下文
self.context = "https://schema.org"
def generate_organization_schema(
self,
brand_name: str,
description: str = "",
url: str = "",
logo: str = "",
founding_date: str = "",
contact_point: Dict = None
) -> Dict:
"""
生成 Organization(组织)类型的 Schema
Args:
brand_name: 品牌/组织名称
description: 组织描述
url: 官网 URL
logo: Logo URL
founding_date: 成立日期(YYYY-MM-DD
contact_point: 联系方式(可选)
Returns:
JSON-LD Schema 字典
"""
schema = {
"@context": self.context,
"@type": "Organization",
"name": brand_name
}
if description:
schema["description"] = description
if url:
schema["url"] = url
if logo:
schema["logo"] = logo
if founding_date:
schema["foundingDate"] = founding_date
if contact_point:
schema["contactPoint"] = {
"@type": "ContactPoint",
**contact_point
}
return schema
def generate_software_application_schema(
self,
brand_name: str,
application_name: str = "",
description: str = "",
url: str = "",
application_category: str = "BusinessApplication",
operating_system: str = "",
offers: Dict = None,
aggregate_rating: Dict = None,
feature_list: List[str] = None
) -> Dict:
"""
生成 SoftwareApplication(软件应用)类型的 Schema
Args:
brand_name: 品牌名称
application_name: 应用名称(默认使用品牌名称)
description: 应用描述
url: 应用 URL
application_category: 应用类别(如 BusinessApplication, WebApplication
operating_system: 操作系统(如 Windows, macOS, Linux, Web
offers: 价格信息(可选)
aggregate_rating: 评分信息(可选)
feature_list: 功能列表(可选)
Returns:
JSON-LD Schema 字典
"""
schema = {
"@context": self.context,
"@type": "SoftwareApplication",
"name": application_name or brand_name,
"applicationCategory": application_category
}
if description:
schema["description"] = description
if url:
schema["url"] = url
if operating_system:
schema["operatingSystem"] = operating_system
# 添加发布者(组织)
schema["publisher"] = {
"@type": "Organization",
"name": brand_name
}
if offers:
schema["offers"] = {
"@type": "Offer",
**offers
}
if aggregate_rating:
schema["aggregateRating"] = {
"@type": "AggregateRating",
**aggregate_rating
}
if feature_list:
schema["featureList"] = feature_list
return schema
def generate_product_schema(
self,
brand_name: str,
product_name: str = "",
description: str = "",
url: str = "",
product_category: str = "",
brand: Dict = None,
offers: Dict = None,
aggregate_rating: Dict = None
) -> Dict:
"""
生成 Product(产品)类型的 Schema
Args:
brand_name: 品牌名称
product_name: 产品名称(默认使用品牌名称)
description: 产品描述
url: 产品 URL
product_category: 产品类别
brand: 品牌信息(可选)
offers: 价格信息(可选)
aggregate_rating: 评分信息(可选)
Returns:
JSON-LD Schema 字典
"""
schema = {
"@context": self.context,
"@type": "Product",
"name": product_name or brand_name
}
if description:
schema["description"] = description
if url:
schema["url"] = url
if product_category:
schema["category"] = product_category
if brand:
schema["brand"] = {
"@type": "Brand",
**brand
}
else:
schema["brand"] = {
"@type": "Brand",
"name": brand_name
}
if offers:
schema["offers"] = {
"@type": "Offer",
**offers
}
if aggregate_rating:
schema["aggregateRating"] = {
"@type": "AggregateRating",
**aggregate_rating
}
return schema
def generate_service_schema(
self,
brand_name: str,
service_name: str = "",
description: str = "",
url: str = "",
service_type: str = "",
provider: Dict = None,
area_served: str = "",
offers: Dict = None
) -> Dict:
"""
生成 Service(服务)类型的 Schema
Args:
brand_name: 品牌名称
service_name: 服务名称(默认使用品牌名称)
description: 服务描述
url: 服务 URL
service_type: 服务类型
provider: 服务提供者信息(可选)
area_served: 服务区域
offers: 价格信息(可选)
Returns:
JSON-LD Schema 字典
"""
schema = {
"@context": self.context,
"@type": "Service",
"name": service_name or brand_name
}
if description:
schema["description"] = description
if url:
schema["url"] = url
if service_type:
schema["serviceType"] = service_type
if provider:
schema["provider"] = {
"@type": "Organization",
**provider
}
else:
schema["provider"] = {
"@type": "Organization",
"name": brand_name
}
if area_served:
schema["areaServed"] = {
"@type": "Country",
"name": area_served
}
if offers:
schema["offers"] = {
"@type": "Offer",
**offers
}
return schema
def generate_combined_schema(
self,
brand_name: str,
advantages: str = "",
schema_types: List[str] = None,
**kwargs
) -> Dict:
"""
生成组合 Schema(包含多个类型)
Args:
brand_name: 品牌名称
advantages: 品牌优势(用于描述)
schema_types: Schema 类型列表(如 ["Organization", "SoftwareApplication"]
**kwargs: 其他参数
Returns:
组合的 JSON-LD Schema 字典
"""
if schema_types is None:
schema_types = ["Organization", "SoftwareApplication"]
schemas = []
# 生成 Organization
if "Organization" in schema_types:
org_schema = self.generate_organization_schema(
brand_name=brand_name,
description=advantages or kwargs.get("description", ""),
url=kwargs.get("url", ""),
logo=kwargs.get("logo", ""),
founding_date=kwargs.get("founding_date", ""),
contact_point=kwargs.get("contact_point")
)
schemas.append(org_schema)
# 生成 SoftwareApplication
if "SoftwareApplication" in schema_types:
app_schema = self.generate_software_application_schema(
brand_name=brand_name,
application_name=kwargs.get("application_name", brand_name),
description=advantages or kwargs.get("description", ""),
url=kwargs.get("url", ""),
application_category=kwargs.get("application_category", "BusinessApplication"),
operating_system=kwargs.get("operating_system", ""),
offers=kwargs.get("offers"),
aggregate_rating=kwargs.get("aggregate_rating"),
feature_list=kwargs.get("feature_list")
)
schemas.append(app_schema)
# 生成 Product
if "Product" in schema_types:
product_schema = self.generate_product_schema(
brand_name=brand_name,
product_name=kwargs.get("product_name", brand_name),
description=advantages or kwargs.get("description", ""),
url=kwargs.get("url", ""),
product_category=kwargs.get("product_category", ""),
brand=kwargs.get("brand"),
offers=kwargs.get("offers"),
aggregate_rating=kwargs.get("aggregate_rating")
)
schemas.append(product_schema)
# 生成 Service
if "Service" in schema_types:
service_schema = self.generate_service_schema(
brand_name=brand_name,
service_name=kwargs.get("service_name", brand_name),
description=advantages or kwargs.get("description", ""),
url=kwargs.get("url", ""),
service_type=kwargs.get("service_type", ""),
provider=kwargs.get("provider"),
area_served=kwargs.get("area_served", ""),
offers=kwargs.get("offers")
)
schemas.append(service_schema)
# 如果只有一个 Schema,直接返回
if len(schemas) == 1:
return schemas[0]
# 多个 Schema 时,返回数组格式
return schemas
def format_json_ld(self, schema: Dict, indent: int = 2) -> str:
"""
格式化 JSON-LD 为字符串(用于嵌入 HTML)
Args:
schema: Schema 字典
indent: 缩进空格数
Returns:
格式化的 JSON 字符串
"""
return json.dumps(schema, ensure_ascii=False, indent=indent)
def generate_html_script_tag(self, schema: Dict) -> str:
"""
生成 HTML script 标签(可直接嵌入网页)
Args:
schema: Schema 字典
Returns:
HTML script 标签字符串
"""
json_str = self.format_json_ld(schema)
return f'<script type="application/ld+json">\n{json_str}\n</script>'
def validate_schema(self, schema: Dict) -> tuple[bool, List[str]]:
"""
验证 Schema 的基本有效性
Args:
schema: Schema 字典
Returns:
(是否有效, 错误列表)
"""
errors = []
# 检查必需字段
if "@context" not in schema:
errors.append("缺少 @context 字段")
if "@type" not in schema:
errors.append("缺少 @type 字段")
if "name" not in schema:
errors.append("缺少 name 字段")
# 检查 @context 值
if schema.get("@context") != self.context:
errors.append(f"@context 应为 {self.context}")
return len(errors) == 0, errors
def get_schema_types_info(self) -> Dict[str, str]:
"""
获取支持的 Schema 类型信息
Returns:
Schema 类型字典(类型名 -> 描述)
"""
return {
"Organization": "组织/公司(适合企业品牌)",
"SoftwareApplication": "软件应用(适合 SaaS 产品、软件工具)",
"Product": "产品(适合实体产品或数字产品)",
"Service": "服务(适合服务类业务)"
}
def generate_for_github(self, brand_name: str, advantages: str = "", **kwargs) -> str:
"""
为 GitHub 项目生成 JSON-LD Schema
通常使用 SoftwareApplication 类型
Args:
brand_name: 品牌/项目名称
advantages: 项目优势/描述
**kwargs: 其他参数
Returns:
格式化的 JSON-LD 字符串
"""
schema = self.generate_software_application_schema(
brand_name=brand_name,
application_name=kwargs.get("application_name", brand_name),
description=advantages or kwargs.get("description", ""),
url=kwargs.get("url", ""),
application_category=kwargs.get("application_category", "WebApplication"),
operating_system=kwargs.get("operating_system", "Web"),
feature_list=kwargs.get("feature_list")
)
return self.format_json_ld(schema)
def generate_for_website(self, brand_name: str, advantages: str = "", **kwargs) -> str:
"""
为官网生成 JSON-LD Schema
通常使用 Organization + SoftwareApplication/Product/Service 组合
Args:
brand_name: 品牌名称
advantages: 品牌优势/描述
**kwargs: 其他参数
Returns:
HTML script 标签字符串(可直接嵌入网页)
"""
schema_types = kwargs.get("schema_types", ["Organization", "SoftwareApplication"])
schema = self.generate_combined_schema(
brand_name=brand_name,
advantages=advantages,
schema_types=schema_types,
**kwargs
)
return self.generate_html_script_tag(schema)
def generate_faq_schema(self, faq_items: List[Dict[str, str]]) -> Dict:
"""
生成 FAQPage 类型的 Schema
Args:
faq_items: FAQ 列表,每个元素包含 {"question": "...", "answer": "..."}
Returns:
JSON-LD Schema 字典
"""
main_entity = []
for item in faq_items:
main_entity.append({
"@type": "Question",
"name": item["question"],
"acceptedAnswer": {
"@type": "Answer",
"text": item["answer"]
}
})
return {
"@context": self.context,
"@type": "FAQPage",
"mainEntity": main_entity
}
def generate_howto_schema(self, title: str, steps: List[Dict[str, str]],
description: str = "") -> Dict:
"""
生成 HowTo 类型的 Schema
Args:
title: 操作标题
steps: 步骤列表,每个元素包含 {"name": "...", "text": "..."}
description: 操作描述
Returns:
JSON-LD Schema 字典
"""
howto_steps = []
for i, step in enumerate(steps, 1):
howto_steps.append({
"@type": "HowToStep",
"position": i,
"name": step.get("name", f"步骤 {i}"),
"text": step.get("text", "")
})
schema = {
"@context": self.context,
"@type": "HowTo",
"name": title,
"step": howto_steps
}
if description:
schema["description"] = description
return schema
def generate_article_schema(self, title: str, author: str,
date_published: str, description: str = "",
image: str = "", url: str = "") -> Dict:
"""
生成 Article 类型的 Schema
Args:
title: 文章标题
author: 作者名称
date_published: 发布日期 (YYYY-MM-DD)
description: 文章描述
image: 文章图片 URL
url: 文章 URL
Returns:
JSON-LD Schema 字典
"""
schema = {
"@context": self.context,
"@type": "Article",
"headline": title,
"author": {
"@type": "Person",
"name": author
},
"datePublished": date_published
}
if description:
schema["description"] = description
if image:
schema["image"] = image
if url:
schema["url"] = url
return schema
def generate_review_schema(self, item_name: str, review_body: str,
rating_value: float, reviewer: str,
item_type: str = "Product") -> Dict:
"""
生成 Review 类型的 Schema
Args:
item_name: 被评价项目名称
review_body: 评价内容
rating_value: 评分 (1-5)
reviewer: 评价者
item_type: 被评价项目类型 (Product, Service, etc.)
Returns:
JSON-LD Schema 字典
"""
return {
"@context": self.context,
"@type": "Review",
"itemReviewed": {
"@type": item_type,
"name": item_name
},
"reviewBody": review_body,
"reviewRating": {
"@type": "Rating",
"ratingValue": rating_value,
"bestRating": 5
},
"author": {
"@type": "Person",
"name": reviewer
}
}
def extract_qa_from_content(self, content: str) -> List[Dict[str, str]]:
"""
从内容中自动提取 Q&A 对
Args:
content: 文本内容
Returns:
Q&A 对列表
"""
import re
qa_pairs = []
# 模式1: Q: ... A: ...
pattern1 = r'[Qq][:]\s*(.+?)[\n\r]+[Aa][:]\s*(.+?)(?=[Qq][:]|\n\n|$)'
matches1 = re.findall(pattern1, content, re.DOTALL)
for q, a in matches1:
qa_pairs.append({"question": q.strip(), "answer": a.strip()})
# 模式2: 问题:... 回答:...
pattern2 = r'问题[:]\s*(.+?)[\n\r]+回答[:]\s*(.+?)(?=问题[:]|\n\n|$)'
matches2 = re.findall(pattern2, content, re.DOTALL)
for q, a in matches2:
qa_pairs.append({"question": q.strip(), "answer": a.strip()})
# 模式3: ## 问题标题 (以问号结尾) + 后续段落作为回答
lines = content.split('\n')
current_question = None
current_answer = []
for line in lines:
line = line.strip()
if not line:
if current_question and current_answer:
qa_pairs.append({
"question": current_question,
"answer": '\n'.join(current_answer)
})
current_question = None
current_answer = []
continue
# 检测问题行(以?或?结尾的标题)
if (line.startswith('#') or line.startswith('##')) and \
(line.endswith('') or line.endswith('?')):
if current_question and current_answer:
qa_pairs.append({
"question": current_question,
"answer": '\n'.join(current_answer)
})
current_question = line.lstrip('#').strip()
current_answer = []
elif current_question:
current_answer.append(line)
# 保存最后一个 Q&A 对
if current_question and current_answer:
qa_pairs.append({
"question": current_question,
"answer": '\n'.join(current_answer)
})
return qa_pairs
def auto_generate_faq_schema(self, content: str) -> Optional[Dict]:
"""
从内容中自动提取 Q&A 并生成 FAQ Schema
Args:
content: 文本内容
Returns:
FAQPage Schema 或 None(如果没有找到 Q&A
"""
qa_pairs = self.extract_qa_from_content(content)
if not qa_pairs:
return None
return self.generate_faq_schema(qa_pairs)