fb309299bf
Made-with: Cursor
356 lines
12 KiB
Python
356 lines
12 KiB
Python
"""
|
||
技术配置生成模块
|
||
生成 robots.txt、sitemap.xml 等技术配置文件,提升内容收录效果
|
||
"""
|
||
from typing import List, Dict, Optional
|
||
from datetime import datetime
|
||
from urllib.parse import urljoin, urlparse
|
||
import xml.etree.ElementTree as ET
|
||
|
||
|
||
class TechnicalConfigGenerator:
|
||
"""技术配置文件生成器"""
|
||
|
||
def __init__(self):
|
||
pass
|
||
|
||
def generate_robots_txt(
|
||
self,
|
||
base_url: str = "",
|
||
allow_paths: List[str] = None,
|
||
disallow_paths: List[str] = None,
|
||
sitemap_url: str = "",
|
||
user_agent: str = "*",
|
||
crawl_delay: Optional[int] = None
|
||
) -> str:
|
||
"""
|
||
生成 robots.txt 文件
|
||
|
||
Args:
|
||
base_url: 网站基础 URL(如 https://example.com)
|
||
allow_paths: 允许爬取的路径列表(如 ["/", "/blog", "/docs"])
|
||
disallow_paths: 禁止爬取的路径列表(如 ["/admin", "/private"])
|
||
sitemap_url: sitemap.xml 的 URL
|
||
user_agent: User-Agent(默认 "*" 表示所有爬虫)
|
||
crawl_delay: 爬取延迟(秒,可选)
|
||
|
||
Returns:
|
||
robots.txt 文件内容
|
||
"""
|
||
lines = []
|
||
|
||
# User-Agent 规则
|
||
lines.append(f"User-agent: {user_agent}")
|
||
|
||
# 允许路径
|
||
if allow_paths:
|
||
for path in allow_paths:
|
||
lines.append(f"Allow: {path}")
|
||
|
||
# 禁止路径
|
||
if disallow_paths:
|
||
for path in disallow_paths:
|
||
lines.append(f"Disallow: {path}")
|
||
else:
|
||
# 默认禁止路径(如果未指定)
|
||
default_disallow = [
|
||
"/admin",
|
||
"/private",
|
||
"/api",
|
||
"/_next",
|
||
"/static",
|
||
]
|
||
for path in default_disallow:
|
||
lines.append(f"Disallow: {path}")
|
||
|
||
# 爬取延迟
|
||
if crawl_delay is not None:
|
||
lines.append(f"Crawl-delay: {crawl_delay}")
|
||
|
||
# Sitemap
|
||
if sitemap_url:
|
||
lines.append(f"Sitemap: {sitemap_url}")
|
||
elif base_url:
|
||
# 自动生成 sitemap URL
|
||
sitemap_url = urljoin(base_url.rstrip('/') + '/', 'sitemap.xml')
|
||
lines.append(f"Sitemap: {sitemap_url}")
|
||
|
||
return "\n".join(lines)
|
||
|
||
def generate_sitemap_xml(
|
||
self,
|
||
base_url: str,
|
||
urls: List[Dict[str, any]] = None,
|
||
keywords: List[str] = None,
|
||
lastmod: Optional[str] = None,
|
||
changefreq: str = "weekly",
|
||
priority: float = 0.8
|
||
) -> str:
|
||
"""
|
||
生成 sitemap.xml 文件
|
||
|
||
Args:
|
||
base_url: 网站基础 URL(如 https://example.com)
|
||
urls: URL 列表,每个元素包含:
|
||
- loc: URL 路径(如 "/blog/post-1")
|
||
- lastmod: 最后修改时间(ISO 格式,如 "2025-01-26")
|
||
- changefreq: 更新频率(如 "daily", "weekly", "monthly")
|
||
- priority: 优先级(0.0-1.0)
|
||
keywords: 关键词列表(如果提供,会基于关键词生成 URL)
|
||
lastmod: 默认最后修改时间(ISO 格式)
|
||
changefreq: 默认更新频率
|
||
priority: 默认优先级
|
||
|
||
Returns:
|
||
sitemap.xml 文件内容
|
||
"""
|
||
# 如果没有提供 lastmod,使用当前日期
|
||
if lastmod is None:
|
||
lastmod = datetime.now().strftime("%Y-%m-%d")
|
||
|
||
# 创建 XML 根元素
|
||
root = ET.Element("urlset")
|
||
root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
|
||
|
||
# 如果提供了 URLs,使用它们
|
||
if urls:
|
||
for url_data in urls:
|
||
url_elem = ET.SubElement(root, "url")
|
||
|
||
# loc(URL)
|
||
loc = url_data.get("loc", "")
|
||
if not loc.startswith("http"):
|
||
loc = urljoin(base_url.rstrip('/') + '/', loc.lstrip('/'))
|
||
ET.SubElement(url_elem, "loc").text = loc
|
||
|
||
# lastmod
|
||
url_lastmod = url_data.get("lastmod", lastmod)
|
||
ET.SubElement(url_elem, "lastmod").text = url_lastmod
|
||
|
||
# changefreq
|
||
url_changefreq = url_data.get("changefreq", changefreq)
|
||
ET.SubElement(url_elem, "changefreq").text = url_changefreq
|
||
|
||
# priority
|
||
url_priority = url_data.get("priority", priority)
|
||
ET.SubElement(url_elem, "priority").text = str(url_priority)
|
||
|
||
# 如果提供了关键词,基于关键词生成 URL
|
||
elif keywords:
|
||
for keyword in keywords:
|
||
url_elem = ET.SubElement(root, "url")
|
||
|
||
# 生成 URL(基于关键词)
|
||
# 将关键词转换为 URL 友好的格式
|
||
url_path = keyword.lower().replace(" ", "-").replace("_", "-")
|
||
# 移除特殊字符
|
||
import re
|
||
url_path = re.sub(r'[^\w\-]', '', url_path)
|
||
full_url = urljoin(base_url.rstrip('/') + '/', url_path)
|
||
|
||
ET.SubElement(url_elem, "loc").text = full_url
|
||
ET.SubElement(url_elem, "lastmod").text = lastmod
|
||
ET.SubElement(url_elem, "changefreq").text = changefreq
|
||
ET.SubElement(url_elem, "priority").text = str(priority)
|
||
|
||
# 如果没有提供 URLs 或关键词,至少添加首页
|
||
else:
|
||
url_elem = ET.SubElement(root, "url")
|
||
ET.SubElement(url_elem, "loc").text = base_url.rstrip('/') + '/'
|
||
ET.SubElement(url_elem, "lastmod").text = lastmod
|
||
ET.SubElement(url_elem, "changefreq").text = changefreq
|
||
ET.SubElement(url_elem, "priority").text = "1.0"
|
||
|
||
# 格式化 XML
|
||
ET.indent(root, space=" ")
|
||
xml_str = ET.tostring(root, encoding="unicode", xml_declaration=True)
|
||
|
||
return xml_str
|
||
|
||
def generate_sitemap_from_articles(
|
||
self,
|
||
base_url: str,
|
||
articles: List[Dict[str, str]],
|
||
lastmod: Optional[str] = None,
|
||
changefreq: str = "weekly",
|
||
priority: float = 0.8
|
||
) -> str:
|
||
"""
|
||
基于文章列表生成 sitemap.xml
|
||
|
||
Args:
|
||
base_url: 网站基础 URL
|
||
articles: 文章列表,每个元素包含:
|
||
- keyword: 关键词
|
||
- platform: 平台
|
||
- content: 内容(可选)
|
||
- created_at: 创建时间(可选)
|
||
lastmod: 默认最后修改时间
|
||
changefreq: 默认更新频率
|
||
priority: 默认优先级
|
||
|
||
Returns:
|
||
sitemap.xml 文件内容
|
||
"""
|
||
urls = []
|
||
|
||
for article in articles:
|
||
keyword = article.get("keyword", "")
|
||
platform = article.get("platform", "")
|
||
created_at = article.get("created_at", "")
|
||
|
||
# 生成 URL 路径
|
||
# 将关键词转换为 URL 友好的格式
|
||
url_path = keyword.lower().replace(" ", "-").replace("_", "-")
|
||
import re
|
||
url_path = re.sub(r'[^\w\-]', '', url_path)
|
||
|
||
# 如果有平台信息,可以添加到路径中
|
||
if platform:
|
||
platform_slug = platform.lower().replace(" ", "-").replace("(", "").replace(")", "")
|
||
platform_slug = re.sub(r'[^\w\-]', '', platform_slug)
|
||
url_path = f"{platform_slug}/{url_path}"
|
||
|
||
# 使用创建时间作为 lastmod
|
||
article_lastmod = lastmod
|
||
if created_at:
|
||
try:
|
||
dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
|
||
except Exception:
|
||
dt = datetime.strptime(created_at, "%Y-%m-%d")
|
||
article_lastmod = dt.strftime("%Y-%m-%d")
|
||
|
||
urls.append({
|
||
"loc": url_path,
|
||
"lastmod": article_lastmod or lastmod or datetime.now().strftime("%Y-%m-%d"),
|
||
"changefreq": changefreq,
|
||
"priority": priority
|
||
})
|
||
|
||
return self.generate_sitemap_xml(
|
||
base_url=base_url,
|
||
urls=urls,
|
||
lastmod=lastmod,
|
||
changefreq=changefreq,
|
||
priority=priority
|
||
)
|
||
|
||
def generate_htaccess_redirects(
|
||
self,
|
||
redirects: List[Dict[str, str]]
|
||
) -> str:
|
||
"""
|
||
生成 .htaccess 重定向规则
|
||
|
||
Args:
|
||
redirects: 重定向列表,每个元素包含:
|
||
- from: 源路径
|
||
- to: 目标路径
|
||
- type: 重定向类型(301 永久重定向,302 临时重定向)
|
||
|
||
Returns:
|
||
.htaccess 文件内容
|
||
"""
|
||
lines = []
|
||
lines.append("# .htaccess 重定向规则")
|
||
lines.append("# 由 GEO 工具自动生成")
|
||
lines.append("")
|
||
|
||
for redirect in redirects:
|
||
from_path = redirect.get("from", "")
|
||
to_path = redirect.get("to", "")
|
||
redirect_type = redirect.get("type", "301")
|
||
|
||
if from_path and to_path:
|
||
lines.append(f"Redirect {redirect_type} {from_path} {to_path}")
|
||
|
||
return "\n".join(lines)
|
||
|
||
def generate_meta_tags(
|
||
self,
|
||
title: str,
|
||
description: str,
|
||
keywords: List[str] = None,
|
||
og_type: str = "website",
|
||
og_image: str = "",
|
||
canonical_url: str = ""
|
||
) -> str:
|
||
"""
|
||
生成 HTML meta 标签
|
||
|
||
Args:
|
||
title: 页面标题
|
||
description: 页面描述
|
||
keywords: 关键词列表
|
||
og_type: Open Graph 类型(如 "website", "article")
|
||
og_image: Open Graph 图片 URL
|
||
canonical_url: 规范 URL
|
||
|
||
Returns:
|
||
HTML meta 标签字符串
|
||
"""
|
||
tags = []
|
||
|
||
# 基础 meta 标签
|
||
tags.append(f'<meta charset="UTF-8">')
|
||
tags.append(f'<meta name="viewport" content="width=device-width, initial-scale=1.0">')
|
||
tags.append(f'<title>{title}</title>')
|
||
tags.append(f'<meta name="description" content="{description}">')
|
||
|
||
# 关键词
|
||
if keywords:
|
||
keywords_str = ", ".join(keywords)
|
||
tags.append(f'<meta name="keywords" content="{keywords_str}">')
|
||
|
||
# Open Graph 标签
|
||
tags.append(f'<meta property="og:type" content="{og_type}">')
|
||
tags.append(f'<meta property="og:title" content="{title}">')
|
||
tags.append(f'<meta property="og:description" content="{description}">')
|
||
if og_image:
|
||
tags.append(f'<meta property="og:image" content="{og_image}">')
|
||
|
||
# Canonical URL
|
||
if canonical_url:
|
||
tags.append(f'<link rel="canonical" href="{canonical_url}">')
|
||
|
||
return "\n".join(tags)
|
||
|
||
def validate_url(self, url: str) -> bool:
|
||
"""
|
||
验证 URL 格式
|
||
|
||
Args:
|
||
url: URL 字符串
|
||
|
||
Returns:
|
||
是否为有效 URL
|
||
"""
|
||
try:
|
||
result = urlparse(url)
|
||
return all([result.scheme, result.netloc])
|
||
except Exception:
|
||
return False
|
||
|
||
def sanitize_url_path(self, path: str) -> str:
|
||
"""
|
||
清理 URL 路径,使其符合 URL 规范
|
||
|
||
Args:
|
||
path: 原始路径
|
||
|
||
Returns:
|
||
清理后的路径
|
||
"""
|
||
import re
|
||
# 转换为小写
|
||
path = path.lower()
|
||
# 替换空格为连字符
|
||
path = path.replace(" ", "-")
|
||
# 移除特殊字符
|
||
path = re.sub(r'[^\w\-/]', '', path)
|
||
# 移除多余的连字符
|
||
path = re.sub(r'-+', '-', path)
|
||
# 移除开头和结尾的连字符
|
||
path = path.strip('-')
|
||
return path
|