Files
ChouJuGEO/modules/technical_config_generator.py
T
刘国栋 8f7f082c3d feat: 重构项目结构并添加平台同步基础架构
- 重构项目目录结构,将功能模块移至 modules/ 目录
- 创建平台同步基础架构,包括发布器基类和 GitHub 发布器
- 新增 UI 状态管理模块 (modules/ui/state.py) 统一管理会话状态
- 更新依赖配置,添加平台同步所需依赖 (httpx, pyperclip)
- 整理文档结构,将所有文档分类移至 docs/ 目录
- 添加 .cursorrules 文件定义项目开发规范
- 清理根目录重复文件,保持项目结构整洁
2026-01-30 10:21:29 +08:00

360 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
技术配置生成模块
生成 robots.txt、sitemap.xml 等技术配置文件,提升内容收录效果
"""
import html
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
class TechnicalConfigGenerator:
    """Render technical SEO configuration files as strings.

    Produces robots.txt, sitemap.xml, .htaccess redirect rules and HTML
    meta tags. The generator holds no state and performs no I/O: every
    public method returns the rendered text and leaves writing it to the
    caller.
    """

    # Paths disallowed in robots.txt when the caller supplies none.
    _DEFAULT_DISALLOW = ("/admin", "/private", "/api", "/_next", "/static")
    # Slugs keep only word characters (Unicode-aware) and hyphens.
    _SLUG_STRIP_RE = re.compile(r"[^\w\-]")
    _SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"
    # Emitted by hand: ElementTree would otherwise write the *locale's*
    # preferred encoding into the declaration when encoding="unicode".
    _XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>\n'

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @classmethod
    def _slugify(cls, text: str, keep_underscores: bool = False) -> str:
        """Lower-case *text*, hyphenate spaces and drop non-word characters.

        Underscores become hyphens unless ``keep_underscores`` is true.
        """
        slug = text.lower().replace(" ", "-")
        if not keep_underscores:
            slug = slug.replace("_", "-")
        return cls._SLUG_STRIP_RE.sub("", slug)

    @staticmethod
    def _absolute_url(base_url: str, path: str) -> str:
        """Resolve *path* against *base_url*, treating the base as a directory."""
        return urljoin(base_url.rstrip("/") + "/", path.lstrip("/"))

    @staticmethod
    def _normalize_date(value: Any) -> Optional[str]:
        """Return *value* as ``YYYY-MM-DD``, or ``None`` when it cannot be parsed.

        Accepts ``datetime`` objects, ISO-8601 strings (with a ``T`` or
        space separator and an optional trailing ``Z``) and plain
        ``%Y-%m-%d`` dates.
        """
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d")
        if not isinstance(value, str) or not value.strip():
            return None
        text = value.strip()
        try:
            # Older interpreters reject a literal "Z" suffix in fromisoformat.
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            try:
                # strptime also accepts non-zero-padded dates such as 2025-1-5.
                parsed = datetime.strptime(text, "%Y-%m-%d")
            except ValueError:
                return None
        return parsed.strftime("%Y-%m-%d")

    @staticmethod
    def _append_url(
        root: ET.Element,
        loc: str,
        lastmod: Any,
        changefreq: Any,
        priority: Any,
    ) -> None:
        """Append one ``<url>`` entry with its four child elements to *root*."""
        entry = ET.SubElement(root, "url")
        ET.SubElement(entry, "loc").text = loc
        ET.SubElement(entry, "lastmod").text = lastmod
        ET.SubElement(entry, "changefreq").text = changefreq
        ET.SubElement(entry, "priority").text = str(priority)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def generate_robots_txt(
        self,
        base_url: str = "",
        allow_paths: Optional[List[str]] = None,
        disallow_paths: Optional[List[str]] = None,
        sitemap_url: str = "",
        user_agent: str = "*",
        crawl_delay: Optional[int] = None,
    ) -> str:
        """Build the content of a robots.txt file.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``). Used
                only to derive the sitemap URL when ``sitemap_url`` is empty.
            allow_paths: Paths to emit as ``Allow`` rules.
            disallow_paths: Paths to emit as ``Disallow`` rules. When
                missing or empty, a built-in default list is used.
            sitemap_url: Explicit sitemap URL; takes precedence over the
                one derived from ``base_url``.
            user_agent: Value of the ``User-agent`` line (``*`` = all).
            crawl_delay: Optional ``Crawl-delay`` value in seconds.

        Returns:
            The robots.txt content, newline-joined, without a trailing newline.
        """
        lines = [f"User-agent: {user_agent}"]
        lines.extend(f"Allow: {path}" for path in allow_paths or ())
        lines.extend(
            f"Disallow: {path}"
            for path in disallow_paths or self._DEFAULT_DISALLOW
        )

        if crawl_delay is not None:
            lines.append(f"Crawl-delay: {crawl_delay}")

        if not sitemap_url and base_url:
            sitemap_url = urljoin(base_url.rstrip("/") + "/", "sitemap.xml")
        if sitemap_url:
            lines.append(f"Sitemap: {sitemap_url}")

        return "\n".join(lines)

    def generate_sitemap_xml(
        self,
        base_url: str,
        urls: Optional[List[Dict[str, Any]]] = None,
        keywords: Optional[List[str]] = None,
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8,
    ) -> str:
        """Build the content of a sitemap.xml file.

        Exactly one source of entries is used, in this order of
        precedence: ``urls``, then ``keywords``, then a single entry for
        the site root.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``).
            urls: Entries as dicts with optional keys ``loc`` (absolute
                URL or site-relative path), ``lastmod``, ``changefreq``
                and ``priority``; missing keys fall back to the defaults
                below.
            keywords: Keywords to turn into slug URLs under ``base_url``.
            lastmod: Default last-modified date (``YYYY-MM-DD``); today's
                local date when omitted.
            changefreq: Default change frequency.
            priority: Default priority (the root-only fallback entry
                always uses ``1.0``).

        Returns:
            The sitemap XML, starting with a UTF-8 XML declaration.
        """
        if lastmod is None:
            lastmod = datetime.now().strftime("%Y-%m-%d")

        root = ET.Element("urlset")
        root.set("xmlns", self._SITEMAP_NS)

        if urls:
            for url_data in urls:
                loc = url_data.get("loc", "")
                # Only a real scheme marks an absolute URL; a bare "http"
                # prefix would misclassify paths such as "http-guide".
                if not loc.lower().startswith(("http://", "https://")):
                    loc = self._absolute_url(base_url, loc)
                self._append_url(
                    root,
                    loc,
                    url_data.get("lastmod", lastmod),
                    url_data.get("changefreq", changefreq),
                    url_data.get("priority", priority),
                )
        elif keywords:
            for keyword in keywords:
                loc = self._absolute_url(base_url, self._slugify(keyword))
                self._append_url(root, loc, lastmod, changefreq, priority)
        else:
            # Nothing supplied: list at least the home page.
            self._append_url(
                root, base_url.rstrip("/") + "/", lastmod, changefreq, "1.0"
            )

        ET.indent(root, space=" ")
        return self._XML_DECLARATION + ET.tostring(root, encoding="unicode")

    def generate_sitemap_from_articles(
        self,
        base_url: str,
        articles: List[Dict[str, str]],
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8,
    ) -> str:
        """Build a sitemap.xml from a list of article records.

        Each article becomes ``<platform-slug>/<keyword-slug>`` (or just
        the keyword slug when no platform is given) under ``base_url``.

        Args:
            base_url: Site base URL.
            articles: Dicts with keys ``keyword``, ``platform`` (optional)
                and ``created_at`` (optional). ``content`` is ignored.
                A parseable ``created_at`` becomes the entry's lastmod.
            lastmod: Default last-modified date for articles without a
                parseable ``created_at``; today's local date when omitted.
            changefreq: Change frequency applied to every entry.
            priority: Priority applied to every entry.

        Returns:
            The sitemap XML as produced by :meth:`generate_sitemap_xml`.
        """
        urls = []
        for article in articles:
            url_path = self._slugify(article.get("keyword") or "")

            platform = article.get("platform") or ""
            if platform:
                # Platform slugs historically keep underscores.
                platform_slug = self._slugify(platform, keep_underscores=True)
                url_path = f"{platform_slug}/{url_path}"

            article_lastmod = (
                self._normalize_date(article.get("created_at"))
                or lastmod
                or datetime.now().strftime("%Y-%m-%d")
            )
            urls.append({
                "loc": url_path,
                "lastmod": article_lastmod,
                "changefreq": changefreq,
                "priority": priority,
            })

        return self.generate_sitemap_xml(
            base_url=base_url,
            urls=urls,
            lastmod=lastmod,
            changefreq=changefreq,
            priority=priority,
        )

    def generate_htaccess_redirects(
        self,
        redirects: List[Dict[str, str]],
    ) -> str:
        """Build Apache ``Redirect`` rules for a .htaccess file.

        Args:
            redirects: Dicts with keys ``from`` (source path), ``to``
                (target path) and optional ``type`` (status code,
                default ``"301"``). Entries missing ``from`` or ``to``
                are skipped.

        Returns:
            The .htaccess content: a two-line comment header, a blank
            line, then one ``Redirect`` line per valid entry.
        """
        lines = [
            "# .htaccess 重定向规则",
            "# 由 GEO 工具自动生成",
            "",
        ]
        for redirect in redirects:
            from_path = redirect.get("from", "")
            to_path = redirect.get("to", "")
            if from_path and to_path:
                redirect_type = redirect.get("type", "301")
                lines.append(f"Redirect {redirect_type} {from_path} {to_path}")
        return "\n".join(lines)

    def generate_meta_tags(
        self,
        title: str,
        description: str,
        keywords: Optional[List[str]] = None,
        og_type: str = "website",
        og_image: str = "",
        canonical_url: str = "",
    ) -> str:
        """Build the HTML ``<head>`` meta tags for a page.

        All values are HTML-escaped, so pass raw (unescaped) text.

        Args:
            title: Page title (also used for ``og:title``).
            description: Page description (also used for ``og:description``).
            keywords: Keywords, joined with ``", "`` into one meta tag.
            og_type: Open Graph type (e.g. ``website``, ``article``).
            og_image: Open Graph image URL; tag omitted when empty.
            canonical_url: Canonical URL; tag omitted when empty.

        Returns:
            The tags, one per line.
        """
        # Escape once up front so quotes/ampersands cannot break the markup.
        safe_title = html.escape(title, quote=True)
        safe_description = html.escape(description, quote=True)

        tags = [
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            f"<title>{safe_title}</title>",
            f'<meta name="description" content="{safe_description}">',
        ]

        if keywords:
            keywords_str = html.escape(", ".join(keywords), quote=True)
            tags.append(f'<meta name="keywords" content="{keywords_str}">')

        tags.append(
            f'<meta property="og:type" content="{html.escape(og_type, quote=True)}">'
        )
        tags.append(f'<meta property="og:title" content="{safe_title}">')
        tags.append(
            f'<meta property="og:description" content="{safe_description}">'
        )
        if og_image:
            tags.append(
                f'<meta property="og:image" content="{html.escape(og_image, quote=True)}">'
            )

        if canonical_url:
            tags.append(
                f'<link rel="canonical" href="{html.escape(canonical_url, quote=True)}">'
            )

        return "\n".join(tags)

    def validate_url(self, url: str) -> bool:
        """Return whether *url* has both a scheme and a network location.

        Args:
            url: URL string to check.

        Returns:
            ``True`` when ``urlparse`` finds a non-empty scheme and netloc;
            ``False`` otherwise, including when parsing raises.
        """
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed netloc (e.g. unbalanced IPv6 bracket);
            # TypeError/AttributeError: non-string input.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def sanitize_url_path(self, path: str) -> str:
        """Normalize *path* into a lower-case, hyphenated URL path.

        Spaces become hyphens, characters other than word characters,
        hyphens and slashes are removed, runs of hyphens collapse to one,
        and leading/trailing hyphens are stripped.

        Args:
            path: Raw path text.

        Returns:
            The cleaned path.
        """
        cleaned = path.lower().replace(" ", "-")
        cleaned = re.sub(r"[^\w\-/]", "", cleaned)
        cleaned = re.sub(r"-+", "-", cleaned)
        return cleaned.strip("-")