feat: 重构项目结构并添加平台同步基础架构

- 重构项目目录结构,将功能模块移至 modules/ 目录
- 创建平台同步基础架构,包括发布器基类和 GitHub 发布器
- 新增 UI 状态管理模块 (modules/ui/state.py) 统一管理会话状态
- 更新依赖配置,添加平台同步所需依赖 (httpx, pyperclip)
- 整理文档结构,将所有文档分类移至 docs/ 目录
- 添加 .cursorrules 文件定义项目开发规范
- 清理根目录重复文件,保持项目结构整洁
This commit is contained in:
刘国栋
2026-01-30 10:21:29 +08:00
parent 77d5ec70f8
commit 8f7f082c3d
102 changed files with 33742 additions and 1526 deletions
+359
View File
@@ -0,0 +1,359 @@
"""
技术配置生成模块
生成 robots.txt、sitemap.xml 等技术配置文件,提升内容收录效果
"""
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from html import escape
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
class TechnicalConfigGenerator:
    """Generate technical SEO configuration artifacts as strings.

    Covers robots.txt, sitemap.xml, .htaccess redirect rules and HTML
    ``<meta>`` tags. The class holds no state; each method builds its
    result purely from its arguments (plus the current date when no
    ``lastmod`` is supplied).
    """

    # Paths blocked in robots.txt when the caller supplies no disallow list.
    _DEFAULT_DISALLOW_PATHS = ("/admin", "/private", "/api", "/_next", "/static")
    # Prefixes that mark a sitemap ``loc`` as already absolute.
    _ABSOLUTE_URL_PREFIXES = ("http://", "https://")
    # Written by hand because ElementTree, when asked for a declaration
    # together with encoding="unicode", declares the *locale's* preferred
    # encoding, which is not necessarily UTF-8.
    _XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>\n'
    _SLUG_INVALID_RE = re.compile(r'[^\w\-]')
    _PATH_INVALID_RE = re.compile(r'[^\w\-/]')
    _HYPHEN_RUN_RE = re.compile(r'-+')

    def __init__(self):
        """Create a generator; there is nothing to configure."""

    @classmethod
    def _slugify(cls, text: str, replace_underscores: bool = True) -> str:
        """Lower-case *text*, hyphenate spaces and drop non-word characters."""
        slug = text.lower().replace(" ", "-")
        if replace_underscores:
            slug = slug.replace("_", "-")
        return cls._SLUG_INVALID_RE.sub('', slug)

    @staticmethod
    def _join_url(base_url: str, path: str) -> str:
        """Join *path* onto *base_url* with exactly one slash between them."""
        return urljoin(base_url.rstrip('/') + '/', path.lstrip('/'))

    @staticmethod
    def _parse_date(value: Any) -> Optional[str]:
        """Return *value* as ``YYYY-MM-DD``, or None when it cannot be parsed."""
        if not value or not isinstance(value, str):
            return None
        text = value.strip()
        if text.endswith("Z"):
            # fromisoformat() only accepts a trailing "Z" on newer Pythons.
            text = text[:-1] + "+00:00"
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            try:
                # Fallback keeps accepting non-zero-padded dates ("2025-1-5").
                parsed = datetime.strptime(text, "%Y-%m-%d")
            except ValueError:
                return None
        return parsed.strftime("%Y-%m-%d")

    @staticmethod
    def _append_url(
        root: ET.Element,
        loc: str,
        lastmod: str,
        changefreq: str,
        priority: Any,
    ) -> None:
        """Append one ``<url>`` entry with its four child elements to *root*."""
        url_elem = ET.SubElement(root, "url")
        ET.SubElement(url_elem, "loc").text = loc
        ET.SubElement(url_elem, "lastmod").text = lastmod
        ET.SubElement(url_elem, "changefreq").text = changefreq
        ET.SubElement(url_elem, "priority").text = str(priority)

    def generate_robots_txt(
        self,
        base_url: str = "",
        allow_paths: Optional[List[str]] = None,
        disallow_paths: Optional[List[str]] = None,
        sitemap_url: str = "",
        user_agent: str = "*",
        crawl_delay: Optional[int] = None,
    ) -> str:
        """Build the content of a robots.txt file.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``). Only used
                to derive the sitemap URL when ``sitemap_url`` is empty.
            allow_paths: Paths to emit as ``Allow`` rules.
            disallow_paths: Paths to emit as ``Disallow`` rules. When None or
                empty, a built-in default set is used instead.
            sitemap_url: Explicit sitemap URL; takes precedence over the one
                derived from ``base_url``.
            user_agent: Value of the ``User-agent`` line (``*`` = all crawlers).
            crawl_delay: Optional ``Crawl-delay`` value in seconds.

        Returns:
            The robots.txt content, lines joined with ``\\n`` and no trailing
            newline.
        """
        lines = [f"User-agent: {user_agent}"]
        lines.extend(f"Allow: {path}" for path in allow_paths or ())

        # NOTE: an empty list falls back to the defaults exactly like None.
        blocked = disallow_paths or self._DEFAULT_DISALLOW_PATHS
        lines.extend(f"Disallow: {path}" for path in blocked)

        if crawl_delay is not None:
            lines.append(f"Crawl-delay: {crawl_delay}")

        if not sitemap_url and base_url:
            sitemap_url = self._join_url(base_url, "sitemap.xml")
        if sitemap_url:
            lines.append(f"Sitemap: {sitemap_url}")

        return "\n".join(lines)

    def generate_sitemap_xml(
        self,
        base_url: str,
        urls: Optional[List[Dict[str, Any]]] = None,
        keywords: Optional[List[str]] = None,
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8,
    ) -> str:
        """Build the content of a sitemap.xml file.

        Entries come from ``urls`` when it is non-empty, otherwise from
        ``keywords`` (one slugified URL per keyword); when neither is given
        the sitemap contains only the home page with priority ``1.0``.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``).
            urls: Entries as dicts with the keys ``loc`` (path or absolute
                http/https URL), and optionally ``lastmod``, ``changefreq``
                and ``priority``; missing or None values use the defaults.
            keywords: Keywords to turn into URL slugs under ``base_url``.
            lastmod: Default last-modified date (``YYYY-MM-DD``); today's
                date when None.
            changefreq: Default change frequency.
            priority: Default priority (0.0-1.0).

        Returns:
            The XML document as a string, starting with a UTF-8 declaration.
        """
        if lastmod is None:
            lastmod = datetime.now().strftime("%Y-%m-%d")

        root = ET.Element("urlset")
        root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

        if urls:
            for url_data in urls:
                loc = url_data.get("loc") or ""
                # Require a real scheme prefix so relative paths that merely
                # begin with "http" (e.g. "http-guide") are still joined.
                if not loc.lower().startswith(self._ABSOLUTE_URL_PREFIXES):
                    loc = self._join_url(base_url, loc)
                entry_priority = url_data.get("priority")
                self._append_url(
                    root,
                    loc,
                    url_data.get("lastmod") or lastmod,
                    url_data.get("changefreq") or changefreq,
                    # Explicit None test: a priority of 0.0 is valid.
                    priority if entry_priority is None else entry_priority,
                )
        elif keywords:
            for keyword in keywords:
                full_url = self._join_url(base_url, self._slugify(keyword))
                self._append_url(root, full_url, lastmod, changefreq, priority)
        else:
            home_url = base_url.rstrip('/') + '/'
            self._append_url(root, home_url, lastmod, changefreq, "1.0")

        ET.indent(root, space=" ")
        return self._XML_DECLARATION + ET.tostring(root, encoding="unicode")

    def generate_sitemap_from_articles(
        self,
        base_url: str,
        articles: List[Dict[str, str]],
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8,
    ) -> str:
        """Build a sitemap.xml from a list of article records.

        Each article becomes ``<platform-slug>/<keyword-slug>`` (or just the
        keyword slug when no platform is given) under ``base_url``.

        Args:
            base_url: Site base URL.
            articles: Dicts with the keys ``keyword`` and optionally
                ``platform`` and ``created_at``. A parseable ``created_at``
                (ISO-8601 date/datetime or ``YYYY-MM-DD``) becomes the
                entry's ``lastmod``; unparseable values are ignored.
            lastmod: Default last-modified date; today's date when None.
            changefreq: Change frequency applied to every entry.
            priority: Priority applied to every entry.

        Returns:
            The XML document as a string (see ``generate_sitemap_xml``).
        """
        today = datetime.now().strftime("%Y-%m-%d")
        urls = []
        for article in articles:
            url_path = self._slugify(article.get("keyword") or "")

            platform = article.get("platform") or ""
            if platform:
                # Platform slugs keep underscores, unlike keyword slugs.
                platform_slug = self._slugify(platform, replace_underscores=False)
                url_path = f"{platform_slug}/{url_path}"

            article_lastmod = self._parse_date(article.get("created_at"))
            urls.append({
                "loc": url_path,
                "lastmod": article_lastmod or lastmod or today,
                "changefreq": changefreq,
                "priority": priority,
            })

        return self.generate_sitemap_xml(
            base_url=base_url,
            urls=urls,
            lastmod=lastmod,
            changefreq=changefreq,
            priority=priority,
        )

    def generate_htaccess_redirects(
        self,
        redirects: List[Dict[str, str]]
    ) -> str:
        """Build .htaccess ``Redirect`` rules.

        Args:
            redirects: Dicts with the keys ``from`` (source path), ``to``
                (target path) and optionally ``type`` (status code, default
                ``"301"``). Entries missing ``from`` or ``to`` are skipped.

        Returns:
            The .htaccess content: two header comment lines, a blank line,
            then one ``Redirect`` directive per valid entry.
        """
        lines = [
            "# .htaccess 重定向规则",
            "# 由 GEO 工具自动生成",
            "",
        ]
        for redirect in redirects:
            from_path = redirect.get("from", "")
            to_path = redirect.get("to", "")
            if from_path and to_path:
                redirect_type = redirect.get("type", "301")
                lines.append(f"Redirect {redirect_type} {from_path} {to_path}")
        return "\n".join(lines)

    def generate_meta_tags(
        self,
        title: str,
        description: str,
        keywords: Optional[List[str]] = None,
        og_type: str = "website",
        og_image: str = "",
        canonical_url: str = ""
    ) -> str:
        """Build HTML head tags (basic meta, Open Graph, canonical link).

        All supplied values are HTML-escaped so that quotes, angle brackets
        and ampersands cannot break out of the generated markup.

        Args:
            title: Page title.
            description: Page description.
            keywords: Keywords, joined with ``", "`` into one meta tag.
            og_type: Open Graph type (e.g. ``website``, ``article``).
            og_image: Open Graph image URL; tag omitted when empty.
            canonical_url: Canonical URL; tag omitted when empty.

        Returns:
            The tags, one per line, joined with ``\\n``.
        """
        safe_title = escape(title, quote=True)
        safe_description = escape(description, quote=True)

        tags = [
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            f'<title>{safe_title}</title>',
            f'<meta name="description" content="{safe_description}">',
        ]

        if keywords:
            keywords_str = escape(", ".join(keywords), quote=True)
            tags.append(f'<meta name="keywords" content="{keywords_str}">')

        tags.append(f'<meta property="og:type" content="{escape(og_type, quote=True)}">')
        tags.append(f'<meta property="og:title" content="{safe_title}">')
        tags.append(f'<meta property="og:description" content="{safe_description}">')
        if og_image:
            tags.append(f'<meta property="og:image" content="{escape(og_image, quote=True)}">')

        if canonical_url:
            tags.append(f'<link rel="canonical" href="{escape(canonical_url, quote=True)}">')

        return "\n".join(tags)

    def validate_url(self, url: str) -> bool:
        """Check that *url* parses with both a scheme and a network location.

        Args:
            url: URL string to check.

        Returns:
            True when ``urlparse`` yields a non-empty scheme and netloc;
            False otherwise, including when parsing raises.
        """
        try:
            result = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # e.g. malformed IPv6 literal, or a non-string argument.
            return False
        return bool(result.scheme and result.netloc)

    def sanitize_url_path(self, path: str) -> str:
        """Normalize a raw path into a URL-friendly form.

        Lower-cases the path, turns spaces into hyphens, removes every
        character that is not a word character, hyphen or slash, collapses
        runs of hyphens and trims hyphens from both ends.

        Args:
            path: Raw path.

        Returns:
            The cleaned path.
        """
        cleaned = path.lower().replace(" ", "-")
        cleaned = self._PATH_INVALID_RE.sub('', cleaned)
        cleaned = self._HYPHEN_RUN_RE.sub('-', cleaned)
        return cleaned.strip('-')