feat: 重构项目结构并添加平台同步基础架构
- 重构项目目录结构,将功能模块移至 modules/ 目录
- 创建平台同步基础架构,包括发布器基类和 GitHub 发布器
- 新增 UI 状态管理模块 (modules/ui/state.py),统一管理会话状态
- 更新依赖配置,添加平台同步所需依赖 (httpx, pyperclip)
- 整理文档结构,将所有文档分类移至 docs/ 目录
- 添加 .cursorrules 文件,定义项目开发规范
- 清理根目录重复文件,保持项目结构整洁
This commit is contained in:
@@ -0,0 +1,359 @@
|
||||
"""
|
||||
技术配置生成模块
|
||||
生成 robots.txt、sitemap.xml 等技术配置文件,提升内容收录效果
|
||||
"""
|
||||
import html
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
|
||||
|
||||
|
||||
class TechnicalConfigGenerator:
    """Generate the text of technical SEO configuration files.

    Covers ``robots.txt``, ``sitemap.xml``, ``.htaccess`` redirect rules and
    HTML ``<head>`` meta tags. Every public method returns a string; nothing
    is written to disk and the instance holds no state.
    """

    # Paths disallowed in robots.txt when the caller supplies none.
    _DEFAULT_DISALLOW = ("/admin", "/private", "/api", "/_next", "/static")

    # Emitted verbatim so the declared encoding is always UTF-8 and never
    # depends on the interpreter version or the process locale.
    _XML_DECLARATION = "<?xml version='1.0' encoding='utf-8'?>\n"

    # Anything that is not a word character or a hyphen is dropped from slugs.
    _NON_SLUG_CHARS = re.compile(r"[^\w\-]")

    def __init__(self):
        """Create a generator (no configuration is required)."""

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @classmethod
    def _slugify(cls, text: str, *, keep_underscores: bool = False) -> str:
        """Lower-case *text*, hyphenate spaces and strip other punctuation.

        Underscores become hyphens unless ``keep_underscores`` is true.
        """
        slug = text.lower().replace(" ", "-")
        if not keep_underscores:
            slug = slug.replace("_", "-")
        return cls._NON_SLUG_CHARS.sub("", slug)

    @staticmethod
    def _append_url(root, loc, lastmod, changefreq, priority) -> None:
        """Append one ``<url>`` entry with its four child elements to *root*."""
        url_elem = ET.SubElement(root, "url")
        ET.SubElement(url_elem, "loc").text = loc
        ET.SubElement(url_elem, "lastmod").text = lastmod
        ET.SubElement(url_elem, "changefreq").text = changefreq
        ET.SubElement(url_elem, "priority").text = str(priority)

    @staticmethod
    def _to_sitemap_date(value) -> Optional[str]:
        """Return *value* formatted as ``YYYY-MM-DD``, or ``None`` if unparseable."""
        if not value:
            return None
        if isinstance(value, datetime):
            return value.strftime("%Y-%m-%d")

        text = str(value).strip()
        try:
            # A trailing "Z" is rewritten because older interpreters do not
            # accept it in fromisoformat().
            parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError:
            try:
                # Fallback keeps accepting non-zero-padded dates ("2025-1-6").
                parsed = datetime.strptime(text, "%Y-%m-%d")
            except ValueError:
                return None
        return parsed.strftime("%Y-%m-%d")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def generate_robots_txt(
        self,
        base_url: str = "",
        allow_paths: Optional[List[str]] = None,
        disallow_paths: Optional[List[str]] = None,
        sitemap_url: str = "",
        user_agent: str = "*",
        crawl_delay: Optional[int] = None
    ) -> str:
        """Build the contents of a ``robots.txt`` file.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``). Only used
                to derive the sitemap URL when ``sitemap_url`` is empty.
            allow_paths: Paths crawlers may fetch (e.g. ``["/", "/blog"]``).
            disallow_paths: Paths crawlers must not fetch. When omitted or
                empty, a built-in default list is used instead.
            sitemap_url: Full URL of ``sitemap.xml``.
            user_agent: User-agent the rules apply to (``"*"`` = all crawlers).
            crawl_delay: Optional crawl delay in seconds.

        Returns:
            The ``robots.txt`` content, lines joined with ``\\n``.
        """
        lines = [f"User-agent: {user_agent}"]
        lines.extend(f"Allow: {path}" for path in (allow_paths or ()))
        lines.extend(
            f"Disallow: {path}"
            for path in (disallow_paths or self._DEFAULT_DISALLOW)
        )

        if crawl_delay is not None:
            lines.append(f"Crawl-delay: {crawl_delay}")

        # An explicit sitemap URL wins; otherwise derive one from the base URL.
        if not sitemap_url and base_url:
            sitemap_url = urljoin(base_url.rstrip("/") + "/", "sitemap.xml")
        if sitemap_url:
            lines.append(f"Sitemap: {sitemap_url}")

        return "\n".join(lines)

    def generate_sitemap_xml(
        self,
        base_url: str,
        urls: Optional[List[Dict[str, Any]]] = None,
        keywords: Optional[List[str]] = None,
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8
    ) -> str:
        """Build the contents of a ``sitemap.xml`` file.

        Entries come from ``urls`` when given, otherwise from ``keywords``;
        when neither is provided the sitemap contains only the home page.

        Args:
            base_url: Site base URL (e.g. ``https://example.com``).
            urls: URL entries; each dict may contain:
                - loc: URL or path (e.g. ``"/blog/post-1"``)
                - lastmod: last modification date (e.g. ``"2025-01-26"``)
                - changefreq: e.g. ``"daily"``, ``"weekly"``, ``"monthly"``
                - priority: 0.0-1.0
                Missing or ``None`` fields fall back to the defaults below.
            keywords: Keywords turned into one URL each (used only when
                ``urls`` is empty).
            lastmod: Default last-modification date; today when ``None``.
            changefreq: Default change frequency.
            priority: Default priority.

        Returns:
            The ``sitemap.xml`` content, including the XML declaration.
        """
        if lastmod is None:
            lastmod = datetime.now().strftime("%Y-%m-%d")

        site_root = base_url.rstrip("/") + "/"
        root = ET.Element("urlset")
        root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

        if urls:
            for url_data in urls:
                loc = url_data.get("loc") or ""
                # Match the full scheme prefix: a relative path that merely
                # begins with "http" (e.g. "http-guide") must still be joined.
                if not loc.startswith(("http://", "https://")):
                    loc = urljoin(site_root, loc.lstrip("/"))

                entry_priority = url_data.get("priority")
                self._append_url(
                    root,
                    loc,
                    url_data.get("lastmod") or lastmod,
                    url_data.get("changefreq") or changefreq,
                    # 0.0 is a valid priority, so test against None explicitly.
                    priority if entry_priority is None else entry_priority,
                )
        elif keywords:
            for keyword in keywords:
                full_url = urljoin(site_root, self._slugify(keyword))
                self._append_url(root, full_url, lastmod, changefreq, priority)
        else:
            # Nothing supplied: list the home page at top priority.
            self._append_url(root, site_root, lastmod, changefreq, "1.0")

        ET.indent(root, space=" ")
        return self._XML_DECLARATION + ET.tostring(root, encoding="unicode")

    def generate_sitemap_from_articles(
        self,
        base_url: str,
        articles: List[Dict[str, str]],
        lastmod: Optional[str] = None,
        changefreq: str = "weekly",
        priority: float = 0.8
    ) -> str:
        """Build a ``sitemap.xml`` from a list of article records.

        Each article becomes ``<platform-slug>/<keyword-slug>`` (or just the
        keyword slug when no platform is given) under ``base_url``.

        Args:
            base_url: Site base URL.
            articles: Article dicts; each may contain:
                - keyword: keyword used for the URL slug
                - platform: platform name, used as a path prefix
                - content: article body (not used here)
                - created_at: creation time; used as ``lastmod`` when it can
                  be parsed as an ISO date/datetime
            lastmod: Default last-modification date.
            changefreq: Change frequency applied to every entry.
            priority: Priority applied to every entry.

        Returns:
            The ``sitemap.xml`` content.
        """
        default_lastmod = lastmod or datetime.now().strftime("%Y-%m-%d")
        urls = []

        for article in articles:
            url_path = self._slugify(article.get("keyword") or "")

            platform = article.get("platform") or ""
            if platform:
                platform_slug = self._slugify(platform, keep_underscores=True)
                url_path = f"{platform_slug}/{url_path}"

            # Unparseable creation times fall back to the default date.
            article_lastmod = self._to_sitemap_date(article.get("created_at"))

            urls.append({
                "loc": url_path,
                "lastmod": article_lastmod or default_lastmod,
                "changefreq": changefreq,
                "priority": priority,
            })

        return self.generate_sitemap_xml(
            base_url=base_url,
            urls=urls,
            lastmod=lastmod,
            changefreq=changefreq,
            priority=priority
        )

    def generate_htaccess_redirects(
        self,
        redirects: List[Dict[str, str]]
    ) -> str:
        """Build ``.htaccess`` redirect rules.

        Args:
            redirects: Redirect dicts; each may contain:
                - from: source path
                - to: target path
                - type: redirect status (``301`` permanent, ``302``
                  temporary); defaults to ``"301"``
                Entries missing either ``from`` or ``to`` are skipped.

        Returns:
            The ``.htaccess`` content: a two-line comment header, a blank
            line, then one ``Redirect`` directive per valid entry.
        """
        lines = ["# .htaccess 重定向规则", "# 由 GEO 工具自动生成", ""]

        for redirect in redirects:
            from_path = redirect.get("from", "")
            to_path = redirect.get("to", "")
            if from_path and to_path:
                redirect_type = redirect.get("type", "301")
                lines.append(f"Redirect {redirect_type} {from_path} {to_path}")

        return "\n".join(lines)

    def generate_meta_tags(
        self,
        title: str,
        description: str,
        keywords: Optional[List[str]] = None,
        og_type: str = "website",
        og_image: str = "",
        canonical_url: str = ""
    ) -> str:
        """Build HTML ``<head>`` meta tags.

        All supplied values are HTML-escaped so that quotes, ``&`` or angle
        brackets in the text cannot break out of the generated markup.

        Args:
            title: Page title.
            description: Page description.
            keywords: Keywords, joined with ``", "``.
            og_type: Open Graph type (e.g. ``"website"``, ``"article"``).
            og_image: Open Graph image URL; omitted when empty.
            canonical_url: Canonical URL; omitted when empty.

        Returns:
            The tags, one per line.
        """
        def attr(value) -> str:
            # Attribute context: quotes must be escaped too.
            return html.escape(str(value), quote=True)

        title_attr = attr(title)
        description_attr = attr(description)

        tags = [
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            # Element content: only &, < and > need escaping.
            f'<title>{html.escape(str(title), quote=False)}</title>',
            f'<meta name="description" content="{description_attr}">',
        ]

        if keywords:
            keywords_str = attr(", ".join(keywords))
            tags.append(f'<meta name="keywords" content="{keywords_str}">')

        tags.append(f'<meta property="og:type" content="{attr(og_type)}">')
        tags.append(f'<meta property="og:title" content="{title_attr}">')
        tags.append(
            f'<meta property="og:description" content="{description_attr}">'
        )
        if og_image:
            tags.append(f'<meta property="og:image" content="{attr(og_image)}">')

        if canonical_url:
            tags.append(f'<link rel="canonical" href="{attr(canonical_url)}">')

        return "\n".join(tags)

    def validate_url(self, url: str) -> bool:
        """Check that *url* has both a scheme and a network location.

        Args:
            url: URL string.

        Returns:
            ``True`` when ``urlparse`` yields a non-empty scheme and netloc;
            ``False`` otherwise, including when the value cannot be parsed.
        """
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # Malformed URLs and non-string inputs count as invalid.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def sanitize_url_path(self, path: str) -> str:
        """Normalise *path* into a URL-friendly form.

        Lower-cases the text, turns spaces into hyphens, removes every
        character that is not a word character, hyphen or slash, collapses
        runs of hyphens and trims hyphens from both ends.

        Args:
            path: Raw path.

        Returns:
            The cleaned path.
        """
        path = path.lower().replace(" ", "-")
        path = re.sub(r"[^\w\-/]", "", path)
        path = re.sub(r"-+", "-", path)
        return path.strip("-")
|
||||
Reference in New Issue
Block a user