skill-template/sohu-publisher/scripts/main.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
sohu-publisher：搜狐号自动发布。

子命令：
  publish <account_id> <article_id>       发布文章
  logs [--limit N] [--status s] [--account-id a]   查看发布记录
  log-get <log_id>                        查看单条发布记录(JSON)
  health | version
"""

from __future__ import annotations

import argparse
import asyncio
import io
import json
import os
import sqlite3
import subprocess
import sys
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

import requests
from playwright.async_api import async_playwright

# On Windows, re-wrap both standard streams as UTF-8 with replacement of
# unencodable characters (presumably so the Chinese text and emoji printed by
# this script do not fail on a non-UTF-8 console - confirm).
if sys.platform == "win32":
    sys.stdout, sys.stderr = (
        io.TextIOWrapper(_stream.buffer, encoding="utf-8", errors="replace")
        for _stream in (sys.stdout, sys.stderr)
    )

# Identity of this skill; SKILL_SLUG also names its per-user data directory.
SKILL_SLUG = "sohu-publisher"
SKILL_VERSION = "1.2.0"

# Directory layout derived from this file's location:
#   <OPENCLAW_DIR>/<BASE_DIR>/<scripts dir>/<this file>
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPTS_DIR)
OPENCLAW_DIR = os.path.dirname(BASE_DIR)


def _now_unix() -> int:
    """Return the current time as whole seconds since the Unix epoch."""
    seconds = time.time()
    return int(seconds)


def _unix_to_iso(ts: Optional[int]) -> Optional[str]:
    """Format a Unix timestamp as a local-time ISO-8601 string with second precision.

    Returns ``None`` when *ts* is ``None``, or when converting it raises
    ``ValueError``, ``OSError`` or ``OverflowError`` (e.g. non-numeric text or
    an out-of-range timestamp).
    """
    if ts is None:
        return None
    try:
        moment = datetime.fromtimestamp(int(ts))
        iso_text = moment.isoformat(timespec="seconds")
    except (ValueError, OSError, OverflowError):
        iso_text = None
    return iso_text


def get_data_root() -> str:
    """Return the root directory under which skill data is stored.

    ``CLAW_DATA_ROOT`` is consulted first, then ``JIANGCHANG_DATA_ROOT``; the
    chosen value is stripped of surrounding whitespace. When that yields an
    empty string the platform default is used: ``D:\\claw-data`` on Windows,
    ``~/.claw-data`` elsewhere.
    """
    configured = os.getenv("CLAW_DATA_ROOT") or os.getenv("JIANGCHANG_DATA_ROOT") or ""
    configured = configured.strip()
    if not configured:
        if sys.platform == "win32":
            return r"D:\claw-data"
        home = os.path.expanduser("~")
        return os.path.join(home, ".claw-data")
    return configured


def get_user_id() -> str:
    """Return the user id from ``CLAW_USER_ID`` / ``JIANGCHANG_USER_ID``, or ``"_anon"``.

    The first non-empty variable wins; its value is stripped, and an empty
    result falls back to ``"_anon"``.
    """
    raw = os.getenv("CLAW_USER_ID") or os.getenv("JIANGCHANG_USER_ID") or ""
    user_id = raw.strip()
    return user_id if user_id else "_anon"


def get_skills_root() -> str:
    """Return the directory that contains the sibling skill folders.

    Uses ``CLAW_SKILLS_ROOT`` (then ``JIANGCHANG_SKILLS_ROOT``) when it holds a
    non-blank value; otherwise falls back to ``OPENCLAW_DIR``.
    """
    override = (os.getenv("CLAW_SKILLS_ROOT") or os.getenv("JIANGCHANG_SKILLS_ROOT") or "").strip()
    return override if override else OPENCLAW_DIR


def get_skill_data_dir() -> str:
    """Return ``<data root>/<user id>/<skill slug>``, creating the directory if needed."""
    segments = (get_data_root(), get_user_id(), SKILL_SLUG)
    skill_dir = os.path.join(*segments)
    os.makedirs(skill_dir, exist_ok=True)
    return skill_dir


def get_db_path() -> str:
    """Return the path of the SQLite file inside this skill's data directory."""
    data_dir = get_skill_data_dir()
    return os.path.join(data_dir, "sohu-publisher.db")


def get_conn() -> sqlite3.Connection:
    """Open a new SQLite connection to the publish-log database; the caller closes it."""
    db_file = get_db_path()
    return sqlite3.connect(db_file)


def init_db() -> None:
    """Ensure the ``publish_logs`` table exists and has every expected column.

    The table is created when absent. For an existing table, any of
    ``article_title``, ``updated_at`` and ``created_at`` not reported by
    ``PRAGMA table_info`` is added with ``ALTER TABLE``.
    """
    # Checked in this order; each statement runs only when its column is missing.
    column_migrations = (
        ("article_title", "ALTER TABLE publish_logs ADD COLUMN article_title TEXT"),
        ("updated_at", "ALTER TABLE publish_logs ADD COLUMN updated_at INTEGER"),
        ("created_at", "ALTER TABLE publish_logs ADD COLUMN created_at INTEGER"),
    )
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS publish_logs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                account_id TEXT NOT NULL,
                article_id INTEGER NOT NULL,
                article_title TEXT,
                status TEXT NOT NULL,
                error_msg TEXT,
                created_at INTEGER NOT NULL,
                updated_at INTEGER NOT NULL
            )
            """
        )
        cur.execute("PRAGMA table_info(publish_logs)")
        # Index 1 of each table_info row is the column name.
        existing = {info[1] for info in cur.fetchall()}
        for column, statement in column_migrations:
            if column not in existing:
                cur.execute(statement)
        conn.commit()
    finally:
        conn.close()


def save_publish_log(account_id: str, article_id: int, article_title: str, status: str, error_msg: Optional[str] = None) -> int:
    """Insert one row into ``publish_logs`` and return its row id.

    Args:
        account_id: Account the publish attempt used.
        article_id: Article id; coerced with ``int()``.
        article_title: Title stored with the row; falsy values are stored as ``""``.
        status: Outcome label stored verbatim.
        error_msg: Optional failure detail.

    Returns:
        The ``lastrowid`` of the inserted row.
    """
    init_db()
    stamp = _now_unix()  # the same value is written to created_at and updated_at
    conn = get_conn()
    try:
        record = (account_id, int(article_id), article_title or "", status, error_msg, stamp, stamp)
        cursor = conn.cursor()
        cursor.execute(
            """
            INSERT INTO publish_logs (account_id, article_id, article_title, status, error_msg, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            record,
        )
        inserted_id = int(cursor.lastrowid)
        conn.commit()
    finally:
        conn.close()
    return inserted_id


def check_entitlement(skill_slug: str) -> tuple[bool, str]:
    """Ask the configured entitlement service whether the current user may run a skill.

    The check is skipped (allowed) when ``JIANGCHANG_AUTH_BASE_URL`` is unset.
    Otherwise a JSON POST is sent to ``<base>/api/entitlements/check`` and the
    call is allowed only when the response is HTTP 200, its body is a JSON
    object with ``code == 200`` and ``data.allow`` is truthy.

    Args:
        skill_slug: Slug of the skill being checked.

    Returns:
        ``(True, "")`` when allowed, else ``(False, reason)`` with a
        user-facing reason.
    """
    auth_base = (os.getenv("JIANGCHANG_AUTH_BASE_URL") or "").strip().rstrip("/")
    if not auth_base:
        return True, ""
    user_id = (os.getenv("CLAW_USER_ID") or os.getenv("JIANGCHANG_USER_ID") or "").strip()
    if not user_id:
        return False, "鉴权失败：缺少用户身份（CLAW_USER_ID / JIANGCHANG_USER_ID）"

    # A malformed, non-finite or non-positive timeout must not crash the CLI
    # (int() used to raise here, and requests rejects timeouts <= 0), so any
    # unusable value falls back to the 5-second default.
    default_timeout = 5.0
    try:
        timeout = float((os.getenv("JIANGCHANG_AUTH_TIMEOUT_SECONDS") or "").strip() or default_timeout)
    except ValueError:
        timeout = default_timeout
    if not 0 < timeout < float("inf"):
        timeout = default_timeout

    auth_api_key = (os.getenv("JIANGCHANG_AUTH_API_KEY") or "").strip()
    headers = {"Content-Type": "application/json"}
    if auth_api_key:
        headers["Authorization"] = f"Bearer {auth_api_key}"
    payload = {
        "user_id": user_id,
        "skill_slug": skill_slug,
        "trace_id": (os.getenv("JIANGCHANG_TRACE_ID") or "").strip(),
        "context": {"entry": "main.py"},
    }
    try:
        res = requests.post(f"{auth_base}/api/entitlements/check", json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        return False, f"鉴权请求失败：{exc}"
    if res.status_code != 200:
        return False, f"鉴权服务异常：HTTP {res.status_code}"
    try:
        body = res.json()
    except ValueError:
        return False, "鉴权服务异常：返回非 JSON"
    # Valid JSON that is not an object (list, string, ...) has no .get();
    # report it as a service error instead of raising AttributeError.
    if not isinstance(body, dict):
        return False, "鉴权服务异常：返回格式不正确"
    if body.get("code") != 200:
        return False, str(body.get("msg") or "鉴权失败")
    data = body.get("data") or {}
    if not isinstance(data, dict):
        # Fail closed: an unexpected "data" shape is treated as "not allowed".
        data = {}
    if not data.get("allow", False):
        return False, str(data.get("reason") or "未购买或已过期")
    return True, ""


def _call_json_script(script_path: str, args: List[str]) -> Optional[Dict[str, Any]]:
    """Run another script with the current interpreter and parse its stdout as a JSON object.

    Args:
        script_path: Path of the Python script to execute.
        args: Command-line arguments passed to the script.

    Returns:
        The decoded JSON object, or ``None`` when stdout is empty, starts with
        ``"ERROR"``, is not valid JSON, or decodes to something other than an
        object. The exit code and stderr are not inspected.
    """
    proc = subprocess.run(
        [sys.executable, script_path, *args],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    raw = (proc.stdout or "").strip()
    if not raw or raw.startswith("ERROR"):
        return None
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return None
    # Callers use dict methods on the result, so a JSON list/string/number is
    # treated like "no record" rather than crashing them with AttributeError.
    return parsed if isinstance(parsed, dict) else None


def _normalize_sohu_title(raw_title: str, article: Dict[str, Any]) -> str:
    """Coerce a title into Sohu's 5-72 character title range.

    - Longer than 72 characters: cut to 72, then trailing whitespace removed.
    - Shorter than 5 characters: the first non-blank line of the article body
      (``content``, else ``content_html``) is appended, or a fixed placeholder
      when the body is empty.
    """
    max_len, min_len = 72, 5

    def clip(text: str) -> str:
        # Only touch text that exceeds the upper bound.
        return text[:max_len].rstrip() if len(text) > max_len else text

    title = clip((raw_title or "").strip())
    if len(title) >= min_len:
        return title

    body = str(article.get("content") or article.get("content_html") or "").strip()
    first_line = next((ln.strip() for ln in body.splitlines() if ln.strip()), "")
    seed = (first_line or "搜狐发布稿件").replace("\r", " ").replace("\n", " ").strip()

    # Append the seed first; if that is still too short, add a fixed suffix so
    # the result always reaches the minimum length.
    merged = (title + seed).strip()
    if len(merged) < min_len:
        merged = (merged + "发布稿件标题").strip()
    return clip(merged)


def get_account(account_id: str) -> Optional[Dict[str, Any]]:
    """Look up an account by running the account-manager script's ``get`` command.

    Returns whatever ``_call_json_script`` returns for that invocation.
    """
    manager_script = os.path.join(get_skills_root(), "account-manager", "scripts", "main.py")
    cli_args = ["get", str(account_id)]
    return _call_json_script(manager_script, cli_args)


def get_article(article_id: str) -> Optional[Dict[str, Any]]:
    """Look up an article by running the content-manager script's ``get`` command.

    Returns whatever ``_call_json_script`` returns for that invocation.
    """
    manager_script = os.path.join(get_skills_root(), "content-manager", "scripts", "main.py")
    cli_args = ["get", str(article_id)]
    return _call_json_script(manager_script, cli_args)


async def publish(account: Dict[str, Any], article: Dict[str, Any], account_id: str) -> str:
    """Publish one article through the Sohu web editor using a persistent Chrome profile.

    The title is typed into the title input, the body is written to the
    clipboard as HTML and pasted into the editor, and the publish button is
    clicked. Success is detected by the page navigating away from a URL
    containing ``addarticle``.

    Args:
        account: Account record; ``account["profile_dir"]`` is the browser
            profile directory.
        article: Article record; ``title`` and ``content_html`` (falling back
            to ``content``) are read.
        account_id: Not used by this function; kept for the existing call signature.

    Returns:
        ``"SUCCESS"``; ``"ERROR:REQUIRE_LOGIN"`` when the title input does not
        become visible within 10 s; otherwise ``"FAIL:<reason>"``.

    Raises:
        KeyError: If ``account`` has no ``profile_dir``.
        Exception: Browser errors outside the handled steps propagate, but the
            browser context is closed first.
    """
    profile_dir = account["profile_dir"]
    original_title = str(article.get("title") or "")
    title = _normalize_sohu_title(original_title, article)
    if title != original_title:
        print(f"ℹ️ 标题已自动修正（搜狐要求 5-72 字）：\n  原标题：{original_title}\n  新标题：{title}")
    # Use `or` so a present-but-empty/None content_html still falls back to
    # `content`; a None value would otherwise be pasted as the text "null".
    content_html = str(article.get("content_html") or article.get("content") or "")

    publish_success = False
    error_text = "动作执行阻断：由于特殊元素拦截、频率限制或底层报错，未能成功发出。"

    async with async_playwright() as p:
        browser = await p.chromium.launch_persistent_context(
            user_data_dir=profile_dir,
            headless=False,
            channel="chrome",
            no_viewport=True,
            permissions=["clipboard-read", "clipboard-write"],
            args=["--start-maximized"],
        )
        # try/finally: close the persistent context on every exit path,
        # including unexpected errors.
        try:
            page = browser.pages[0] if browser.pages else await browser.new_page()
            await page.goto("https://mp.sohu.com/mpfe/v4/contentManagement/news/addarticle?contentStatus=1")
            await page.wait_for_load_state("networkidle")

            try:
                title_input = page.locator(".publish-title input").first
                await title_input.wait_for(state="visible", timeout=10000)
            except Exception:
                # No editor title field: treated as "not logged in".
                return "ERROR:REQUIRE_LOGIN"

            print("💡 页面加载且已确认登录，开始自动填入文字...")
            await title_input.click()
            await title_input.fill("")
            await page.keyboard.insert_text(title)
            await asyncio.sleep(1)

            editor = page.locator("#editor .ql-editor").first
            await editor.click()
            # Put the body on the clipboard as both HTML and plain text, then
            # paste it into the focused editor.
            await page.evaluate(
                """(html_str) => {
                const blobHtml = new Blob([html_str], { type: 'text/html' });
                const blobText = new Blob([html_str], { type: 'text/plain' });
                const item = new window.ClipboardItem({
                    'text/html': blobHtml,
                    'text/plain': blobText
                });
                return navigator.clipboard.write([item]);
            }""",
                content_html,
            )

            modifier = "Meta" if sys.platform == "darwin" else "Control"
            await page.keyboard.press(f"{modifier}+v")
            await asyncio.sleep(3)

            try:
                # Arm the navigation wait BEFORE clicking so a fast redirect
                # cannot be missed. A click failure is reported as FAIL too.
                async with page.expect_navigation(url=lambda u: "addarticle" not in u, timeout=8000):
                    await page.locator("li.publish-report-btn").first.click()
                    print("⌛ 正在提交发布，进入高压视觉核验阶段...")
                publish_success = True
            except Exception:
                # No navigation: look for a reason on the page.
                try:
                    limit_text = page.locator("text=/.*已达上限.*/").first
                    if await limit_text.is_visible(timeout=1500):
                        error_text = await limit_text.inner_text()
                    else:
                        # Heuristic: first short text in a fixed/absolute
                        # element with z-index above 80.
                        error_msg = await page.evaluate(
                            """() => {
                            const els = Array.from(document.querySelectorAll('div, span, p'));
                            for (let el of els) {
                                const style = window.getComputedStyle(el);
                                if ((style.position === 'fixed' || style.position === 'absolute')
                                    && parseInt(style.zIndex || 0) > 80
                                    && el.innerText.trim().length > 3
                                    && el.innerText.trim().length < 80) {
                                    return el.innerText.trim();
                                }
                            }
                            return null;
                        }"""
                        )
                        if error_msg:
                            error_text = f"抓取到报错原文: {error_msg}"
                except Exception as e:
                    error_text = f"抓取报错文案期间遭遇环境隔离: {e}"

            await asyncio.sleep(5)
        finally:
            await browser.close()

    if publish_success:
        return "SUCCESS"
    return f"FAIL:{error_text}"


def cmd_publish(account_id: str, article_id: str) -> int:
    """Handle the ``publish`` command: validate inputs, publish, and record the outcome.

    Steps: entitlement check, ``article_id`` validation, account lookup
    (platform must be ``sohu`` and ``login_status`` must be 1), article
    lookup, browser publish, then a ``publish_logs`` row plus a ``feedback``
    call to the content-manager script for success/failure.

    Args:
        account_id: Account id as known to account-manager.
        article_id: Article id; must consist of decimal digits.

    Returns:
        0 when the article was published, 1 otherwise.
    """
    ok, reason = check_entitlement(SKILL_SLUG)
    if not ok:
        print(f"❌ {reason}")
        return 1
    article_id = str(article_id)
    # isdecimal() matches exactly what int() accepts; isdigit() also accepts
    # characters such as "²" that make the later int() calls raise.
    if not article_id.isdecimal():
        print("❌ article_id 必须是数字。请先执行 content-manager 的 list 查看 id。")
        return 1

    account = get_account(account_id)
    if not account:
        print(f"❌ 查无此配置账号：{account_id}")
        return 1
    platform = str(account.get("platform") or "").strip().lower()
    if platform != "sohu":
        # Display names used only for the mismatch message below.
        platform_cn_map = {
            "doubao": "豆包",
            "deepseek": "DeepSeek",
            "qianwen": "通义千问",
            "kimi": "Kimi",
            "yiyan": "文心一言",
            "yuanbao": "腾讯元宝",
            "toutiao": "头条号",
            "zhihu": "知乎",
            "wechat": "微信公众号",
            "sohu": "搜狐号",
        }
        got_cn = platform_cn_map.get(platform, platform or "未知平台")
        print("❌ 账号平台不匹配：当前账号不是「搜狐号」。")
        print(f"当前 account_id={account_id} 对应平台：{got_cn}（platform={platform or 'unknown'}）")
        print("请换一个搜狐账号 id 后重试。")
        print("可先执行：python account-manager/scripts/main.py list sohu")
        return 1
    login_status = int(account.get("login_status") or 0)
    if login_status != 1:
        print("❌ 该搜狐账号当前未登录，暂不能发布。")
        print("请先手工登录，再执行发布：")
        print(f"  python account-manager/scripts/main.py login {account_id}")
        print(f"登录完成后再执行：python sohu-publisher/scripts/main.py publish {account_id} {article_id}")
        return 1
    article = get_article(article_id)
    if not article:
        print(f"❌ 查无此文章编号（库中无 ID: {article_id}）")
        return 1

    try:
        result = asyncio.run(publish(account, article, account_id))
    except Exception as exc:
        # Top-level boundary: record an unexpected browser failure as a failed
        # attempt instead of ending in a traceback with no log row. The detail
        # is flattened and capped because it is also passed as an argument to
        # the feedback command.
        detail = " ".join(f"{type(exc).__name__}: {exc}".split())[:500]
        result = f"FAIL:发布过程异常中断：{detail}"
    content_script = os.path.join(get_skills_root(), "content-manager", "scripts", "main.py")
    # A None title would otherwise be printed as "None".
    title = str(article.get("title") or "")

    if result == "ERROR:REQUIRE_LOGIN":
        save_publish_log(account_id, int(article_id), title, "require_login", "账号未登录或登录已失效")
        print(f"⚠️ 搜狐号 ({account_id}) 登录状态已失效，发布流程已中止。")
        print("请先手工完成登录，再重新发布：")
        print(f"  python account-manager/scripts/main.py login {account_id}")
        print(f"  python sohu-publisher/scripts/main.py publish {account_id} {article_id}")
        return 1

    if result == "SUCCESS":
        log_id = save_publish_log(account_id, int(article_id), title, "published", None)
        subprocess.run([sys.executable, content_script, "feedback", article_id, "published", account_id])
        print(f"🎉 发布成功：{title}")
        print(f"✅ 发布日志已记录，log_id={log_id}")
        return 0

    if result.startswith("FAIL:"):
        error_msg = result[len("FAIL:") :]
        log_id = save_publish_log(account_id, int(article_id), title, "failed", error_msg)
        subprocess.run([sys.executable, content_script, "feedback", article_id, "failed", account_id, error_msg])
        print(f"❌ 发布失败：{error_msg}")
        print(f"✅ 失败日志已记录，log_id={log_id}")
        return 1

    # Unrecognised result string: still log it, and tell the user instead of
    # exiting silently.
    unknown_msg = f"未知结果：{result}"
    log_id = save_publish_log(account_id, int(article_id), title, "failed", unknown_msg)
    print(f"❌ 发布失败：{unknown_msg}")
    print(f"✅ 失败日志已记录，log_id={log_id}")
    return 1


def cmd_logs(limit: int = 10, status: Optional[str] = None, account_id: Optional[str] = None) -> int:
    """Print publish-log rows, newest first, optionally filtered.

    Args:
        limit: Maximum number of rows; non-positive values fall back to 10.
        status: When truthy, only rows with exactly this status are listed.
        account_id: When truthy, only rows for this account are listed.

    Returns:
        Always 0 (also when there are no rows).
    """
    init_db()
    row_cap = limit if limit > 0 else 10

    conn = get_conn()
    try:
        sql_parts = [
            "SELECT id, account_id, article_id, article_title, status, error_msg, created_at, updated_at ",
            "FROM publish_logs WHERE 1=1 ",
        ]
        bind_values: List[Any] = []
        # Each optional filter adds one clause and one bound value.
        for clause, value in (("AND status = ? ", status), ("AND account_id = ? ", account_id)):
            if value:
                sql_parts.append(clause)
                bind_values.append(value)
        sql_parts.append("ORDER BY created_at DESC, id DESC LIMIT ?")
        bind_values.append(int(row_cap))

        cur = conn.cursor()
        cur.execute("".join(sql_parts), tuple(bind_values))
        rows = cur.fetchall()
    finally:
        conn.close()

    if not rows:
        print("暂无发布记录")
        return 0

    final_index = len(rows) - 1
    for position, row in enumerate(rows):
        log_id, acct, art_id, art_title, state, error, created, updated = row
        record_lines = (
            f"id：{log_id}",
            f"account_id：{acct or ''}",
            f"article_id：{art_id}",
            f"article_title：{art_title or ''}",
            f"status：{state or ''}",
            f"error_msg：{error or ''}",
            # Show the raw stored value when it cannot be formatted as a timestamp.
            f"created_at：{_unix_to_iso(created) or str(created or '')}",
            f"updated_at：{_unix_to_iso(updated) or str(updated or '')}",
        )
        for text in record_lines:
            print(text)
        # Separator between records, not after the last one.
        if position != final_index:
            print("_" * 39)
            print()
    return 0


def cmd_log_get(log_id: str) -> int:
    """Print one publish-log row as a single-line JSON object.

    Args:
        log_id: Row id as text; must consist of decimal digits.

    Returns:
        0 when the row was found and printed, 1 for an invalid id or a missing row.
    """
    id_text = str(log_id)
    # isdecimal() matches exactly what int() accepts; isdigit() also accepts
    # characters such as "²" that make int() raise.
    if not id_text.isdecimal():
        print("❌ log_id 必须是数字")
        return 1
    wanted = int(id_text)
    # SQLite integers are signed 64-bit; a larger value cannot be bound as a
    # parameter (OverflowError) and cannot match any row.
    if wanted > 2**63 - 1:
        print("❌ 没有这条发布记录")
        return 1

    init_db()
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT id, account_id, article_id, article_title, status, error_msg, created_at, updated_at FROM publish_logs WHERE id = ?",
            (wanted,),
        )
        row = cur.fetchone()
    finally:
        conn.close()
    if not row:
        print("❌ 没有这条发布记录")
        return 1
    rid, aid, arid, title, st, err, cat, uat = row
    print(
        json.dumps(
            {
                "id": int(rid),
                "account_id": aid,
                "article_id": int(arid),
                "article_title": title,
                "status": st,
                "error_msg": err,
                "created_at": _unix_to_iso(cat),
                "updated_at": _unix_to_iso(uat),
            },
            ensure_ascii=False,
        )
    )
    return 0


class ZhArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that reports usage errors with a Chinese hint."""

    def error(self, message: str) -> None:
        """Write the error plus a help hint to stderr, then exit with status 2."""
        hint = f"参数错误：{message}\n请执行：python main.py -h 查看帮助"
        sys.stderr.write(hint + "\n")
        self.exit(2)


def _print_full_usage() -> None:
    """Print the command overview, examples and the legacy-invocation note to stdout."""
    usage_lines = (
        "搜狐号发布（main.py）可用命令：",
        "  python main.py publish <account_id> <article_id>   # 发布一篇",
        "  python main.py logs [--limit N] [--status s] [--account-id a]   # 查看发布记录",
        "  python main.py log-get <log_id>                    # 查看单条日志(JSON)",
        "  python main.py health",
        "  python main.py version",
        "",
        "常见示例：",
        "  python main.py publish sohu_account1 12",
        "  python main.py logs",
        "  python main.py logs --status failed --limit 20",
        "  python main.py log-get 7",
        "",
        "说明：也兼容旧写法：python main.py <account_id> <article_id>",
    )
    print("\n".join(usage_lines))


def build_parser() -> ZhArgumentParser:
    """Build the CLI parser with the publish / logs / log-get / health / version subcommands.

    Each subparser stores a ``handler`` default: a callable that takes the
    parsed namespace and returns the process exit code.
    """

    def run_publish(ns: argparse.Namespace) -> int:
        return cmd_publish(ns.account_id, ns.article_id)

    def run_logs(ns: argparse.Namespace) -> int:
        return cmd_logs(limit=ns.limit, status=ns.status, account_id=ns.account_id)

    def run_log_get(ns: argparse.Namespace) -> int:
        return cmd_log_get(ns.log_id)

    def run_health(_ns: argparse.Namespace) -> int:
        # Exit code 0 only when running on Python 3.10 or newer.
        return 0 if sys.version_info >= (3, 10) else 1

    def run_version(_ns: argparse.Namespace) -> int:
        print(json.dumps({"version": SKILL_VERSION, "skill": SKILL_SLUG}, ensure_ascii=False))
        return 0

    example_lines = [
        "示例：",
        "  python main.py publish sohu_account1 12",
        "  python main.py logs",
        "  python main.py logs --status failed --limit 20",
        "  python main.py log-get 7",
        "  python main.py health",
        "  python main.py version",
    ]
    parser = ZhArgumentParser(
        prog="main.py",
        description="搜狐号发布：发布文章、查看发布记录、查询单条日志。",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join(example_lines),
    )
    commands = parser.add_subparsers(dest="cmd", required=True, parser_class=ZhArgumentParser)

    publish_cmd = commands.add_parser("publish", help="发布一篇文章到搜狐号")
    publish_cmd.add_argument("account_id", help="账号 id（来自 account-manager list）")
    publish_cmd.add_argument("article_id", help="文章 id（来自 content-manager list）")
    publish_cmd.set_defaults(handler=run_publish)

    logs_cmd = commands.add_parser("logs", help="查看发布记录（默认最近 10 条）")
    logs_cmd.add_argument("--limit", type=int, default=10, help="最多显示条数（默认 10）")
    logs_cmd.add_argument("--status", default=None, help="按状态筛选：published/failed/require_login")
    logs_cmd.add_argument("--account-id", default=None, help="按账号 id 筛选")
    logs_cmd.set_defaults(handler=run_logs)

    log_get_cmd = commands.add_parser("log-get", help="按 log_id 查看单条发布记录(JSON)")
    log_get_cmd.add_argument("log_id", help="日志 id（整数）")
    log_get_cmd.set_defaults(handler=run_log_get)

    health_cmd = commands.add_parser("health", help="健康检查")
    health_cmd.set_defaults(handler=run_health)

    version_cmd = commands.add_parser("version", help="版本信息(JSON)")
    version_cmd.set_defaults(handler=run_version)
    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point; returns the process exit code.

    Args:
        argv: Argument list without the program name; defaults to ``sys.argv[1:]``.

    With no arguments the usage text is printed and 1 is returned.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        _print_full_usage()
        return 1

    # Backward compatibility with the old form: python main.py <account_id> <article_id>
    known_first_tokens = {"publish", "logs", "log-get", "health", "version", "-h", "--help"}
    is_legacy_call = len(argv) == 2 and argv[0] not in known_first_tokens
    if is_legacy_call:
        legacy_account, legacy_article = argv
        return cmd_publish(legacy_account, legacy_article)

    args = build_parser().parse_args(argv)
    exit_code = args.handler(args)
    return int(exit_code)


# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())