# Dailymotion keyword crawler: searches videos per keyword via the GraphQL API
# and exports per-video channel statistics to Excel, with resumable progress.
import json
import random
import time
import uuid
import concurrent.futures
import logging
from random import uniform
from datetime import datetime
from typing import Dict, List, Optional, Union
import pandas as pd
import requests
import os
import urllib3
from requests import RequestException
from fake_useragent import UserAgent

# Logging: mirror everything to a UTF-8 log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('dailymotion.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Silence SSL warnings — requests below are issued with verify=False (local proxy).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Paths: keyword spreadsheet input and the per-keyword output directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
KW_PATH = os.path.join(BASE_DIR, 'data', 'keyword.xlsx')
OUTPUT_DIR = os.path.join(BASE_DIR, 'out_put_CNTW')

# Create the output directory on first run.
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    logger.info(f'创建输出目录: {OUTPUT_DIR}')

# Request tuning knobs.
MAX_RETRIES = 3       # attempts per HTTP request
BASE_DELAY = 2        # base seconds for exponential backoff
MAX_WORKERS = 5       # thread-pool size for concurrent video lookups
REQUEST_TIMEOUT = 30  # per-request timeout, seconds
class DailymotionAPI:
    """Minimal client for Dailymotion's GraphQL endpoint.

    Sends browser-like headers through a local proxy (127.0.0.1:7890),
    rotates the User-Agent per request, and retries failures with
    exponential backoff.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            # BUGFIX: 'Accept' and 'Content-Type' values were accidentally
            # doubled ('*/*, */*' and 'application/json, application/json');
            # de-duplicated to the intended single values.
            'Accept': '*/*',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'Host': 'graphql.api.dailymotion.com',
            'Origin': 'https://www.dailymotion.com',
            'Referer': 'https://www.dailymotion.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'X-DM-AppInfo-Id': 'com.dailymotion.neon',
            'X-DM-AppInfo-Type': 'website',
            'X-DM-AppInfo-Version': 'v2025-04-28T12:37:52.391Z',
            'X-DM-Neon-SSR': '0',
            'X-DM-Preferred-Country': 'us',
            'accept-language': 'zh-CN',
        })
        # Build the user-agent pool once. Constructing UserAgent() inside every
        # request attempt (as before) reloads its browser database each time.
        self._ua = UserAgent()
        self.session.proxies = {
            "http": 'http://127.0.0.1:7890',
            "https": 'http://127.0.0.1:7890',
        }

    def _make_request(self, url: str, json_data: Dict, retries: int = MAX_RETRIES) -> Dict:
        """POST JSON to *url*, retrying with exponential backoff.

        Args:
            url: request URL.
            json_data: JSON-serializable request body.
            retries: maximum number of attempts.

        Returns:
            Dict: decoded JSON response.

        Raises:
            Exception: the last error, once all attempts are exhausted.
        """
        for attempt in range(retries):
            try:
                # Rotate the User-Agent on every attempt.
                self.session.headers['User-Agent'] = self._ua.random
                response = self.session.post(
                    url,
                    json=json_data,
                    timeout=REQUEST_TIMEOUT,
                    verify=False  # SSL checks off (proxy); warnings silenced at import
                )
                response.raise_for_status()
                return response.json()
            except Exception as e:
                if attempt == retries - 1:
                    logger.error(f'请求失败: {str(e)}')
                    raise
                # Backoff with jitter: BASE_DELAY * 2^attempt + U(1, 3) seconds.
                wait_time = BASE_DELAY * (2 ** attempt) + uniform(1, 3)
                logger.warning(f'请求失败,等待 {wait_time:.2f} 秒后重试...')
                time.sleep(wait_time)

    def get_video_info(self, x_id: str) -> Dict[str, Union[int, str]]:
        """Fetch channel-level stats for a video via the WATCHING_VIDEO query.

        Args:
            x_id: Dailymotion video xid.

        Returns:
            Dict: totals for the video's channel under keys 'view', 'fans'
                and 'videos'; '-' placeholders when the lookup fails.
        """
        try:
            payload = {
                "operationName": "WATCHING_VIDEO",
                "variables": {"xid": x_id, "isSEO": False},
                "query": "fragment VIDEO_FRAGMENT on Video {\n id\n xid\n isPublished\n duration\n title\n description\n thumbnailx60: thumbnailURL(size: \"x60\")\n thumbnailx120: thumbnailURL(size: \"x120\")\n thumbnailx240: thumbnailURL(size: \"x240\")\n thumbnailx360: thumbnailURL(size: \"x360\")\n thumbnailx480: thumbnailURL(size: \"x480\")\n thumbnailx720: thumbnailURL(size: \"x720\")\n thumbnailx1080: thumbnailURL(size: \"x1080\")\n aspectRatio\n category\n categories(filter: {category: {eq: CONTENT_CATEGORY}}) {\n edges {\n node { id name slug __typename }\n __typename\n }\n __typename\n }\n iab_categories: categories(\n filter: {category: {eq: IAB_CATEGORY}, percentage: {gte: 70}}\n ) {\n edges {\n node { id slug __typename }\n __typename\n }\n __typename\n }\n bestAvailableQuality\n createdAt\n viewerEngagement {\n id\n liked\n favorited\n __typename\n }\n isPrivate\n isWatched\n isCreatedForKids\n isExplicit\n canDisplayAds\n videoWidth: width\n videoHeight: height\n status\n hashtags {\n edges {\n node { id name __typename }\n __typename\n }\n __typename\n }\n stats {\n id\n views { id total __typename }\n __typename\n }\n channel {\n __typename\n id\n xid\n name\n displayName\n isArtist\n logoURLx25: logoURL(size: \"x25\")\n logoURL(size: \"x60\")\n isFollowed\n accountType\n coverURLx375: coverURL(size: \"x375\")\n stats {\n id\n views { id total __typename }\n followers { id total __typename }\n videos { id total __typename }\n __typename\n }\n country { id codeAlpha2 __typename }\n organization @skip(if: $isSEO) {\n id\n xid\n owner { id xid __typename }\n __typename\n }\n }\n language { id codeAlpha2 __typename }\n tags {\n edges {\n node { id label __typename }\n __typename\n }\n __typename\n }\n moderation { id reviewedAt __typename }\n topics(whitelistedOnly: true, first: 3, page: 1) {\n edges {\n node {\n id\n xid\n name\n names {\n edges {\n node {\n id\n name\n language { id codeAlpha2 __typename }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n __typename\n }\n geoblockedCountries {\n id\n allowed\n denied\n __typename\n }\n transcript {\n edges {\n node { id timecode text __typename }\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment LIVE_FRAGMENT on Live {\n id\n xid\n startAt\n endAt\n isPublished\n title\n description\n thumbnailx60: thumbnailURL(size: \"x60\")\n thumbnailx120: thumbnailURL(size: \"x120\")\n thumbnailx240: thumbnailURL(size: \"x240\")\n thumbnailx360: thumbnailURL(size: \"x360\")\n thumbnailx480: thumbnailURL(size: \"x480\")\n thumbnailx720: thumbnailURL(size: \"x720\")\n thumbnailx1080: thumbnailURL(size: \"x1080\")\n aspectRatio\n category\n createdAt\n viewerEngagement { id liked favorited __typename }\n isPrivate\n isExplicit\n isCreatedForKids\n bestAvailableQuality\n canDisplayAds\n videoWidth: width\n videoHeight: height\n stats { id views { id total __typename } __typename }\n channel {\n __typename\n id\n xid\n name\n displayName\n isArtist\n logoURLx25: logoURL(size: \"x25\")\n logoURL(size: \"x60\")\n isFollowed\n accountType\n coverURLx375: coverURL(size: \"x375\")\n stats { id views { id total __typename } followers { id total __typename } videos { id total __typename } __typename }\n country { id codeAlpha2 __typename }\n organization @skip(if: $isSEO) { id xid owner { id xid __typename } __typename }\n }\n language { id codeAlpha2 __typename }\n tags { edges { node { id label __typename } __typename } __typename }\n moderation { id reviewedAt __typename }\n topics(whitelistedOnly: true, first: 3, page: 1) {\n edges { node { id xid name names { edges { node { id name language { id codeAlpha2 __typename } __typename } __typename } __typename } __typename } __typename }\n __typename\n }\n geoblockedCountries { id allowed denied __typename }\n __typename\n}\n\nquery WATCHING_VIDEO($xid: String!, $isSEO: Boolean!) {\n video: media(xid: $xid) {\n __typename\n ... on Video { id ...VIDEO_FRAGMENT __typename }\n ... on Live { id ...LIVE_FRAGMENT __typename }\n }\n}"
            }
            response_data = self._make_request('https://graphql.api.dailymotion.com/', payload)
            v_info = response_data['data']['video']['channel']['stats']
            return {
                "view": v_info['views']['total'],
                "fans": v_info['followers']['total'],
                "videos": v_info['videos']['total'],
            }
        except Exception as e:
            logger.error(f'获取视频信息失败: {str(e)}')
            return {"view": '-', "fans": '-', "videos": '-'}
def process_video(api: DailymotionAPI, node: Dict, calculated_index: int) -> Optional[Dict]:
    """Flatten one search-result node into an output row, with channel stats.

    Args:
        api: DailymotionAPI instance used for the stats lookup.
        node: raw video node from the search response.
        calculated_index: global row index assigned to this video.

    Returns:
        Optional[Dict]: the flattened row, or None when processing failed.
    """
    xid = node.get('xid')
    try:
        logger.info(f'开始处理视频 {xid} (索引: {calculated_index})')
        # Short random pause keeps the request rate gentle.
        time.sleep(uniform(1, 2))
        stats = api.get_video_info(xid)
        row = {
            "index": calculated_index,
            "id": node.get('id'),
            "xid": xid,
            "link": f"https://www.dailymotion.com/video/{xid}",
            "title": node.get('title'),
            "createtime": node.get('createdAt'),
            "duration": node.get('duration'),
            "pic": node.get('thumbnail', {}).get('url'),
            "view": stats['view'],
            "fans": stats['fans'],
            "videos": stats['videos'],
        }
        logger.debug(f'视频 {xid} 处理成功')
        return row
    except Exception as exc:
        logger.error(f'处理视频 {xid} 出错: {str(exc)}')
        return None
def process_videos_batch(api: DailymotionAPI, videos: List[Dict], start_index: int) -> List[Dict]:
    """Process one page of videos concurrently on a thread pool.

    Args:
        api: DailymotionAPI instance shared by all workers.
        videos: raw video nodes for this page.
        start_index: index assigned to the first video of the batch.

    Returns:
        List[Dict]: rows for videos that processed successfully, in
            completion order (not submission order).
    """
    rows: List[Dict] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {}
        for offset, video in enumerate(videos):
            idx = start_index + offset
            pending[pool.submit(process_video, api, video, idx)] = (video, idx)
        for done in concurrent.futures.as_completed(pending):
            _, idx = pending[done]
            try:
                row = done.result()
            except Exception as exc:
                logger.error(f'处理视频失败 (索引: {idx}): {str(exc)}')
            else:
                if row:
                    rows.append(row)
    return rows
def save_results(results: List[Dict], output_file: str):
    """Write the collected rows to an Excel workbook.

    Args:
        results: flattened video rows.
        output_file: destination .xlsx path.
    """
    try:
        frame = pd.DataFrame(results)
        frame.to_excel(output_file, index=False, engine='openpyxl')
        logger.info(f'结果已保存到: {output_file}')
    except Exception as exc:
        # Best-effort save: log and carry on rather than abort the run.
        logger.error(f'保存结果失败: {str(exc)}')
def search_videos(api: DailymotionAPI, keyword: str, page: int = 1) -> List[Dict]:
    """Run one page of the SEARCH_VIDEOS GraphQL query.

    Args:
        api: DailymotionAPI instance to send the request through.
        keyword: search term.
        page: 1-based result page.

    Returns:
        List[Dict]: raw video nodes for the page; empty on failure or
            when the page has no results.
    """
    # Fixed page size of 20, sorted by relevance.
    payload = {
        "operationName": "SEARCH_VIDEOS",
        "variables": {
            "query": keyword,
            "page": page,
            "limit": 20,
            "sort": "relevance",
        },
        "query": "query SEARCH_VIDEOS($query: String!, $page: Int!, $limit: Int!, $sort: String!) {\n videos(\n first: $limit\n page: $page\n search: {query: $query, sort: $sort}\n ) {\n pageInfo { hasNextPage currentPage __typename }\n edges {\n node {\n id\n xid\n title\n createdAt\n duration\n thumbnail { url __typename }\n __typename\n }\n __typename\n }\n __typename\n }\n}",
    }
    try:
        data = api._make_request('https://graphql.api.dailymotion.com/', payload)
        edges = data['data']['videos']['edges']
        return [edge['node'] for edge in edges]
    except Exception as exc:
        logger.error(f'搜索视频失败: {str(exc)}')
        return []
def load_progress(keyword: str) -> Dict:
    """Load saved crawl progress for a keyword.

    Args:
        keyword: search keyword whose progress file is looked up.

    Returns:
        Dict: previously saved progress, or a fresh
            {'page': 1, 'video_data': [], 'user_data': []} state when no
            readable progress file exists.
    """
    path = os.path.join(OUTPUT_DIR, f'{keyword}_progress.json')
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                return json.load(fh)
        except Exception as exc:
            # Corrupt/unreadable file: fall through to a fresh state.
            logger.error(f'加载进度失败: {str(exc)}')
    return {'page': 1, 'video_data': [], 'user_data': []}
def save_progress(keyword: str, progress: Dict):
    """Persist crawl progress so an interrupted run can resume.

    Args:
        keyword: search keyword the progress belongs to.
        progress: progress state to serialize as JSON.
    """
    path = os.path.join(OUTPUT_DIR, f'{keyword}_progress.json')
    try:
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(progress, fh)
    except Exception as exc:
        logger.error(f'保存进度失败: {str(exc)}')
def main():
    """Read keywords from the input workbook, crawl each, and save results.

    For every keyword: resume from any saved progress, page through search
    results until a page comes back empty, fetch per-video channel stats,
    and finally export the accumulated rows to an Excel file.
    """
    try:
        # Keyword column may be named either '搜索词' or 'keyword'.
        df = pd.read_excel(KW_PATH)
        if '搜索词' in df.columns:
            keywords = df['搜索词'].tolist()
        elif 'keyword' in df.columns:
            keywords = df['keyword'].tolist()
        else:
            raise ValueError('Excel文件中未找到列名"搜索词""keyword",请检查文件格式')

        api = DailymotionAPI()
        for keyword in keywords:
            logger.info(f'开始处理关键词: {keyword}')
            progress = load_progress(keyword)
            current_page = progress['page']
            video_data = progress['video_data']
            try:
                while True:
                    videos = search_videos(api, keyword, current_page)
                    if not videos:
                        # Empty page (or search failure) ends this keyword.
                        break
                    results = process_videos_batch(api, videos, len(video_data))
                    video_data.extend(results)
                    current_page += 1
                    # BUGFIX: persist the NEXT page to fetch. Previously the
                    # just-completed page number was saved, so a resumed run
                    # re-crawled that page and produced duplicate rows.
                    progress['page'] = current_page
                    progress['video_data'] = video_data
                    save_progress(keyword, progress)
                    logger.info(f'已处理 {len(video_data)} 个视频')

                # Export everything gathered for this keyword.
                if video_data:
                    output_file = os.path.join(OUTPUT_DIR, f'{keyword}_results.xlsx')
                    save_results(video_data, output_file)
            except Exception as e:
                logger.error(f'处理关键词 {keyword} 出错: {str(e)}')
                continue
    except Exception as e:
        logger.error(f'程序执行出错: {str(e)}')
    finally:
        logger.info('程序执行完成')


if __name__ == '__main__':
    main()