DailyMotion/oneget.py

import base64
from datetime import datetime

import requests
import uuid
import random
import time
import copy
from threading import Lock
import logging
from DB import DBVidcon
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

MACHINE_ID = 3
logger = logging.getLogger(__name__)
db = DBVidcon()
proxiesdict = db.get_proxy_agent_dict()

def clean_dash_to_zero(val):
    if val in ('-', '', None):
        return 0
    try:
        return int(val)
    except (ValueError, TypeError) as e:
        logger.exception(f"[字段异常] val = {val} → {str(e)}")
        return 0


def format_create_time(timestr):
    try:
        dt = date_parser.isoparse(timestr)
        return dt.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        logger.exception(f"[时间格式错误] {timestr} → {str(e)}")
        return "1970-01-01 00:00:00"


def format_duration(seconds):
    try:
        seconds = int(seconds)
        return f"{seconds // 60:02}:{seconds % 60:02}"
    except Exception:
        return "00:00"

class DMHeaderManager:
    _headers_template = {
        'Accept': '*/*, */*',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json, application/json',
        'Host': 'graphql.api.dailymotion.com',
        'Origin': 'https://www.dailymotion.com',
        'Referer': 'https://www.dailymotion.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0',
        'X-DM-AppInfo-Id': 'com.dailymotion.neon',
        'X-DM-AppInfo-Type': 'website',
        'X-DM-AppInfo-Version': 'v2025-05-26T13:45:05.666Z',
        'X-DM-Neon-SSR': '0',
        'X-DM-Preferred-Country': 'tw',
        'accept-language': 'zh-CN',
        'authorization': '',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'x-dm-visit-id': '',
        'x-dm-visitor-id': '',
    }

    _user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    ]

    def __init__(self, proxies: dict = None):
        self._headers_cache = None
        self._cache_lock = Lock()
        self._proxies = proxies

    def get_headers(self, retry: int = 2) -> dict:
        for attempt in range(retry + 1):
            try:
                return self._generate_headers()
            except Exception as e:
                logger.warning(f"[get_headers] 第 {attempt + 1} 次尝试失败: {e}")
                time.sleep(2)

        with self._cache_lock:
            if self._headers_cache:
                logger.info("[get_headers]")
                return copy.deepcopy(self._headers_cache)

        logger.warning("[get_headers] 基础 headers")
        return copy.deepcopy(self._headers_template)

    def _generate_headers(self) -> dict:
        visitor_id = str(uuid.uuid4())
        visit_id = str(int(time.time() * 1000))
        traffic_segment = str(random.randint(100_000, 999_999))
        ua = random.choice(self._user_agents)

        token_headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.dailymotion.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.dailymotion.com/',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-site',
            'User-Agent': ua,
            'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }

        data = {
            'client_id': 'f1a362d288c1b98099c7',
            'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
            'grant_type': 'client_credentials',
            'traffic_segment': traffic_segment,
            'visitor_id': visitor_id,
        }

        response = requests.post(
            'https://graphql.api.dailymotion.com/oauth/token',
            headers=token_headers,
            data=data,
            proxies=self._proxies,
            timeout=10
        )
        response.raise_for_status()
        token = response.json()['access_token']

        new_headers = copy.deepcopy(self._headers_template)
        new_headers['authorization'] = f'Bearer {token}'
        new_headers['x-dm-visit-id'] = visit_id
        new_headers['x-dm-visitor-id'] = visitor_id
        new_headers['User-Agent'] = ua

        with self._cache_lock:
            self._headers_cache = copy.deepcopy(new_headers)

        return new_headers


class DMVideoInfo:
    def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5):
        self.proxies = proxies
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.session = self._create_session()

    def _create_session(self):
        session = requests.Session()
        retry = Retry(
            total=self.max_retries,
            connect=self.max_retries,
            read=self.max_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"]
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        if self.proxies:
            session.proxies.update(self.proxies)

        return session

    def get_video_info(self, data: dict) -> dict:
        v_xid = data.get('v_xid')
        url = f'https://api.dailymotion.com/video/{v_xid}'
        params = {
            'fields': 'id,title,created_time,thumbnail_240_url,duration,'
                      'owner.id,owner.screenname,likes_total,views_total,'
                      'owner.avatar_60_url,owner.followers_total,owner.videos_total'
        }

        try:
            resp = self.session.get(url, params=params, timeout=10)
            resp.raise_for_status()
            r_data = resp.json()
            xid = r_data["id"]
            vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8')
            uxid = r_data["owner.id"]
            uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
            data["v_id"] = vid
            data["v_title"] = r_data["title"]
            data["link"] = "https://www.dailymotion.com/video/" + xid,
            data["duration"] = r_data["duration"]
            data['createdtime'] = datetime.fromtimestamp(r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S"),
            data['']
        except requests.RequestException as e:
            print(f"[ERROR] 请求失败 vxid={v_xid} : {e}")
            return None


def main():
    kwdata = db.get_web_items()
    if not kwdata:
        logger.error("没有获取到关键词数据")
        exit(1)

    kwdata = kwdata[0][1]
    rn = kwdata['rn']
    proxy_name = proxiesdict.get(rn)
    proxies_str = db.get_proxy(proxy_name, '-1')
    proxies = {
        'http': proxies_str,
        'https': proxies_str
    }
    kw = kwdata['keyword']

    dmheader_manager = DMHeaderManager(proxies=proxies)

    headers = dmheader_manager.get_headers()
    for i in range(1, 11):
        data = {
            "operationName": "SEARCH_QUERY",
            "variables": {
                "query": kw,
                "shouldIncludeTopResults": True,  # 是否包含热门结果
                "shouldIncludeChannels": False,  # 是否包含频道
                "shouldIncludePlaylists": False,  # 是否包含播放列表
                "shouldIncludeHashtags": False,  # 是否包含标签
                "shouldIncludeVideos": False,  # 是否包含视频
                "shouldIncludeLives": False,  # 是否包含直播
                "page": i,
                "limit": 20,
                "recaptchaToken": None
            },
            "query": """
        fragment VIDEO_BASE_FRAGMENT on Video {
          id
          xid
          title
          createdAt
          duration
          aspectRatio
          thumbnail(height: PORTRAIT_240) {
            id
            url
            __typename
          }
          creator {
            id
            xid
            name
            displayName
            accountType
            avatar(height: SQUARE_60) {
              id
              url
              __typename
            }
            __typename
          }
          __typename
        }

        fragment CHANNEL_BASE_FRAG on Channel {
          id
          xid
          name
          displayName
          accountType
          isFollowed
          avatar(height: SQUARE_120) {
            id
            url
            __typename
          }
          followerEngagement {
            id
            followDate
            __typename
          }
          metrics {
            id
            engagement {
              id
              followers {
                edges {
                  node {
                    id
                    total
                    __typename
                  }
                  __typename
                }
                __typename
              }
              __typename
            }
            __typename
          }
          __typename
        }

        fragment PLAYLIST_BASE_FRAG on Collection {
          id
          xid
          name
          description
          thumbnail(height: PORTRAIT_240) {
            id
            url
            __typename
          }
          creator {
            id
            xid
            name
            displayName
            accountType
            avatar(height: SQUARE_60) {
              id
              url
              __typename
            }
            __typename
          }
          metrics {
            id
            engagement {
              id
              videos(filter: {visibility: {eq: PUBLIC}}) {
                edges {
                  node {
                    id
                    total
                    __typename
                  }
                  __typename
                }
                __typename
              }
              __typename
            }
            __typename
          }
          __typename
        }

        fragment HASHTAG_BASE_FRAG on Hashtag {
          id
          xid
          name
          metrics {
            id
            engagement {
              id
              videos {
                edges {
                  node {
                    id
                    total
                    __typename
                  }
                  __typename
                }
                __typename
              }
              __typename
            }
            __typename
          }
          __typename
        }

        fragment LIVE_BASE_FRAGMENT on Live {
          id
          xid
          title
          audienceCount
          aspectRatio
          isOnAir
          thumbnail(height: PORTRAIT_240) {
            id
            url
            __typename
          }
          creator {
            id
            xid
            name
            displayName
            accountType
            avatar(height: SQUARE_60) {
              id
              url
              __typename
            }
            __typename
          }
          __typename
        }

        query SEARCH_QUERY(
          $query: String!,
          $shouldIncludeTopResults: Boolean!,
          $shouldIncludeVideos: Boolean!,
          $shouldIncludeChannels: Boolean!,
          $shouldIncludePlaylists: Boolean!,
          $shouldIncludeHashtags: Boolean!,
          $shouldIncludeLives: Boolean!,
          $page: Int,
          $limit: Int,
          $sortByVideos: SearchVideoSort,
          $durationMinVideos: Int,
          $durationMaxVideos: Int,
          $createdAfterVideos: DateTime,
          $recaptchaToken: String
        ) {
          search(token: $recaptchaToken) {
            id

            stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  ...VIDEO_BASE_FRAGMENT
                  ...CHANNEL_BASE_FRAG
                  ...PLAYLIST_BASE_FRAG
                  ...HASHTAG_BASE_FRAG
                  ...LIVE_BASE_FRAGMENT
                  __typename
                }
                __typename
              }
              __typename
            }

            videos(
              query: $query,
              first: $limit,
              page: $page,
              sort: $sortByVideos,
              durationMin: $durationMinVideos,
              durationMax: $durationMaxVideos,
              createdAfter: $createdAfterVideos
            ) @include(if: $shouldIncludeVideos) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  id
                  ...VIDEO_BASE_FRAGMENT
                  __typename
                }
                __typename
              }
              __typename
            }

            lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  id
                  ...LIVE_BASE_FRAGMENT
                  __typename
                }
                __typename
              }
              __typename
            }

            channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  id
                  ...CHANNEL_BASE_FRAG
                  __typename
                }
                __typename
              }
              __typename
            }

            playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  id
                  ...PLAYLIST_BASE_FRAG
                  __typename
                }
                __typename
              }
              __typename
            }

            hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) {
              metadata {
                id
                algorithm {
                  uuid
                  __typename
                }
                __typename
              }
              pageInfo {
                hasNextPage
                nextPage
                __typename
              }
              edges {
                node {
                  id
                  ...HASHTAG_BASE_FRAG
                  __typename
                }
                __typename
              }
              __typename
            }

            __typename
          }
        }
        """
        }

        payload = json.dumps(data).encode()

        response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload,
                                 proxies=proxies)

        data = response.json()
        edges = data['data']['search']['stories']['edges']
        edges_len = len(edges)
        dm_video_info = DMVideoInfo(proxies=proxies)
        tancks = []
        for j, edge in enumerate(edges):
            node = edge.get("node", {})
            tancks.append({
                "keyword": kw,
                "v_name": kwdata.get("v_name", ""),
                "v_xid": node.get("xid"),
                "batch": kwdata.get("batch"),
                "rn": kwdata.get("rn"),
                "machine_id": MACHINE_ID,
                "index": (i - 1) * 20 + j + 1,
                "level": 0,
            })

        if edges_len < 20:
            break