feat: 添加视频信息处理和时间格式化功能

This commit is contained in:
晓丰 2025-07-17 14:21:15 +08:00
parent e9ef87fe62
commit fe96e23cc2

832
oneget.py
View File

@ -1,3 +1,6 @@
import base64
from datetime import datetime
import requests import requests
import uuid import uuid
import random import random
@ -7,11 +10,39 @@ from threading import Lock
import logging import logging
from DB import DBVidcon from DB import DBVidcon
import json import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
MACHINE_ID = 3
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
db = DBVidcon() db = DBVidcon()
proxiesdict = db.get_proxy_agent_dict() proxiesdict = db.get_proxy_agent_dict()
def clean_dash_to_zero(val):
    """Normalize a scraped counter value to an int.

    The site renders missing metrics as '-' or an empty string; those
    placeholders (and None) map to 0. Unparseable values are logged and
    also collapse to 0 so downstream arithmetic never sees bad data.
    """
    placeholders = ('-', '', None)
    if val in placeholders:
        return 0
    try:
        result = int(val)
    except (ValueError, TypeError) as e:
        logger.exception(f"[字段异常] val = {val}{str(e)}")
        return 0
    return result
def format_create_time(timestr):
    """Parse an ISO-8601 timestamp into a 'YYYY-MM-DD HH:MM:SS' string.

    Always returns a well-formed datetime string; any parse failure is
    logged and falls back to the Unix epoch string.
    """
    try:
        # BUG FIX: the original called date_parser.isoparse(), but
        # `date_parser` (dateutil) is never imported in this file, so every
        # call raised NameError, was swallowed by the broad except, and
        # returned the epoch fallback. Use the stdlib parser instead;
        # 'Z' is normalized for pre-3.11 fromisoformat compatibility.
        dt = datetime.fromisoformat(timestr.replace("Z", "+00:00"))
        return dt.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        logger.exception(f"[时间格式错误] {timestr}{str(e)}")
        return "1970-01-01 00:00:00"
def format_duration(seconds):
    """Render a duration in seconds as an 'MM:SS' string.

    Non-numeric input (None, '', garbage) falls back to "00:00".
    Durations of an hour or more keep accumulating in the minutes field
    (e.g. 3900 -> "65:00"), matching the original MM:SS convention.
    """
    try:
        seconds = int(seconds)
    except Exception:
        return "00:00"
    # BUG FIX: negative values previously produced nonsense like "-1:55"
    # (floor division); treat them as data errors and clamp to zero.
    if seconds < 0:
        return "00:00"
    return f"{seconds // 60:02}:{seconds % 60:02}"
class DMHeaderManager: class DMHeaderManager:
_headers_template = { _headers_template = {
@ -124,400 +155,471 @@ class DMHeaderManager:
return new_headers return new_headers
kwdata = db.get_web_items()
if not kwdata:
logger.error("没有获取到关键词数据")
exit(1)
class DMVideoInfo:
    """Fetch per-video metadata from the Dailymotion REST API.

    Wraps a requests.Session configured with retry/backoff and optional
    proxies, and enriches a crawl-item dict in place with the video
    fields the pipeline stores.
    """

    def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5):
        self.proxies = proxies
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.session = self._create_session()

    def _create_session(self):
        """Build a session that retries GETs on connect/read errors and 5xx."""
        session = requests.Session()
        retry = Retry(
            total=self.max_retries,
            connect=self.max_retries,
            read=self.max_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["GET"]
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        if self.proxies:
            session.proxies.update(self.proxies)
        return session

    def get_video_info(self, data: dict) -> dict:
        """Populate *data* with metadata for the video data['v_xid'].

        Returns the enriched dict on success, or None when the request
        fails or the response is missing expected fields.
        """
        v_xid = data.get('v_xid')
        url = f'https://api.dailymotion.com/video/{v_xid}'
        params = {
            'fields': 'id,title,created_time,thumbnail_240_url,duration,'
                      'owner.id,owner.screenname,likes_total,views_total,'
                      'owner.avatar_60_url,owner.followers_total,owner.videos_total'
        }
        try:
            resp = self.session.get(url, params=params, timeout=10)
            resp.raise_for_status()
            r_data = resp.json()
            xid = r_data["id"]
            vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8')
            uxid = r_data["owner.id"]
            uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
            data["v_id"] = vid
            data["v_title"] = r_data["title"]
            # BUG FIX: trailing commas in the original made "link" and
            # "createdtime" 1-tuples instead of strings.
            data["link"] = "https://www.dailymotion.com/video/" + xid
            data["duration"] = r_data["duration"]
            data['createdtime'] = datetime.fromtimestamp(
                r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S")
            # BUG FIX: the original had a stray `data['']` (a truncated
            # assignment) that raised KeyError on every success; store the
            # computed owner ids, which were previously built and dropped.
            # NOTE(review): key names assumed from the v_* convention —
            # confirm against the DB writer.
            data["u_id"] = uid
            data["u_xid"] = uxid
            # BUG FIX: the original never returned on success, so callers
            # always received None.
            return data
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            # KeyError/TypeError/ValueError cover malformed API payloads
            # (missing fields, created_time=None) that previously escaped
            # the RequestException-only handler and crashed the worker.
            print(f"[ERROR] 请求失败 vxid={v_xid} : {e}")
            return None
displayName
accountType
avatar(height: SQUARE_60) {
id def main():
url kwdata = db.get_web_items()
__typename if not kwdata:
logger.error("没有获取到关键词数据")
exit(1)
kwdata = kwdata[0][1]
rn = kwdata['rn']
proxy_name = proxiesdict.get(rn)
proxies_str = db.get_proxy(proxy_name, '-1')
proxies = {
'http': proxies_str,
'https': proxies_str
} }
__typename kw = kwdata['keyword']
}
__typename
}
fragment CHANNEL_BASE_FRAG on Channel { dmheader_manager = DMHeaderManager(proxies=proxies)
id
xid headers = dmheader_manager.get_headers()
name for i in range(1, 11):
displayName data = {
accountType "operationName": "SEARCH_QUERY",
isFollowed "variables": {
avatar(height: SQUARE_120) { "query": kw,
id "shouldIncludeTopResults": True, # 是否包含热门结果
url "shouldIncludeChannels": False, # 是否包含频道
__typename "shouldIncludePlaylists": False, # 是否包含播放列表
} "shouldIncludeHashtags": False, # 是否包含标签
followerEngagement { "shouldIncludeVideos": False, # 是否包含视频
id "shouldIncludeLives": False, # 是否包含直播
followDate "page": i,
__typename "limit": 20,
} "recaptchaToken": None
metrics { },
id "query": """
engagement { fragment VIDEO_BASE_FRAGMENT on Video {
id id
followers { xid
edges { title
node { createdAt
duration
aspectRatio
thumbnail(height: PORTRAIT_240) {
id id
total url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename __typename
} }
__typename __typename
} }
__typename
} fragment CHANNEL_BASE_FRAG on Channel {
__typename id
} xid
__typename name
} displayName
__typename accountType
} isFollowed
avatar(height: SQUARE_120) {
fragment PLAYLIST_BASE_FRAG on Collection {
id
xid
name
description
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
metrics {
id
engagement {
id
videos(filter: {visibility: {eq: PUBLIC}}) {
edges {
node {
id id
total url
__typename
}
followerEngagement {
id
followDate
__typename
}
metrics {
id
engagement {
id
followers {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename __typename
} }
__typename __typename
} }
__typename
} fragment PLAYLIST_BASE_FRAG on Collection {
__typename id
} xid
__typename name
} description
__typename thumbnail(height: PORTRAIT_240) {
}
fragment HASHTAG_BASE_FRAG on Hashtag {
id
xid
name
metrics {
id
engagement {
id
videos {
edges {
node {
id id
total url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
metrics {
id
engagement {
id
videos(filter: {visibility: {eq: PUBLIC}}) {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename __typename
} }
__typename __typename
} }
__typename
} fragment HASHTAG_BASE_FRAG on Hashtag {
__typename
}
__typename
}
__typename
}
fragment LIVE_BASE_FRAGMENT on Live {
id
xid
title
audienceCount
aspectRatio
isOnAir
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
__typename
}
query SEARCH_QUERY(
$query: String!,
$shouldIncludeTopResults: Boolean!,
$shouldIncludeVideos: Boolean!,
$shouldIncludeChannels: Boolean!,
$shouldIncludePlaylists: Boolean!,
$shouldIncludeHashtags: Boolean!,
$shouldIncludeLives: Boolean!,
$page: Int,
$limit: Int,
$sortByVideos: SearchVideoSort,
$durationMinVideos: Int,
$durationMaxVideos: Int,
$createdAfterVideos: DateTime,
$recaptchaToken: String
) {
search(token: $recaptchaToken) {
id
stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
...VIDEO_BASE_FRAGMENT
...CHANNEL_BASE_FRAG
...PLAYLIST_BASE_FRAG
...HASHTAG_BASE_FRAG
...LIVE_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
videos(
query: $query,
first: $limit,
page: $page,
sort: $sortByVideos,
durationMin: $durationMinVideos,
durationMax: $durationMaxVideos,
createdAfter: $createdAfterVideos
) @include(if: $shouldIncludeVideos) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id id
...VIDEO_BASE_FRAGMENT xid
name
metrics {
id
engagement {
id
videos {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
__typename __typename
} }
__typename
} fragment LIVE_BASE_FRAGMENT on Live {
__typename
}
lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id id
...LIVE_BASE_FRAGMENT xid
title
audienceCount
aspectRatio
isOnAir
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
__typename __typename
} }
__typename
} query SEARCH_QUERY(
__typename $query: String!,
} $shouldIncludeTopResults: Boolean!,
$shouldIncludeVideos: Boolean!,
channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) { $shouldIncludeChannels: Boolean!,
metadata { $shouldIncludePlaylists: Boolean!,
id $shouldIncludeHashtags: Boolean!,
algorithm { $shouldIncludeLives: Boolean!,
uuid $page: Int,
__typename $limit: Int,
$sortByVideos: SearchVideoSort,
$durationMinVideos: Int,
$durationMaxVideos: Int,
$createdAfterVideos: DateTime,
$recaptchaToken: String
) {
search(token: $recaptchaToken) {
id
stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
...VIDEO_BASE_FRAGMENT
...CHANNEL_BASE_FRAG
...PLAYLIST_BASE_FRAG
...HASHTAG_BASE_FRAG
...LIVE_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
videos(
query: $query,
first: $limit,
page: $page,
sort: $sortByVideos,
durationMin: $durationMinVideos,
durationMax: $durationMaxVideos,
createdAfter: $createdAfterVideos
) @include(if: $shouldIncludeVideos) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...VIDEO_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...LIVE_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...CHANNEL_BASE_FRAG
__typename
}
__typename
}
__typename
}
playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...PLAYLIST_BASE_FRAG
__typename
}
__typename
}
__typename
}
hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...HASHTAG_BASE_FRAG
__typename
}
__typename
}
__typename
}
__typename
}
} }
__typename """
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...CHANNEL_BASE_FRAG
__typename
} }
__typename
}
__typename
}
playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) { payload = json.dumps(data).encode()
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...PLAYLIST_BASE_FRAG
__typename
}
__typename
}
__typename
}
hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) { response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload,
metadata { proxies=proxies)
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...HASHTAG_BASE_FRAG
__typename
}
__typename
}
__typename
}
__typename data = response.json()
} edges = data['data']['search']['stories']['edges']
} edges_len = len(edges)
""" dm_video_info = DMVideoInfo(proxies=proxies)
} tancks = []
for j, edge in enumerate(edges):
node = edge.get("node", {})
tancks.append({
"keyword": kw,
"v_name": kwdata.get("v_name", ""),
"v_xid": node.get("xid"),
"batch": kwdata.get("batch"),
"rn": kwdata.get("rn"),
"machine_id": MACHINE_ID,
"index": (i - 1) * 20 + j + 1,
"level": 0,
})
payload = json.dumps(data).encode() if edges_len < 20:
break
response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload,
proxies=proxies)
data = response.json()
edges = data['data']['search']['stories']['edges']
for i, edge in enumerate(edges):
print(i, edge['node']['xid'])