Compare commits


1 Commit

22 changed files with 487 additions and 2409 deletions

DB.py (923 changed lines)

File diff suppressed because it is too large.

@@ -1,38 +0,0 @@
from DB import DBVidcon
import requests
from logger import logger
db = DBVidcon()
def check_video_removed(video_id):
url = f"https://api.dailymotion.com/video/{video_id}"
params = {"fields": "published,private,status"}
resp = requests.get(url, params=params, timeout=10)
# 404 -> the video does not exist or has been deleted
if resp.status_code == 404:
return 1
data = resp.json()
# published=False or private=True both count as "taken down"
if not data.get("published", False) or data.get("private", False):
return 1
return 0
def main():
lis = db.getreport_video()
for li in lis:
video_id = li['v_xid']
status = check_video_removed(video_id)
if status == 1:
db.mark_video_removed(li['id'], status)
logger.info(f"视频id {video_id} 下架")
else:
db.mark_video_removed(li['id'], status)
logger.info(f"视频id {video_id} 仍然存在")
if __name__ == '__main__':
main()

View File

@@ -1,6 +1,6 @@
import json, time
import argparse
from DB import DBVidcon
from DB import DBVidcon, DBSA
def parse_args():
parser = argparse.ArgumentParser(
@@ -14,8 +14,10 @@ def main():
args = parse_args()
batch = int(time.time())
db = DBVidcon()
push = None
empty = None
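# warm the Redis video-key cache in 10,000-key chunks before queuing work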
for chunk in DBSA.stream_video_keys(chunk_size=10_000):
db.cache_video_keys_bulk(chunk)
print(f"同步Redis=={len(chunk)}")
if args.level == 0:
push = db.push_l0

View File

@@ -1,40 +0,0 @@
import json, time
import argparse
from DB import DBVidcon
def parse_args():
parser = argparse.ArgumentParser(
description="Dump keyword/title rows into Redis list."
)
parser.add_argument("-l", "--level", type=int, default=99,
help="value for t.level (default: 99)")
return parser.parse_args()
def main():
batch = int(time.time())
db = DBVidcon()
push = db.push_web
empty = db.web_empty
if empty():
rows = db.fetch_keyword_title(level=0)
payload_list = []
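# buffer serialized rows and push them to the Redis list in chunks of 10,000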
for row in rows:
payload_list.append(json.dumps({**row, "batch": batch}, ensure_ascii=False))
if len(payload_list) >= 10000:
push(payload_list)
payload_list.clear()
if payload_list:  # flush the remainder
push(payload_list)
data = {
"level": 0,
"batch": batch,
"count": len(rows),
}
db.log_batch_start(data)
print(f"✔ 推送 {len(rows)}batch={batch})到 {push.__name__}队列完毕")
db.close()
if __name__ == "__main__":
main()

View File

@@ -1,57 +0,0 @@
import requests
from flask import Flask, request, jsonify
from DB import DBVidcon
app = Flask(__name__)
endpoint = "https://api.dailymotion.com/videos"
DEFAULT_PAGE = 1
FIXED_LIMIT = 100
VALID_SORTS = {
'recent', 'relevance', 'alpha', 'alphaaz',
'alphaza', 'most', 'least', 'changed'
}
db = DBVidcon()
@app.route("/get", methods=["GET"])
def get_videos():
keyword = request.args.get("keyword", "").strip()
if not keyword:
return jsonify({"status": "error", "msg": "keyword 参数不能为空"}), 400
# page and country parameters
i = request.args.get("page", DEFAULT_PAGE, type=int)
rn = request.args.get("rn", "US").upper()
# the sort parameter must be one of the allowed values
sort = request.args.get("sort", "relevance").strip().lower()
if sort not in VALID_SORTS:
return jsonify({
"status": "error",
"msg": f"sort 参数非法,可选值: {sorted(VALID_SORTS)}"
}), 400
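# route the upstream request through a region-specific proxy when one is configured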
proxy_string = db.get_proxy(rn)
proxies = {"http": proxy_string, "https": proxy_string} if proxy_string else None
params = {
"search": keyword,
"fields": "id,title,created_time,thumbnail_240_url,duration,"
"owner.id,owner.screenname,likes_total,views_total",
"limit": FIXED_LIMIT,
"page": i,
"sort": sort
}
try:
resp = requests.get(endpoint, params=params, proxies=proxies, timeout=10)
resp.raise_for_status()
jd = resp.json()
return jsonify(jd), 200
except requests.exceptions.RequestException as e:
return jsonify({"status": "error", "msg": str(e)}), 502
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8000, debug=False)

View File

@@ -1,51 +0,0 @@
#!/usr/bin/env python3
# app.py
import requests
from flask import Flask, jsonify, abort
app = Flask(__name__)
def check_video_removed(video_id):
"""
Check via the Dailymotion API whether a video has been deleted or taken down.
Returns:
1  deleted / missing / taken down / set to private
0  publicly available
"""
url = f"https://api.dailymotion.com/video/{video_id}"
params = {"fields": "published,private,status"}
try:
resp = requests.get(url, params=params, timeout=10)
except requests.RequestException as exc:
# return 503 on network errors so the upstream knows to retry
abort(503, description=f"Upstream request failed: {exc}")
# 404 → the video does not exist or has been deleted
if resp.status_code == 404:
return 1
# any other non-2xx status code → pass it through to the client
if resp.status_code // 100 != 2:
abort(resp.status_code, description=resp.text)
data = resp.json()
# published=False or private=True both count as "taken down"
if not data.get("published", False) or data.get("private", False):
return 1
return 0
@app.route("/video/<video_id>", methods=["GET"])
def video_status(video_id):
removed = check_video_removed(video_id)
return jsonify({"video_id": video_id, "removed": removed})
if __name__ == "__main__":
# the listen address can be overridden via the HOST environment variable
import os
host = os.getenv("HOST", "0.0.0.0")
port = 5100
app.run(host=host, port=port, debug=False)

View File

@@ -1,18 +0,0 @@
#!/usr/bin/env bash
# -------- adjust as needed --------
TARGET="/opt/ql/DailyMotion/main.py" # match pattern: the script path is precise enough
SIG="9" # signal; -9 by default, change to 15 for a gentler shutdown
# --------------------------------
pids=$(pgrep -f "$TARGET")
if [ -z "$pids" ]; then
echo "没有发现正在运行的 $TARGET"
exit 0
fi
echo "即将发送 SIG${SIG:-15} 到进程: $pids"
kill "-${SIG:-15}" $pids
echo "完成"

main.py (38 changed lines)

@@ -1,4 +1,3 @@
#!/opt/ql/daily_com/bin/python3
import base64
import traceback
import argparse
@@ -56,15 +55,15 @@ def format_duration(seconds):
return "00:00"
def get_searchInfo(keyword, level, headers, proxy_name, r=2):
def get_searchInfo(keyword, level, rn, proxy_name, r=2):
if r == 2:
logger.info(f"NET处理->{keyword},\trn->{proxy_name},\tlevel->{level}")
video_list = []
max_page = 3
limit = 100
max_page = 2
limit = 30
endpoint = 'https://api.dailymotion.com/videos'
if level == 0 or level == 1:
max_page = 4
max_page = 3
limit = 100
for j in range(1, max_page):
params = {
@@ -88,7 +87,7 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
logger.exception(f"[Requested] 未知:{e}, keyword: {keyword}, l: {level}")
else:
time.sleep((3 - r) * 5)
return get_searchInfo(keyword, level, headers, proxy_name, r - 1)
return get_searchInfo(keyword, level, rn, proxy_name, r - 1)
try:
resinfo = jsondata.get("list")
except Exception:
@@ -98,7 +97,7 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
return None
else:
time.sleep((3 - r) * 5)
return get_searchInfo(keyword, level, headers, proxy_name, r - 1)
return get_searchInfo(keyword, level, rn, proxy_name, r - 1)
for index, iteminfo in enumerate(resinfo):
calculated_index = index + 1 + (j - 1) * limit
xid = iteminfo["id"]
@@ -106,6 +105,10 @@
uxid = iteminfo["owner.id"]
uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
duration = iteminfo.get('duration')
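# mark the video as a repeat when its key is already cached for this region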
is_repeat = 0
if db.video_key_exists(vid.strip(), rn):
is_repeat = 1
if duration <= 300:
continue
v_data = {
@@ -123,9 +126,11 @@
"u_id": uid,
"u_xid": uxid,
"u_name": iteminfo.get('owner.screenname'),
"u_pic": iteminfo.get('owner.avatar_60_url')
"u_pic": iteminfo.get('owner.avatar_60_url'),
"is_repeat": is_repeat,
}
video_list.append(v_data)
time.sleep(3)
if len(video_list) < 100:
break
return video_list
@@ -137,11 +142,11 @@ proxiesdict = db.get_proxy_agent_dict()
def search_worker(payload, kitem, flag):
try:
gproxies = proxiesdict[kitem['rn']]
v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], kitem['rn'], gproxies)
if not v_list:
for i in range(2):
time.sleep(i * 5)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], kitem['rn'], gproxies)
if v_list:
break
time.sleep(2)
@@ -163,11 +168,10 @@ def integrate_data_parallel():
time.sleep(10)
continue
futures = []
for payload, kitem in tasks:
futures.append(executor.submit(search_worker, payload, kitem, flag))
time.sleep(1)
futures = [
executor.submit(search_worker, payload, kitem, flag)
for payload, kitem in tasks
]
rollback = {0: [], 1: [], 2: []}
for fut in concurrent.futures.as_completed(futures):
@@ -178,8 +182,6 @@
continue
for item in v_list:
if not v_list:
continue
DBSA.upsert_video({
"keyword": kitem["keyword"],
"v_name": kitem["v_name"],
@@ -202,6 +204,7 @@
"batch": kitem["batch"],
"machine_id": MACHINE_ID,
"level": kitem["level"],
"is_repeat": item['is_repeat']
})
DBSA.flush()
if rollback[0]:
@@ -210,6 +213,7 @@
db.rollback_l1(rollback[1])
if rollback[2]:
db.rollback_l2(rollback[2])
time.sleep(10)
def parse_args() -> argparse.Namespace:

View File

@ -98,9 +98,9 @@ def fetch_all_data_for_rn(rn: str, batches: list[int]) -> pd.DataFrame:
def export_all():
# batches to process
batches = [1748965168, 1749049335]
batches = [1747324254, 1747323990]
# refresh is_repeat first
# update_is_repeat(batches)
update_is_repeat(batches)
rn_list = get_rn_list()
timestamp = datetime.now().strftime("%Y%m%d")

oneget.py (684 changed lines)

@@ -1,684 +0,0 @@
import argparse
import base64
from datetime import datetime
import concurrent.futures
import requests
import uuid
import random
import time
import copy
from threading import Lock
from DB import DBVidcon, DBSA
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from dateutil import parser as date_parser
MACHINE_ID = 0
db = DBVidcon()
proxiesdict = db.get_proxy_agent_dict()
class RetryRequests:
def __init__(
self,
proxies: dict = None,
timeout: int = 10,
total: int = 3,
backoff_factor: float = 1.0,
status_forcelist: tuple = (500, 502, 503, 504),
allowed_methods: tuple = ("GET", "POST"),
):
self.session = requests.Session()
self.timeout = timeout
self.proxies = proxies
retry = Retry(
total=total,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
allowed_methods=allowed_methods,
raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
def get(self, url, **kwargs):
kwargs.setdefault("timeout", self.timeout)
if self.proxies:
kwargs.setdefault("proxies", self.proxies)
return self.session.get(url, **kwargs)
def post(self, url, **kwargs):
kwargs.setdefault("timeout", self.timeout)
if self.proxies:
kwargs.setdefault("proxies", self.proxies)
return self.session.post(url, **kwargs)
req = RetryRequests()
def clean_dash_to_zero(val):
if val in ('-', '', None):
return 0
try:
return int(val)
except (ValueError, TypeError) as e:
print(f"[字段异常] val = {val}{str(e)}")
return 0
def format_create_time(timestr):
try:
dt = date_parser.isoparse(timestr)
return dt.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
print(f"[时间格式错误] {timestr}{str(e)}")
return "1970-01-01 00:00:00"
def format_duration(seconds):
try:
seconds = int(seconds)
return f"{seconds // 60:02}:{seconds % 60:02}"
except Exception:
return "00:00"
class DMHeaderManager:
_headers_template = {
'Accept': '*/*, */*',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json, application/json',
'Host': 'graphql.api.dailymotion.com',
'Origin': 'https://www.dailymotion.com',
'Referer': 'https://www.dailymotion.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0',
'X-DM-AppInfo-Id': 'com.dailymotion.neon',
'X-DM-AppInfo-Type': 'website',
'X-DM-AppInfo-Version': 'v2025-05-26T13:45:05.666Z',
'X-DM-Neon-SSR': '0',
'X-DM-Preferred-Country': 'tw',
'accept-language': 'zh-CN',
'authorization': '',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'x-dm-visit-id': '',
'x-dm-visitor-id': '',
}
_user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
]
def __init__(self, proxies: dict = None):
self._headers_cache = None
self._cache_lock = Lock()
self._proxies = proxies
def get_headers(self, retry: int = 2) -> dict:
visitor_id = str(uuid.uuid4())
visit_id = str(int(time.time() * 1000))
traffic_segment = str(random.randint(100_000, 999_999))
ua = random.choice(self._user_agents)
token_headers = {
'Accept': '*/*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'https://www.dailymotion.com',
'Pragma': 'no-cache',
'Referer': 'https://www.dailymotion.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': ua,
'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
data = {
'client_id': 'f1a362d288c1b98099c7',
'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
'grant_type': 'client_credentials',
'traffic_segment': traffic_segment,
'visitor_id': visitor_id,
}
response = req.post(
'https://graphql.api.dailymotion.com/oauth/token',
headers=token_headers,
data=data,
proxies=self._proxies,
timeout=10
)
response.raise_for_status()
token = response.json()['access_token']
new_headers = copy.deepcopy(self._headers_template)
new_headers['authorization'] = f'Bearer {token}'
new_headers['x-dm-visit-id'] = visit_id
new_headers['x-dm-visitor-id'] = visitor_id
new_headers['User-Agent'] = ua
with self._cache_lock:
self._headers_cache = copy.deepcopy(new_headers)
return new_headers
class DMVideoInfo:
def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5):
self.proxies = proxies
self.max_retries = max_retries
self.backoff_factor = backoff_factor
def get_video_info(self, data: dict) -> dict:
v_xid = data.get('v_xid')
url = f'https://api.dailymotion.com/video/{v_xid}'
params = {
'fields': 'id,title,created_time,thumbnail_240_url,duration,'
'owner.id,owner.screenname,likes_total,views_total,'
'owner.avatar_60_url,owner.followers_total,owner.videos_total'
}
try:
resp = req.get(url, params=params, timeout=10)
resp.raise_for_status()
r_data = resp.json()
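# reconstruct base64 ids ("Video:<xid>", "Channel:<owner id>") in the format Dailymotion's GraphQL API appears to use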
xid = r_data["id"]
vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8')
uxid = r_data["owner.id"]
uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
duration = r_data.get("duration", 0)
if duration < 30:
return None
data["v_id"] = vid
data["title"] = r_data.get("title", "")
data["link"] = "https://www.dailymotion.com/video/" + xid
data["duration"] = format_duration(r_data.get("duration", 0))
data['create_time'] = datetime.fromtimestamp(r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S")
data['fans'] = clean_dash_to_zero(r_data.get("owner.followers_total", 0))
data['videos'] = clean_dash_to_zero(r_data.get("owner.videos_total", 0))
data['watch_number'] = clean_dash_to_zero(r_data.get("views_total", 0))
data['cover_pic'] = r_data.get('thumbnail_240_url')
data['u_id'] = uid
data['u_xid'] = uxid
data['u_name'] = r_data.get("owner.screenname", "")
data['u_pic'] = r_data.get("owner.avatar_60_url", "")
DBSA.upsert_video(data)
DBSA.flush()
except requests.RequestException as e:
print(f"[ERROR] 请求失败 vxid={v_xid} : {e}")
return None
def parse_args() -> argparse.Namespace:
global MACHINE_ID
parser = argparse.ArgumentParser(
description="Configure worker settings."
)
parser.add_argument(
"-m", "--machine-id",
type=int,
help=f"Machine identifier (default: {MACHINE_ID})"
)
args = parser.parse_args()
if args.machine_id is not None:
MACHINE_ID = args.machine_id
if MACHINE_ID is None or MACHINE_ID == 0:
raise ValueError("请指定机器编号")
return args
def main():
while True:
kwdata = db.get_web_items()
if not kwdata:
print("没有获取到关键词数据")
time.sleep(30)
continue
print(f"搜索关键词数据: {kwdata}")
kwdata = kwdata[0][1]
rn = kwdata['rn']
proxy_name = proxiesdict.get(rn)
# proxies_str = "http://127.0.0.1:10808"
proxies_str = db.get_proxy(proxy_name, '-1')
proxies = {
'http': proxies_str,
'https': proxies_str
}
kw = kwdata['keyword']
dmheader_manager = DMHeaderManager(proxies=proxies)
dmvideo_info = DMVideoInfo(proxies=proxies)
headers = dmheader_manager.get_headers()
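# page through up to 10 pages of top-result stories, 20 entries per page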
for i in range(1, 11):
data = {
"operationName": "SEARCH_QUERY",
"variables": {
"query": kw,
"shouldIncludeTopResults": True, # 是否包含热门结果
"shouldIncludeChannels": False, # 是否包含频道
"shouldIncludePlaylists": False, # 是否包含播放列表
"shouldIncludeHashtags": False, # 是否包含标签
"shouldIncludeVideos": False, # 是否包含视频
"shouldIncludeLives": False, # 是否包含直播
"page": i,
"limit": 20,
"recaptchaToken": None
},
"query": """
fragment VIDEO_BASE_FRAGMENT on Video {
id
xid
title
createdAt
duration
aspectRatio
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
__typename
}
fragment CHANNEL_BASE_FRAG on Channel {
id
xid
name
displayName
accountType
isFollowed
avatar(height: SQUARE_120) {
id
url
__typename
}
followerEngagement {
id
followDate
__typename
}
metrics {
id
engagement {
id
followers {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
fragment PLAYLIST_BASE_FRAG on Collection {
id
xid
name
description
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
metrics {
id
engagement {
id
videos(filter: {visibility: {eq: PUBLIC}}) {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
fragment HASHTAG_BASE_FRAG on Hashtag {
id
xid
name
metrics {
id
engagement {
id
videos {
edges {
node {
id
total
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
__typename
}
fragment LIVE_BASE_FRAGMENT on Live {
id
xid
title
audienceCount
aspectRatio
isOnAir
thumbnail(height: PORTRAIT_240) {
id
url
__typename
}
creator {
id
xid
name
displayName
accountType
avatar(height: SQUARE_60) {
id
url
__typename
}
__typename
}
__typename
}
query SEARCH_QUERY(
$query: String!,
$shouldIncludeTopResults: Boolean!,
$shouldIncludeVideos: Boolean!,
$shouldIncludeChannels: Boolean!,
$shouldIncludePlaylists: Boolean!,
$shouldIncludeHashtags: Boolean!,
$shouldIncludeLives: Boolean!,
$page: Int,
$limit: Int,
$sortByVideos: SearchVideoSort,
$durationMinVideos: Int,
$durationMaxVideos: Int,
$createdAfterVideos: DateTime,
$recaptchaToken: String
) {
search(token: $recaptchaToken) {
id
stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
...VIDEO_BASE_FRAGMENT
...CHANNEL_BASE_FRAG
...PLAYLIST_BASE_FRAG
...HASHTAG_BASE_FRAG
...LIVE_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
videos(
query: $query,
first: $limit,
page: $page,
sort: $sortByVideos,
durationMin: $durationMinVideos,
durationMax: $durationMaxVideos,
createdAfter: $createdAfterVideos
) @include(if: $shouldIncludeVideos) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...VIDEO_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...LIVE_BASE_FRAGMENT
__typename
}
__typename
}
__typename
}
channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...CHANNEL_BASE_FRAG
__typename
}
__typename
}
__typename
}
playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...PLAYLIST_BASE_FRAG
__typename
}
__typename
}
__typename
}
hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) {
metadata {
id
algorithm {
uuid
__typename
}
__typename
}
pageInfo {
hasNextPage
nextPage
__typename
}
edges {
node {
id
...HASHTAG_BASE_FRAG
__typename
}
__typename
}
__typename
}
__typename
}
}
"""
}
payload = json.dumps(data).encode()
response = req.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload,
proxies=proxies)
data = response.json()
try:
edges = data['data']['search']['stories']['edges']
except (TypeError, KeyError):
print("stories is None or has an unexpected structure; skipping")
break
edges_len = len(edges)
print(f"{i} 页,关键词: {kw},获取到 {edges_len} 条数据")
tasks = []
for j, edge in enumerate(edges):
node = edge.get("node", {})
s_data = {
"keyword": kw,
"v_name": kwdata.get("v_name", ""),
"v_xid": node.get("xid"),
"batch": kwdata.get("batch"),
"rn": kwdata.get("rn"),
"machine_id": MACHINE_ID,
"index": (i - 1) * 20 + j + 1,
"level": 0,
}
tasks.append(s_data)
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
executor.map(dmvideo_info.get_video_info, tasks)
if edges_len < 20:
break
time.sleep(10)
time.sleep(20)
if __name__ == '__main__':
parse_args()
start_time = datetime.now()
print(f"开始时间:{start_time.strftime('%Y-%m-%d %H:%M:%S')}")
main()

onoe.py (29 changed lines)

@@ -34,7 +34,19 @@ UserAgent = [
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.3.4000 Chrome/47.0.2526.73 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)']
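# indexed one-to-one with the UserAgent list above; the v="0" entries presumably pair with non-Chromium UAs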
sec_ch_ua_list = [
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="136", "Not;A=Brand";v="24", "Google Chrome";v="136"',
'"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
'"Chromium";v="53", "Not;A=Brand";v="24", "Google Chrome";v="53"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"',
'"Chromium";v="47", "Not;A=Brand";v="24", "Google Chrome";v="47"',
'"Chromium";v="0", "Not;A=Brand";v="24", "Google Chrome";v="0"'
]
def get_part_ids(part_num: int, take: int, offset: int = 0):
part_ids = list(range(offset, offset + take))
@@ -221,14 +233,17 @@ def gettoken(proxy, r=2):
}
try:
proxy_str = db.get_proxy(proxy)
logger.info(f"[代理] => {proxy_str}")
url = 'https://graphql.api.dailymotion.com/oauth/token'
response = requests.post(url, headers=headers, data=data, proxies={"http": proxy_str, "https": proxy_str})
token = response.json()['access_token']
copy_headers = copy.deepcopy(headers1)
uaidx = random.randint(0, len(UserAgent) - 1)
copy_headers['authorization'] = "Bearer " + token
copy_headers['x-dm-visit-id'] = str(int(time.time() * 1000))
copy_headers['x-dm-visitor-id'] = uuid_with_dash
copy_headers['User-Agent'] = UserAgent[random.randint(0, len(UserAgent) - 1)]
copy_headers['User-Agent'] = UserAgent[uaidx]
copy_headers['sec-ch-ua'] = sec_ch_ua_list[uaidx]
copy_headers['X-DM-Preferred-Country'] = proxy.lower()
with _cache_lock:
_headers_cache = copy_headers
@@ -267,18 +282,18 @@
payload = {
"clientKey": "CAP-A76C932D4C6CCB3CA748F77FDC07D996",
"task": {
"type": "ReCaptchaV3Task",
"type": "ReCaptchaV3TaskProxyLess",
"websiteURL": f"https://www.dailymotion.com/search/{encoded_query}/top-results",
"websiteKey": "6LeOJBIrAAAAAPMIjyYvo-eN_9W1HDOkrEqHR8tM",
"pageAction": "___grecaptcha_cfg.clients['100000']['L']['L']['promise-callback'](gRecaptchaResponse)",
"pageAction": "search",
"minScore": 0.5
}
}
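# the ProxyLess task type has CapSolver solve from its own IPs; pageAction presumably must match the site's reCAPTCHA action ("search")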
resp = requests.post(create_url, json=payload, headers=headers, timeout=30)
resp = requests.post(create_url, data=json.dumps(payload), headers=headers, timeout=30)
logger.info(f"[token] 发送 payload:{payload}")
resp.raise_for_status()
task_id = resp.json()["taskId"]
logger.info(f"task_id: {task_id}")
logger.info(f"task_id: {resp.text}")
# poll for the result
check_payload = {"clientKey": "CAP-A76C932D4C6CCB3CA748F77FDC07D996", "taskId": task_id}
for i in range(max_poll_attempts):
@@ -287,7 +302,7 @@
result = r.json()
logger.info(f"{i}次,task_id:{task_id},结果:{result}")
if result.get("status") == "ready":
return result["solution"]["token"]
return result["solution"]["gRecaptchaResponse"]
time.sleep(polling_interval)
raise TimeoutError(f"任务 {task_id} 在轮询 {max_poll_attempts} 次后未完成")

Binary file not shown.

Binary file not shown.

View File

@@ -1,56 +0,0 @@
import requests
url = "https://api.siliconflow.cn/v1/chat/completions"
kw = "朝雪录"
rn = "US"
payload = {
"model": "Qwen/Qwen3-14B",
"max_tokens": 512,
"enable_thinking": True,
"thinking_budget": 4096,
"min_p": 0.05,
"temperature": 0.7,
"top_p": 0.7,
"top_k": 50,
"frequency_penalty": 0.5,
"n": 1,
"stream": False,
"stop": [],
"messages": [
{
"role": "user",
"content": """你是一个视频搜索优化助手。用户给你一个中文视频标题或关键词,请你翻译并联想出 10 个适合用于英文视频网站(如 Dailymotion搜索的关键词结果用英文逗号分隔输出仅返回关键词列表不加说明。
示例输入朝雪录
示例输出Coroner's Diary,Coroners Diary, Coroners Diary episode,Coroners Diary season 1,Coroners Diary full episode,coroners diary
"""
},
{
"role": "user",
"content": f"请推理:{kw} 并输出 10 个地区缩写为{rn}的适合用于视频网站搜索的关键词,地区缩写不在关键词内,。"
}
]
}
headers = {
"Authorization": "Bearer sk-isvydeloxqhoiwoiojleghdsuhagryjbxzphfhxneevxeoeh",
"Content-Type": "application/json"
}
response = requests.post(url, json=payload, headers=headers, timeout=30)
def parse_keywords_from_response(resp_json):
try:
# pull out the text content
content = resp_json["choices"][0]["message"]["content"]
# split on ASCII commas
keywords = [kw.strip() for kw in content.split(",") if kw.strip()]
return keywords
except Exception as e:
print("解析失败:", e)
return []
kws = parse_keywords_from_response(response.json())
print(kws)
print(len(kws))

View File

@@ -1,19 +0,0 @@
import json
from DB import DBVidcon
payload_list = []
db = DBVidcon()
rows = db.get_report_video()
push = db.push_report
# =======================
for row in rows:
payload_list.append(json.dumps({**row}, ensure_ascii=False))
if len(payload_list) >= 10000:
push(payload_list)
payload_list.clear()
if payload_list:  # flush the remainder
push(payload_list)
db.close()

report.py (124 changed lines)

@@ -1,124 +0,0 @@
import argparse
import json
import time
from DB import DBVidcon, DBSA
from report_video import DailymotionClient
from logger import logger
import requests
MACHINE_ID = None
IsSubsequent = False
def parse_args() -> argparse.Namespace:
global MACHINE_ID, IsSubsequent
parser = argparse.ArgumentParser(
description="Configure worker settings."
)
parser.add_argument(
"-m", "--machine-id",
type=int,
help=f"Machine identifier (default: {MACHINE_ID})"
)
parser.add_argument(
"-s", "--IsSubsequent",
type=int,
help=f"Maximum concurrent workers (default: {IsSubsequent})"
)
args = parser.parse_args()
if args.machine_id is not None:
MACHINE_ID = args.machine_id
if args.IsSubsequent is not None:
if args.IsSubsequent <= 0:
IsSubsequent = False
else:
IsSubsequent = True
if MACHINE_ID is None:
raise ValueError("请指定机器编号")
return args
parse_args()
def get_public_ip():
try:
response = requests.get("https://api.ipify.org?format=json", timeout=5)
return response.json().get("ip")
except requests.RequestException as e:
print("获取失败:", e)
return None
ip = get_public_ip()
logger.info(f"当前机器IP: {ip}, 机器编号: {MACHINE_ID}, 是否后续处理: {IsSubsequent}")
db = DBVidcon()
account = db.get_account_info(MACHINE_ID)
d = DailymotionClient(email=account['account'], password=account['password'])
k = {
"open": 1,
"solved": 2,
"awaiting your reply": 3,
}
last_main_run = 0
last_subsequent_run = 0
MAIN_INTERVAL = 60 * 60 # run the main pass once per hour
SUBSEQUENT_INTERVAL = 30 * 60 # run the follow-up pass every 30 minutes
# d.test()
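# two independent timers: the main reporting pass runs hourly, the follow-up pass every 30 minutes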
while True:
now = int(time.time())
# main reporting pass
if now - last_main_run >= MAIN_INTERVAL:
last_main_run = now
re_list = []
idss = []
lis = db.item_report(100)
if len(lis) > 0:
for li in lis:
item = json.loads(li[0])
re_list.append(item)
idss.append(item['id'])
logger.info(f"name:{item['name_title']},link:{item['link']} ")
try:
ids, info, report_id, status, report_ts = d.process_ticket(re_list)
subsequent_status = k.get(status, 1)
db.update_fight_record_status(
ids, report_id, 2, f"http://{ip}:5000/image/{info}",
report_ts, subsequent_status, MACHINE_ID
)
db.flush()
except Exception as e:
logger.error(f"ID:{re_list[0]['id']}, end id{re_list[-1]['id']}, e:{e}")
db.update_fight_record_status(idss, 0, 3, str(e), mid=MACHINE_ID)
time.sleep(60) # back off after an error
if now - last_subsequent_run >= SUBSEQUENT_INTERVAL and IsSubsequent:
last_subsequent_run = now
subsequent_list = db.get_subsequent_report_video(MACHINE_ID)
if len(subsequent_list) > 0:
for li in subsequent_list:
subsequent_status = 0
r_id = li['report_id']
logger.info(f"subsequent report_id:{r_id} ")
# try:
subsequent_status, info = d.report_follow_up(r_id)
db.update_subsequent_status_by_report_id(
r_id, subsequent_status, f"http://{ip}:5000/image/{info}"
)
# except Exception as e:
# logger.logger.error(f"ID:{rs_id}, e:{e}")
# db.update_subsequent_status_by_id(rs_id, 1, str(e))
time.sleep(5) # avoid hammering the service
time.sleep(5)

View File

@@ -1,417 +0,0 @@
import time
import functools
import os
import re
from datetime import datetime
from sys import platform
import requests
from logger import logger
from playwright.sync_api import (
sync_playwright,
TimeoutError as PlaywrightTimeoutError,
Page,
Browser,
)
def solve_turnstile_capsolver(page: Page,
timeout: int = 120) -> bool:
"""
Use CapSolver to complete the Cloudflare Turnstile on the current Page.
Returns True on success, False on failure or timeout.
"""
cap_key = "CAP-A76C932D4C6CCB3CA748F77FDC07D996"
widget = page.query_selector("div.cf-turnstile[data-sitekey]")
if not widget:
return False
sitekey = widget.get_attribute("data-sitekey")
page_url = page.url
create_payload = {
"clientKey": cap_key,
"task": {
"type": "TurnstileTaskProxyLess",
"websiteURL": page_url,
"websiteKey": sitekey
}
}
create_resp = requests.post(
"https://api.capsolver.com/createTask",
json=create_payload, timeout=20
).json()
if create_resp.get("errorId"):
print("[CapSolver] createTask 失败:", create_resp)
return False
task_id = create_resp["taskId"]
poll_payload = {"clientKey": cap_key, "taskId": task_id}
token = None
elapsed, step = 0, 3
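# poll CapSolver every 3 s until the token is ready or the timeout elapses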
while elapsed < timeout:
time.sleep(step)
elapsed += step
res = requests.post(
"https://api.capsolver.com/getTaskResult",
json=poll_payload, timeout=15
).json()
if res.get("status") == "ready":
token = res["solution"]["token"]
break
if res.get("status") != "processing":
print("[CapSolver] getTaskResult 异常:", res)
return False
if not token:
print("[CapSolver] 超时未取到 token")
return False
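# inject the solved token into the hidden Turnstile textarea and fire the page callback if present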
page.evaluate(
"""(tk) => {
const ta = document.querySelector('textarea[name="cf-turnstile-response"]');
if (ta) ta.value = tk;
if (window.turnstileCallback)
try { window.turnstileCallback(tk); } catch(e){}
}""",
token
)
page.wait_for_timeout(1500)
return True
def require_login(func):
@functools.wraps(func)
def wrapper(self, *args, **kwargs):
self.ensure_login()
return func(self, *args, **kwargs)
return wrapper
class DailymotionClient:
url = "https://faq.dailymotion.com/hc/en-us/requests/new"
EMAIL = "copyright@qiyi.com"
PASSWORD = "ppsIQIYI2018@"
def __init__(self,email, password, headless: bool = None):
self.email = email
self.password = password
self.headless = headless
self.check_interval = 60 * 60
if self.headless is None:
self.headless = platform == "linux" or platform == "linux2"
if self.headless:
proxy = None
self.file_path = "/opt/ql/DailyMotion/oss/LOA.pdf"
self.file_path2 = "/opt/ql/DailyMotion/oss/BAZTSJT.pdf"
else:
proxy={'server': 'http://127.0.0.1:7890'}
self.file_path = "./oss/LOA.pdf"
self.file_path2 = "./oss/BAZTSJT.pdf"
logger.info(f"Launching DailymotionClient with headless={self.headless}, proxy={proxy}")
self._pw = sync_playwright().start()
self.browser: Browser = self._pw.chromium.launch(
headless=self.headless,
proxy=proxy,
)
self.context = self.browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36",
locale="en-US",
viewport={"width": 1280, "height": 800},
timezone_id="Asia/Shanghai",
permissions=[],
)
self.context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
self.page: Page = self.context.new_page()
self._last_check_ts = 0
self._last_check_result = False
os.makedirs('screenshots', exist_ok=True)
self.page.goto(self.url)
def _do_login(self) -> None:
self.page.goto(self.url, timeout=30000)
# self.page.wait_for_load_state("networkidle", timeout=30000)
self.page.wait_for_timeout(3000)
file_path = f'screenshots/{str(int(time.time()))}.png'
self.page.screenshot(path=file_path)
if self.page.query_selector("div.cf-turnstile[data-sitekey]"):
ok = solve_turnstile_capsolver(self.page)
if not ok:
raise RuntimeError("CapSolver 处理 Turnstile 失败")
logbtn = self.page.locator("//a[@class='login button']")
if logbtn.count() > 0:
logbtn.nth(0).click()
self.page.wait_for_selector("//input[@data-testid=\"emailInput\"]")
# "I understand" popup
i_now_btn = self.page.locator("button:has-text(\"I understand\")")
if i_now_btn.count() > 0:
i_now_btn.click()
# fill in the account credentials
email_edit = self.page.locator("//input[@data-testid=\"emailInput\"]")
password_edit = self.page.locator("//input[@data-testid=\"passwordInput\"]")
if email_edit.count():
email_edit.fill(self.email)
if password_edit.count():
password_edit.fill(self.password)
# log in
login_btn = self.page.locator('button[form="signin-form"][type="submit"]')
try:
self.page.wait_for_selector(
'button[form="signin-form"][type="submit"]:not([disabled])', timeout=20000
)
except PlaywrightTimeoutError:
pass
login_btn.click()
# wait for the redirect back
self.page.wait_for_url(self.url, timeout=30000)
time.sleep(1)
self._last_check_ts = time.time()
self._last_check_result = True
def _detect_login(self) -> bool:
self.page.goto(self.url, timeout=30000)
self.page.wait_for_timeout(3000)
return self.page.locator("//a[@class='login button']").count() == 0
def is_logged_in(self) -> bool:
now = time.time()
if now - self._last_check_ts < self.check_interval:
return self._last_check_result
try:
ok = self._detect_login()
except Exception:
ok = False
self._last_check_ts = now
self._last_check_result = ok
return ok
def ensure_login(self) -> None:
if not self.is_logged_in():
self._do_login()
@require_login
def process_ticket(self, lis: list):
titles = "\r\n"
links = ""
ids= []
title = ""
link = ""
assignment = True
for li in lis:
if assignment:
title = li['name_title']
link = li['link']
assignment = False
ids.append(li['id'])
titles += li['name_title'] + ",\r\n"
links += li['link'] + ",\r\n"
logger.info(f"Processing ticket for title: {titles}, link: {links}")
self.page.goto(self.url, timeout=30000)
titles_list = [title.strip() for title in titles.split(',')]
unique_titles = list(set(titles_list))
unique_titles.sort()
titles =",".join(unique_titles) # 去重
description = """We request that you take immediate actionto stop the infringing activity, take steps to ensure that iQIYI Content is notre-posted on, re-linked to, or otherwise available through your site. Pleaseinform us of the actions you have taken and their results.
1) please help remove these videos
2) The drama series titles are {}
""".format(titles)
# likls = ["\"" + l + "\"" for l in link]
# links = ','.join(likls)
if self.page.query_selector("div.cf-turnstile[data-sitekey]"):
ok = solve_turnstile_capsolver(self.page)
if not ok:
raise RuntimeError("CapSolver 处理 Turnstile 失败")
# file_path = f'screenshots/{str(int(time.time()))}_{title}_{link.split("/")[-1]}.png'
# self.page.screenshot(path=file_path)
resports = self.page.locator('li.blocks-item:nth-child(8)')
resports.click()
time.sleep(2)
cc = self.page.locator("input#request_collaborators_")
cc.scroll_into_view_if_needed()
cc.click()
cc.type("duke.chen@dailymotion.com")
self.page.get_by_role("button", name="Copyright infringement").click()
time.sleep(1)
self.page.get_by_role("button", name="Notification").nth(0).click()
time.sleep(1)
self.page.get_by_role("button", name="A legal entity").click()
time.sleep(1)
self.page.get_by_label("Corporate name").fill("Beijing iQIYI Science & Technology Co.,Ltd")
time.sleep(1)
self.page.get_by_label("Legal status").fill("Legal Department")
time.sleep(1)
self.page.get_by_label("Subject").fill("Copyright infringement Notification")
time.sleep(1)
self.page.get_by_label("Please indicate the URL of the video(s) you would like to report*").fill(links)
time.sleep(1)
self.page.get_by_label("Description").nth(1).fill(description)
time.sleep(1)
self.page.get_by_label("I state in good faith", exact=False).check()
time.sleep(1)
self.page.get_by_label("I state in good faith that the use of the Protected", exact=False).check()
time.sleep(1)
self.page.get_by_role("checkbox", name="I certify that all information provided", exact=False).check()
time.sleep(1)
self.page.get_by_role("checkbox", name="I acknowledge that my statements", exact=False).check()
time.sleep(1)
self.page.get_by_role("checkbox", name="The data provided through this form", exact=False).check()
time.sleep(1)
self.page.get_by_role("checkbox", name="By submitting the present form,", exact=False).check()
time.sleep(1)
self.page.get_by_role("textbox", name="electronic signature", exact=False).fill("柴达") # 占位
time.sleep(1)
self.page.set_input_files('input#request-attachments', [
self.file_path,
self.file_path2
])
self.page.wait_for_timeout(8000)
self.page.get_by_role("button", name="Submit").click()
time.sleep(2)
file_path = f'screenshots/{str(int(time.time()))}_{title}_{link.split("/")[-1]}.png'
locator = self.page.locator("//dt[normalize-space(.)='Id']/following-sibling::dd[1]")
raw_text = locator.text_content()
match = re.search(r'\d+', raw_text or '')
report_id = match.group() if match else None
status_raw = self.page.locator("span.status-label").text_content()
subsequent_status = status_raw.strip().lower() if status_raw else None
time_elem = self.page.locator("dt", has_text="Created").locator("xpath=following-sibling::dd[1]/time")
datetime_str = time_elem.get_attribute("datetime") # e.g. 2025-06-12T06:15:33+00:00
if datetime_str:
dt = datetime.fromisoformat(datetime_str.replace("Z", "+00:00")) # normalize the ISO timestamp before parsing
timestamp = int(dt.timestamp())
else:
timestamp = None
self.page.screenshot(path=file_path)
if self.page.url != self.url:
self.page.goto(self.url, timeout=30000)
return ids, file_path, report_id, subsequent_status, timestamp
@require_login
def report_follow_up(self, report_id: str):
max_retries = 3
retry_delay = 2
loaded = False
subsequent_status = ""
for attempt in range(max_retries):
try:
self.page.goto(f"https://faq.dailymotion.com/hc/en-us/requests/{report_id}", timeout=30000)
# self.page.wait_for_load_state("networkidle")  # ensure the page has settled
self.page.wait_for_selector("span.status-label", timeout=30000)
try:
status_raw = self.page.locator("span.status-label").text_content()
except Exception as e:
print(f"[警告] 获取状态标签失败: {e}")
status_raw = None
subsequent_status = status_raw.strip().lower() if status_raw else None
loaded = True
break
except Exception as e:
print(f"[ERROR] 尝试 {attempt + 1}/{max_retries} 失败: {e}")
if attempt < max_retries - 1:
time.sleep(retry_delay)
if not loaded:
return 1, "页面加载失败"
txt = (
"I am the authorized agent of Beijing iQIYI Technology Co., Ltd., responsible for dealing with "
"unauthorized overseas distribution of pirated videos of our works. "
"We have confirmed that the above links contain infringing content and we insist on requesting to takedown. Thank you!"
)
if "awaiting your reply" in subsequent_status:
span_show = self.page.locator('span.comment-show-container-content')
if span_show.count() > 0:
span_show.nth(0).click()
self.page.wait_for_timeout(1000)
textarea = self.page.locator('#request_comment_body')
textarea.type(txt, delay=30)
self.page.wait_for_timeout(1000)
self.page.get_by_role("button", name="Submit").click()
success = self.wait_for_selector_safe("span.status-label", timeout=30000, retries=3)
if not success:
return 1, "提交后未检测到状态更新"
span_show = self.page.locator('span.comment-show-container-content')
if span_show.count() > 0:
span_show.nth(0).click()
pic_path = f'screenshots/{str(int(time.time()))}_{report_id}.png'
self.page.screenshot(path=pic_path)
return 0, pic_path
elif "open" in subsequent_status:
return 1, ""
elif "solved" in subsequent_status:
return 2, ""
return 0, "未知状态"
def wait_for_selector_safe(self, selector: str, timeout=30000, retries=3, retry_delay=2):
for i in range(retries):
try:
self.page.wait_for_selector(selector, timeout=timeout)
return True
except Exception as e:
print(f"[重试] 第 {i + 1}/{retries} 次等待 {selector} 失败: {e}")
if i < retries - 1:
time.sleep(retry_delay)
return False
@require_login
def test(self):
logger.info(f"Testing DailymotionClient with email: {self.email}")
self.page.goto(self.url, timeout=30000)
file_path = f'screenshots/{str(int(time.time()))}_test.png'
self.page.screenshot(path=file_path)
self.page.wait_for_timeout(1000)
file_path = f"screenshots/{str(int(time.time()))}_test2.png"
self.page.screenshot(path=file_path)
logger.info(f"Test screenshot saved to {file_path}")
self.page.wait_for_timeout(1000)
file_path = f"screenshots/{str(int(time.time()))}_test3.png"
self.page.screenshot(path=file_path)
logger.info(f"Test screenshot saved to {file_path}")
def close(self):
try:
self.page.close()
except Exception:
pass
try:
self.browser.close()
except Exception:
pass
try:
self._pw.stop()
except Exception:
pass
if __name__ == "__main__":
dm = DailymotionClient("copyright@qiyi.com", "ppsIQIYI2018@")
# dm.process_ticket("恋爱学园","https://www.dailymotion.com/video/x9lfr24")
dm.report_follow_up("13566")

View File

@@ -1,66 +1,10 @@
import time
import requests
import json
import redis
import requests
import urllib3
from matplotlib.artist import allow_rasterization
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Optional, Dict, Any, Union
class HttpClient:
def __init__(self, max_retries: int = 3, backoff_factor: float = 0.5):
self.session = requests.Session()
# configure the retry policy
retry_strategy = Retry(
total=max_retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504, 429]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
def request(self,
method: str,
url: str,
headers: Optional[Dict] = None,
params: Optional[Dict] = None,
data: Optional[Union[Dict, str]] = None,
cookies: Optional[Dict] = None,
allow_redirects: bool = True,
timeout: int = 30,
**kwargs) -> requests.Response:
try:
response = self.session.request(
method=method,
url=url,
headers=headers,
params=params,
data=data,
cookies=cookies,
allow_redirects=allow_redirects,
timeout=timeout,
**kwargs
)
response.raise_for_status()
return response
except requests.exceptions.RequestException as e:
print(f"请求失败: {url}, 错误: {str(e)}")
raise
def get(self, url: str, **kwargs) -> requests.Response:
return self.request("GET", url, **kwargs)
def post(self, url: str, **kwargs) -> requests.Response:
return self.request("POST", url, **kwargs)
# global HTTP client instance
http_client = HttpClient()
session = requests.Session()
_REDIS_CONF = {
"host": "192.144.230.75",
"port": 6379,
@@ -88,197 +32,151 @@ def get_report_token(key_name: str):
def login():
try:
headers = {
"Accept": "*/*",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/x-www-form-urlencoded",
"Origin": "https://www.dailymotion.com",
"Pragma": "no-cache",
"Referer": "https://www.dailymotion.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Microsoft Edge\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
}
url = "https://graphql.api.dailymotion.com/oauth/token"
data = {
"client_id": "f1a362d288c1b98099c7",
"client_secret": "eea605b96e01c796ff369935357eca920c5da4c5",
"grant_type": "password",
"username": "copyright@qiyi.com",
"password": "ppsIQIYI2018@",
"scope": "userinfo,email,manage_subscriptions,manage_history,manage_likes,manage_playlists,manage_videos",
"version": "2",
"traffic_segment": "962042",
"visitor_id": "359703fb-66c2-43d2-bd0d-b1cac9c7ae8a"
}
response = http_client.post(url, headers=headers, data=data)
data = {
"update_time": int(time.time()),
"username": "copyright@qiyi.com",
"password": "ppsIQIYI2018@",
"token": response.json()
}
save_report_token('token', data)
return data
except Exception as e:
print(f"登录失败: {str(e)}")
raise
def refresh_token(access_token, refresh_token):
headers = {
"Accept": "*/*",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Length": "0",
"Content-Type": "application/x-www-form-urlencoded",
"Origin": "https://www.dailymotion.com",
"Pragma": "no-cache",
"Referer": "https://www.dailymotion.com/signin?urlback=%2Fzendesk%3Ftimestamp%3D1748932650%26return_to%3Dhttps%253A%252F%252Ffaq.dailymotion.com%252Fhc%252Fen-us%252Frequests%252Fnew",
"Referer": "https://www.dailymotion.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-Site": "same-site",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Microsoft Edge\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
}
url = "https://graphql.api.dailymotion.com/oauth/token"
data = {
"client_id": "f1a362d288c1b98099c7",
"client_secret": "eea605b96e01c796ff369935357eca920c5da4c5",
"grant_type": "password",
"username": "copyright@qiyi.com",
"password": "ppsIQIYI2018@",
"scope": "userinfo,email,manage_subscriptions,manage_history,manage_likes,manage_playlists,manage_videos",
"version": "2",
"traffic_segment": "962042",
"visitor_id": "359703fb-66c2-43d2-bd0d-b1cac9c7ae8a"
}
response = session.post(url, headers=headers, data=data)
data = {
"update_time": int(time.time()),
"username": "copyright@qiyi.com",
"password": "ppsIQIYI2018@",
"token": response.json()
}
save_report_token('token', data)
return data
def get_cookies(access_token: str, refresh_token: str):
cookies = {
"dmvk": "683e982c34e34",
"ts": "133696",
"v1st": "a847389a-6b91-4157-948f-457666f7172b",
"ff": "on",
"lang": "zh_CN",
"usprivacy": "1---",
"dmaid": "73ca37e4-6858-46c1-aac4-a4a5fc9a270e",
"cookie_policy_closed": "1",
"access_token": access_token,
"refresh_token": refresh_token,
}
url = "https://www.dailymotion.com/cookie/refresh_token"
response = http_client.post(url, headers=headers, cookies=cookies)
session.post(url, cookies=cookies, allow_redirects=True)
def zendesk():
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Referer": "https://www.dailymotion.com/sg",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Microsoft Edge\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
def get_cookies1(access_token: str, refresh_token: str):
"""302 跳转"""
cookies = {
"access_token": access_token,
"refresh_token": refresh_token,
}
url = "https://www.dailymotion.com/zendesk"
params = {
"return_to": "https://faq.dailymotion.com/hc/en-us/requests/new",
"timestamp": str(time.time()),
"timestamp": str(int(time.time())),
}
response = http_client.get(url, headers=headers, params=params, allow_redirects=True)
data = http_client.session.cookies.get_dict()
data['update_time'] = int(time.time())
save_report_token('cookies', data)
session.get(url, cookies=cookies, params=params, allow_redirects=True)
cookies_dict = {"update_time": int(time.time()), "cookies": session.cookies.get_dict()}
save_report_token('cookies', cookies_dict)
return cookies_dict
def get_csrftoken():
try:
url = "https://faq.dailymotion.com/hc/api/internal/csrf_token.json"
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Referer": "https://www.dailymotion.com/sg",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Microsoft Edge\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\""
}
response = http_client.get(url, headers=headers)
data = {"update_time": int(time.time()), "csrf_token": response.json()}
save_report_token('csrf_token', data)
return data
except Exception as e:
print(f"获取 CSRF token 失败: {str(e)}")
raise
url = "https://faq.dailymotion.com/hc/api/internal/csrf_token.json"
response = session.get(url)
data = {"update_time": int(time.time()), "csrf_token": response.json()}
save_report_token('csrf_token', data)
return data
def report(csrf_token: str, v_url, title):
try:
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
"content-type": "application/x-www-form-urlencoded",
"origin": "https://faq.dailymotion.com",
"pragma": "no-cache",
"priority": "u=0, i",
"referer": "https://faq.dailymotion.com/hc/en-us/requests/new?ticket_form_id=136048",
"sec-ch-ua": "\"Chromium\";v=\"136\", \"Microsoft Edge\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0"
}
url = "https://faq.dailymotion.com/hc/en-us/requests"
data = {
"utf8": "",
"authenticity_token": csrf_token,
"request%5Bticket_form_id%5D": "136048",
"request%5Bcollaborators%5D%5B%5D": "duke.chen@dailymotion.com",
"request%5Bcustom_fields%5D%5B360008684839%5D": "__dc.copyright_user_protection_-_copyright__",
"request%5Bcustom_fields%5D%5B30150188%5D": "copyrightform-notification",
"request%5Bcustom_fields%5D%5B25089567%5D": "legal_entity",
"request%5Bcustom_fields%5D%5B25159868%5D": "Beijing iQIYI Science & Technology Co.,Ltd",
"request%5Bcustom_fields%5D%5B4869133282962%5D": "Legal Department",
"request%5Bsubject%5D": "Copyright infringement Notification",
"request%5Bcustom_fields%5D%5B25613698%5D": v_url,
"request%5Bdescription%5D": f"We request that you take immediate actionto stop the infringing activity, take steps to ensure that iQIYI Content is notre-posted on, re-linked to, or otherwise available through your site. Pleaseinform us of the actions you have taken and their results.\r\n1) please help remove these videos\r\n2) The drama series titles are \"{title}\"\r\n",
"request%5Bdescription_mimetype%5D": "text/plain",
"request%5Bcustom_fields%5D%5B4769880845586%5D": "on",
"request%5Bcustom_fields%5D%5B25626417%5D": "on",
"request%5Bcustom_fields%5D%5B4769797363346%5D": "on",
"request%5Bcustom_fields%5D%5B25159848%5D": "on",
"request%5Bcustom_fields%5D%5B4769658191250%5D": "on"
}
response = requests.post(url, headers=headers, data=data)
print(response.status_code)
print(response.text)
print(response)
return response.status_code == 200
except Exception as e:
print(f"提交报告失败: {str(e)}")
raise
def report(csrf_token:str, cookies:dict, ):
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'no-cache',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://faq.dailymotion.com',
'pragma': 'no-cache',
'priority': 'u=0, i',
'referer': 'https://faq.dailymotion.com/hc/en-us/requests/new?ticket_form_id=136048',
'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
}
data = {
'utf8': '',
'authenticity_token': csrf_token,
'request[ticket_form_id]': '136048',
'request[collaborators][]': 'duke.chen@dailymotion.com',
'request[custom_fields][360008684839]': '__dc.copyright_user_protection_-_copyright__',
'request[custom_fields][30150188]': 'copyrightform-notification',
'request[custom_fields][25089567]': 'legal_entity',
'request[custom_fields][25159868]': 'Beijing iQIYI Science & Technology Co.,Ltd',
'request[custom_fields][4869133282962]': 'Legal Department',
'request[subject]': 'Copyright infringement Notification',
'request[custom_fields][25613698]': 'url',
'request[description]': 'We request that you take immediate action to stop the infringing activity, take steps to ensure that iQIYI Content is not re-posted on, re-linked to, or otherwise available through your site. Please inform us of the actions you have taken and their results.\r\n1) please help remove these videos\r\n2) The drama series titles are <series title>\r\n',
'request[description_mimetype]': 'text/plain',
'request[custom_fields][4769880845586]': [
'off',
'on',
],
'request[custom_fields][25626417]': [
'off',
'on',
],
'request[custom_fields][4769797363346]': [
'off',
'on',
],
'request[custom_fields][25159848]': [
'off',
'on',
],
'request[custom_fields][4769658191250]': [
'off',
'on',
],
}
response = requests.post('https://faq.dailymotion.com/hc/en-us/requests', cookies=cookies, headers=headers, data=data)
def prepare_data():
token = get_report_token('token')
cookies = get_report_token('cookies')
csrf_token = get_report_token('csrf_token')
max_update_time = max(d.get('update_time', 0) for d in (token, cookies, csrf_token) if d)
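# refresh credentials when the newest of token/cookies/csrf_token is older than 24 hours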
if max_update_time + (24 * 60 * 60) < time.time():
token = get_report_token('token')
access_token = token['token']['access_token']
refresh_token = token['token']['refresh_token']
get_cookies(access_token, refresh_token)
get_cookies1(access_token, refresh_token)
csrf_token = get_csrftoken()
if __name__ == '__main__':
cookies = get_report_token('cookies')['cookies']
http_client.session.cookies = requests.utils.cookiejar_from_dict(cookies)
csrf_token = get_csrftoken()['csrf_token']['current_session']['csrf_token']
report(csrf_token, 'Hunter X Hunter', 'https://www.dailymotion.com/video/x8kjx7v')
report(csrf_token['csrf_token']['current_session']['csrf_token'], cookies['cookies'])

View File

@@ -4,7 +4,8 @@ charset-normalizer==3.4.2
et-xmlfile==1.1.0
idna==3.10
importlib-metadata==6.7.0
numpy==2.3.0
lxml==5.4.0
numpy==1.21.6
openpyxl==3.1.3
pandas==1.3.5
pkg_resources==0.0.0

View File

@@ -1,35 +0,0 @@
from flask import Flask, send_file, abort, request, jsonify
from pathlib import Path
app = Flask(__name__)
PROJECT_ROOT = Path(__file__).parent.resolve()
SCREENSHOTS_DIR = Path("/opt/ql/daily_com/bin/screenshots").resolve()
@app.route('/image/screenshots/<path:filename>')
def serve_image(filename):
file_path = SCREENSHOTS_DIR / filename
# prevent path traversal outside the screenshots directory
try:
file_path.resolve().relative_to(SCREENSHOTS_DIR.resolve())
except ValueError:
abort(403, description=f"禁止访问目录外文件: {file_path.resolve()}")
if not file_path.exists():
abort(404, description=f"文件不存在: {file_path.resolve()}")
return send_file(file_path, as_attachment=False)
# custom 404 error response
@app.errorhandler(404)
def handle_404(e):
return f"404 错误:{e.description}", 404
# custom 403 error response
@app.errorhandler(403)
def handle_403(e):
return f"403 错误:{e.description}", 403
if __name__ == '__main__':
app.run(host='0.0.0.0', debug=False, port=5000)

View File

@@ -1,6 +0,0 @@
from DB import DBVidcon
db = DBVidcon()
account = db.get_account_info('4')
print(account)

View File

@@ -1,10 +0,0 @@
from DB import DBVidcon
from logger import logger
db = DBVidcon()
logger.info("开始更新视频举报状态")
db.update_video_ts_status()
db.close()
logger.info("更改视频举报状态完成")