diff --git a/oneget.py b/oneget.py index 100278b..5ae2835 100644 --- a/oneget.py +++ b/oneget.py @@ -252,428 +252,430 @@ def parse_args() -> argparse.Namespace: def main(): - kwdata = db.get_web_items() - if not kwdata: - logger.error("没有获取到关键词数据") - time.sleep(30) - return - logger.info(f"搜索关键词数据: {kwdata}") - kwdata = kwdata[0][1] - rn = kwdata['rn'] - proxy_name = proxiesdict.get(rn) - # proxies_str = "http://127.0.0.1:10808" - proxies_str = db.get_proxy(proxy_name, '-1') - proxies = { - 'http': proxies_str, - 'https': proxies_str - } - kw = kwdata['keyword'] - dmheader_manager = DMHeaderManager(proxies=proxies) - dmvideo_info = DMVideoInfo(proxies=proxies) - headers = dmheader_manager.get_headers() - for i in range(1, 11): - data = { - "operationName": "SEARCH_QUERY", - "variables": { - "query": kw, - "shouldIncludeTopResults": True, # 是否包含热门结果 - "shouldIncludeChannels": False, # 是否包含频道 - "shouldIncludePlaylists": False, # 是否包含播放列表 - "shouldIncludeHashtags": False, # 是否包含标签 - "shouldIncludeVideos": False, # 是否包含视频 - "shouldIncludeLives": False, # 是否包含直播 - "page": i, - "limit": 20, - "recaptchaToken": None - }, - "query": """ - fragment VIDEO_BASE_FRAGMENT on Video { - id - xid - title - createdAt - duration - aspectRatio - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename - } - __typename - } - __typename - } - - fragment CHANNEL_BASE_FRAG on Channel { - id - xid - name - displayName - accountType - isFollowed - avatar(height: SQUARE_120) { - id - url - __typename - } - followerEngagement { - id - followDate - __typename - } - metrics { - id - engagement { - id - followers { - edges { - node { - id - total - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - - fragment PLAYLIST_BASE_FRAG on Collection { - id - xid - name - description - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename - } - __typename - } - metrics { - id - engagement { - id - videos(filter: {visibility: {eq: PUBLIC}}) { - edges { - node { - id - total - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - - fragment HASHTAG_BASE_FRAG on Hashtag { - id - xid - name - metrics { - id - engagement { - id - videos { - edges { - node { - id - total - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - __typename - } - - fragment LIVE_BASE_FRAGMENT on Live { - id - xid - title - audienceCount - aspectRatio - isOnAir - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename - } - __typename - } - __typename - } - - query SEARCH_QUERY( - $query: String!, - $shouldIncludeTopResults: Boolean!, - $shouldIncludeVideos: Boolean!, - $shouldIncludeChannels: Boolean!, - $shouldIncludePlaylists: Boolean!, - $shouldIncludeHashtags: Boolean!, - $shouldIncludeLives: Boolean!, - $page: Int, - $limit: Int, - $sortByVideos: SearchVideoSort, - $durationMinVideos: Int, - $durationMaxVideos: Int, - $createdAfterVideos: DateTime, - $recaptchaToken: String - ) { - search(token: $recaptchaToken) { - id - - stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - ...VIDEO_BASE_FRAGMENT - ...CHANNEL_BASE_FRAG - ...PLAYLIST_BASE_FRAG - ...HASHTAG_BASE_FRAG - ...LIVE_BASE_FRAGMENT - __typename - } - __typename - } - __typename - } - - videos( - query: $query, - first: $limit, - page: $page, - sort: $sortByVideos, - durationMin: $durationMinVideos, - durationMax: $durationMaxVideos, - createdAfter: $createdAfterVideos - ) @include(if: $shouldIncludeVideos) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...VIDEO_BASE_FRAGMENT - __typename - } - __typename - } - __typename - } - - lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...LIVE_BASE_FRAGMENT - __typename - } - __typename - } - __typename - } - - channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...CHANNEL_BASE_FRAG - __typename - } - __typename - } - __typename - } - - playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...PLAYLIST_BASE_FRAG - __typename - } - __typename - } - __typename - } - - hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...HASHTAG_BASE_FRAG - __typename - } - __typename - } - __typename - } - - __typename - } - } - """ - } - - payload = json.dumps(data).encode() - - response = req.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload, - proxies=proxies) - - data = response.json() - edges = data['data']['search']['stories']['edges'] - edges_len = len(edges) - logger.info(f"第 {i} 页,关键词: {kw},获取到 {edges_len} 条数据") - tancks = [] - for j, edge in enumerate(edges): - node = edge.get("node", {}) - s_data = { - "keyword": kw, - "v_name": kwdata.get("v_name", ""), - "v_xid": node.get("xid"), - "batch": kwdata.get("batch"), - "rn": kwdata.get("rn"), - "machine_id": MACHINE_ID, - "index": (i - 1) * 20 + j + 1, - "level": 0, - } - tancks.append(s_data) - with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: - executor.map(dmvideo_info.get_video_info, tancks) - if edges_len < 20: - break - - -def run(): - parse_args() while True: - main() - time.sleep(60) + kwdata = db.get_web_items() + if not kwdata: + logger.error("没有获取到关键词数据") + time.sleep(30) + return + logger.info(f"搜索关键词数据: {kwdata}") + kwdata = kwdata[0][1] + rn = kwdata['rn'] + proxy_name = proxiesdict.get(rn) + # proxies_str = "http://127.0.0.1:10808" + proxies_str = db.get_proxy(proxy_name, '-1') + proxies = { + 'http': proxies_str, + 'https': proxies_str + } + kw = kwdata['keyword'] + dmheader_manager = DMHeaderManager(proxies=proxies) + dmvideo_info = DMVideoInfo(proxies=proxies) + headers = dmheader_manager.get_headers() + for i in range(1, 11): + data = { + "operationName": "SEARCH_QUERY", + "variables": { + "query": kw, + "shouldIncludeTopResults": True, # 是否包含热门结果 + "shouldIncludeChannels": False, # 是否包含频道 + "shouldIncludePlaylists": False, # 是否包含播放列表 + "shouldIncludeHashtags": False, # 是否包含标签 + "shouldIncludeVideos": False, # 是否包含视频 + "shouldIncludeLives": False, # 是否包含直播 + "page": i, + "limit": 20, + "recaptchaToken": None + }, + "query": """ + fragment VIDEO_BASE_FRAGMENT on Video { + id + xid + title + createdAt + duration + aspectRatio + thumbnail(height: PORTRAIT_240) { + id + url + __typename + } + creator { + id + xid + name + displayName + accountType + avatar(height: SQUARE_60) { + id + url + __typename + } + __typename + } + __typename + } + + fragment CHANNEL_BASE_FRAG on Channel { + id + xid + name + displayName + accountType + isFollowed + avatar(height: SQUARE_120) { + id + url + __typename + } + followerEngagement { + id + followDate + __typename + } + metrics { + id + engagement { + id + followers { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + + fragment PLAYLIST_BASE_FRAG on Collection { + id + xid + name + description + thumbnail(height: PORTRAIT_240) { + id + url + __typename + } + creator { + id + xid + name + displayName + accountType + avatar(height: SQUARE_60) { + id + url + __typename + } + __typename + } + metrics { + id + engagement { + id + videos(filter: {visibility: {eq: PUBLIC}}) { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + + fragment HASHTAG_BASE_FRAG on Hashtag { + id + xid + name + metrics { + id + engagement { + id + videos { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } + + fragment LIVE_BASE_FRAGMENT on Live { + id + xid + title + audienceCount + aspectRatio + isOnAir + thumbnail(height: PORTRAIT_240) { + id + url + __typename + } + creator { + id + xid + name + displayName + accountType + avatar(height: SQUARE_60) { + id + url + __typename + } + __typename + } + __typename + } + + query SEARCH_QUERY( + $query: String!, + $shouldIncludeTopResults: Boolean!, + $shouldIncludeVideos: Boolean!, + $shouldIncludeChannels: Boolean!, + $shouldIncludePlaylists: Boolean!, + $shouldIncludeHashtags: Boolean!, + $shouldIncludeLives: Boolean!, + $page: Int, + $limit: Int, + $sortByVideos: SearchVideoSort, + $durationMinVideos: Int, + $durationMaxVideos: Int, + $createdAfterVideos: DateTime, + $recaptchaToken: String + ) { + search(token: $recaptchaToken) { + id + + stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + ...VIDEO_BASE_FRAGMENT + ...CHANNEL_BASE_FRAG + ...PLAYLIST_BASE_FRAG + ...HASHTAG_BASE_FRAG + ...LIVE_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + videos( + query: $query, + first: $limit, + page: $page, + sort: $sortByVideos, + durationMin: $durationMinVideos, + durationMax: $durationMaxVideos, + createdAfter: $createdAfterVideos + ) @include(if: $shouldIncludeVideos) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...VIDEO_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...LIVE_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...CHANNEL_BASE_FRAG + __typename + } + __typename + } + __typename + } + + playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...PLAYLIST_BASE_FRAG + __typename + } + __typename + } + __typename + } + + hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...HASHTAG_BASE_FRAG + __typename + } + __typename + } + __typename + } + + __typename + } + } + """ + } + payload = json.dumps(data).encode() + + response = req.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload, + proxies=proxies) + + data = response.json() + edges = data['data']['search']['stories']['edges'] + edges_len = len(edges) + logger.info(f"第 {i} 页,关键词: {kw},获取到 {edges_len} 条数据") + tancks = [] + for j, edge in enumerate(edges): + node = edge.get("node", {}) + s_data = { + "keyword": kw, + "v_name": kwdata.get("v_name", ""), + "v_xid": node.get("xid"), + "batch": kwdata.get("batch"), + "rn": kwdata.get("rn"), + "machine_id": MACHINE_ID, + "index": (i - 1) * 20 + j + 1, + "level": 0, + } + tancks.append(s_data) + with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: + executor.map(dmvideo_info.get_video_info, tancks) + if edges_len < 20: + break + time.sleep(5) + + time.sleep(10) if __name__ == '__main__': - run() + parse_args() + start_time = datetime.now() + logger.info(f"开始时间:{start_time.strftime('%Y-%m-%d %H:%M:%S')}") + main() + + +