New file |
| | |
| | | # 将用户主页url短链转成长链,存入matrix-marketing.aijuke_grab_user |
| | | |
| | | import asyncio |
| | | import re |
| | | from typing import Optional |
| | | |
| | | from playwright.async_api import async_playwright, BrowserContext |
| | | |
| | | from dao import grad_user_dao |
| | | from model.model import GrabUser |
| | | |
# Number of worker tasks that transform() starts; each worker resolves one URL
# at a time via get_sec_id(), so this also caps how many pages are open at once.
| | | max_count = 8 |
| | | |
| | | |
async def get_sec_id(
    short_url: str,
    context: BrowserContext,
    *,
    poll_interval: float = 0.2,
    max_wait: float = 15.0,
) -> Optional[str]:
    """Open ``short_url`` in a new page and return the ``/user/<id>`` segment of the final URL.

    The page URL is polled after navigation because the short link may reach
    the profile URL through further redirects after ``goto`` returns.

    Args:
        short_url: The short link to open.
        context: Browser context used to create the (temporary) page.
        poll_interval: Seconds to sleep between URL checks. Sleeping yields to
            the event loop so Playwright can deliver navigation updates; a
            loop without an ``await`` would block every other task.
        max_wait: Maximum number of seconds to keep polling before giving up.

    Returns:
        The path segment that follows ``/user/`` in the page URL, or ``None``
        when the page lands on ``https://www.douyin.com/?recommend=1``
        (presumably the site's fallback for links it cannot resolve -- confirm)
        or when no ``/user/`` URL shows up within ``max_wait`` seconds.

    Raises:
        Any error raised by ``page.reload()`` (or by ``page.goto()`` other than
        the 3-second timeout) propagates to the caller; the page is still closed.
    """
    page = await context.new_page()
    try:
        try:
            # Give the initial navigation 3 seconds, then retry once via reload.
            await asyncio.wait_for(page.goto(short_url), 3)
        except asyncio.TimeoutError:
            print("Timeout")
            await page.reload()

        loop = asyncio.get_running_loop()
        deadline = loop.time() + max_wait
        while True:
            current_url = page.url
            if current_url == 'https://www.douyin.com/?recommend=1':
                return None
            match = re.search(r'/user/([^/?]+)', current_url)
            if match:
                return match.group(1)
            if loop.time() >= deadline:
                print("Gave up waiting for a /user/ URL:", short_url)
                return None
            await asyncio.sleep(poll_interval)
    finally:
        # Always release the page, including on errors and early returns.
        await page.close()
| | | |
| | | |
async def worker(context, task_queue, results):
    """Consume queue items until a ``None`` sentinel arrives, resolving each one.

    For every item the worker resolves ``item.user_home_page_url`` with
    ``get_sec_id``, appends ``{'url': ..., 'video_id': <sec_id>}`` to
    ``results`` and stores the id through ``grad_user_dao.update_post_info``.

    Args:
        context: Browser context handed to ``get_sec_id``.
        task_queue: ``asyncio.Queue`` of objects exposing ``id`` and
            ``user_home_page_url``; ``None`` tells the worker to stop.
        results: List shared between workers; one dict is appended per
            successfully processed item.

    A failure on one item is reported and skipped. ``task_done()`` is called
    for every item taken from the queue -- including failed ones and the
    sentinel -- so a caller awaiting ``task_queue.join()`` can never be left
    hanging by an exception raised here.
    """
    while True:
        grad_user: GrabUser = await task_queue.get()
        try:
            if grad_user is None:
                break
            sec_id = await get_sec_id(grad_user.user_home_page_url, context)
            results.append({'url': grad_user.user_home_page_url, 'video_id': sec_id})
            print({'url': grad_user.id, 'sec_id': sec_id})
            # NOTE(review): sec_id may be None here; confirm the DAO is meant to
            # receive None for links that could not be resolved.
            grad_user_dao.update_post_info(GrabUser(id=grad_user.id, sec_id=sec_id))
        except Exception as exc:
            # Keep the worker alive so the remaining queue items still get processed.
            print('worker failed on item', getattr(grad_user, 'id', None), repr(exc))
        finally:
            task_queue.task_done()
| | | |
| | | |
async def transform(short_urls: list):
    """Resolve every queued item concurrently and return the collected results.

    Args:
        short_urls: Items to process. Despite the name, each element is passed
            to ``worker`` unchanged, which reads ``.id`` and
            ``.user_home_page_url`` from it.

    Returns:
        The list that the workers appended to (one dict per processed item).
        Empty input returns ``[]`` without launching a browser.

    Up to ``max_count`` workers share one browser context. The stop sentinels
    are queued right behind the real items (the queue is FIFO, so every item
    is taken before any sentinel) and the workers are awaited directly, so
    completion does not depend on ``task_done()`` bookkeeping: a worker that
    dies can no longer deadlock this function. The browser is closed even
    when something raises.
    """
    pending = list(short_urls or ())
    if not pending:
        return []

    results = []
    worker_count = min(max_count, len(pending))

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False)
        try:
            context = await browser.new_context()

            task_queue = asyncio.Queue()
            for item in pending:
                task_queue.put_nowait(item)
            for _ in range(worker_count):
                task_queue.put_nowait(None)

            workers = [
                asyncio.create_task(worker(context, task_queue, results))
                for _ in range(worker_count)
            ]
            # return_exceptions=True: let the surviving workers finish the queue
            # instead of aborting everything on the first failure.
            outcomes = await asyncio.gather(*workers, return_exceptions=True)
            for outcome in outcomes:
                if isinstance(outcome, BaseException):
                    print('worker crashed:', repr(outcome))
        finally:
            await browser.close()

    for entry in results:
        print(entry)
    return results
| | | |
| | | |
if __name__ == '__main__':
    # Fetch the records that still need processing from the DAO.
    pending_users = grad_user_dao.get_post_info()
    pending_total = len(pending_users)
    # Message reads: "pending records found: <n> items".
    print('查询到待处理数据:', pending_total, '条')

    # Sample row recorded from an earlier run (ids 17-28 all had this shape):
    #   {'id': 17, 'aweme_id': None,
    #    'short_url': 'https://v.douyin.com/i6osyERT/ qEH:/ 07/26 N@j.cN '}
    # NOTE(review): these dict keys differ from the `.id` /
    # `.user_home_page_url` attributes that worker() reads -- confirm whether
    # this sample is still current.
    asyncio.run(transform(pending_users))