yellowDog
2024-07-15 8a437260d3df96bffd888a56b3a7d5573b6cc458
feat:实现短链转长链功能
2 files modified
1 files added
132 ■■■■■ changed files
config/dy_get_data.py 26 ●●●● patch | view | raw | blame | history
store/douyin/__init__.py 3 ●●●● patch | view | raw | blame | history
tools/dy_transform_video_id.py 103 ●●●●● patch | view | raw | blame | history
config/dy_get_data.py
@@ -1,8 +1,9 @@
import asyncio
import os
import toml
from sqlalchemy import create_engine, Column, String, BigInteger
from sqlalchemy.orm import sessionmaker, declarative_base
from tools import dy_transform_video_id
Base = declarative_base()
current_dir = os.path.dirname(__file__)
@@ -22,17 +23,28 @@
# Create the database engine from the MySQL settings held in mysql_config.
# NOTE(review): user/password are interpolated into the URL verbatim; values
# containing characters such as '@' or '/' would need URL-escaping — confirm.
engine = create_engine(
    f'mysql+pymysql://{mysql_config.get("user")}:{mysql_config.get("password")}@{mysql_config.get("host")}:{mysql_config.get("port")}/{mysql_config.get("database")}'
)
Session = sessionmaker(bind=engine)
# 创建一个会话
session = Session()
def get_dy_video_id():
    """Query the stored video ids and resolve them through the short-link transformer.

    Reads every ``Aweme`` row, collects the ``aweme_id`` values and passes them
    to ``dy_transform_video_id.transform``.

    Returns:
        list: the transformed values on success; an empty list when the
        database query fails; the untransformed ``aweme_id`` values when the
        transformation step fails (best effort).
    """
    # Use a session scoped to this call so it is always closed, even on error.
    session = Session()
    try:
        aweme_list = [aweme.aweme_id for aweme in session.query(Aweme).all()]
    except Exception as e:
        print(f"Error occurred while fetching data from the database: {e}")
        return []
    finally:
        session.close()

    # Nothing to resolve: skip starting the transformer for an empty table.
    if not aweme_list:
        return aweme_list

    try:
        # asyncio.run() starts its own event loop, so this function must be
        # called from synchronous code (it raises if a loop is already running).
        aweme_list = asyncio.run(dy_transform_video_id.transform(aweme_list))
    except Exception as e:
        print(f"Error occurred while transforming video ids: {e}")
    return aweme_list
store/douyin/__init__.py
@@ -87,6 +87,7 @@
    save_comment_item = {
        'platform': 'DY',
        "tenant_id": '1',
        "source": "挖掘",
        "comment_id": comment_id,
        "comment_datetime": datetime.datetime.fromtimestamp(comment_item.get("create_time")).strftime("%Y-%m-%d %H:%M:%S"),
        'home_url': f'https://www.douyin.com/user/{comment_item.get("user").get("sec_uid")}',
@@ -108,7 +109,7 @@
        "url": f"https://www.douyin.com/video/{aweme_id}"
        # "parent_comment_id": parent_comment_id
    }
    print(save_comment_item)
    # print(save_comment_item)
    utils.logger.info(
        f"[store.douyin.update_dy_aweme_comment] douyin aweme comment: , content: {save_comment_item.get('content')}")
tools/dy_transform_video_id.py
New file
@@ -0,0 +1,103 @@
import asyncio
import re
from typing import Optional
from playwright.async_api import async_playwright, BrowserContext
# Maximum number of concurrent worker tasks started by transform().
max_count = 8
async def get_video_id(
    short_url: str,
    context: "BrowserContext",
    timeout: float = 30.0,
    poll_interval: float = 0.1,
) -> Optional[str]:
    """Resolve a Douyin short link to the id of the video it redirects to.

    Opens ``short_url`` in a new page of ``context`` and polls the page URL
    until it looks like ``https://www.douyin.com/video/<digits>``.

    Args:
        short_url: The link to open.
        context: Browser context used to create the temporary page.
        timeout: Maximum number of seconds to wait, after the initial
            navigation returns, for the page URL to become a video URL.
        poll_interval: Seconds to sleep between URL checks.

    Returns:
        The digits captured from the video URL, or ``None`` when the page
        never reached a video URL within ``timeout``.
    """
    page = await context.new_page()
    try:
        await page.goto(short_url)
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        while True:
            match = re.search(r"https://www\.douyin\.com/video/(\d+)", page.url)
            if match:
                return match.group(1)
            if loop.time() >= deadline:
                return None
            # Yield to the event loop: without an await here the loop would
            # spin forever and no other task (or page update) could run.
            await asyncio.sleep(poll_interval)
    finally:
        # Always release the page, including when navigation raises.
        await page.close()
async def worker(context, task_queue, results):
    """Consume short links from ``task_queue`` until a ``None`` sentinel arrives.

    Each link is resolved with ``get_video_id`` and the outcome is appended to
    ``results``. A link whose resolution raises is reported and recorded as
    ``None`` so that one bad link cannot stop the worker.

    Args:
        context: Browser context handed through to ``get_video_id``.
        task_queue: ``asyncio.Queue`` of short links; ``None`` means "stop".
        results: List that receives one entry per processed link, in the
            order the links finish.
    """
    while True:
        short_url = await task_queue.get()
        try:
            if short_url is None:
                break
            try:
                video_url = await get_video_id(short_url, context)
            except Exception as e:
                print(f"Error occurred while resolving {short_url!r}: {e}")
                video_url = None
            results.append(video_url)
        finally:
            # Every get() is matched by task_done(), even on failure or
            # cancellation; otherwise task_queue.join() would wait forever.
            task_queue.task_done()
async def transform(short_urls: list):
    """Resolve a list of short links concurrently and return the results.

    Launches a Chromium browser (with a visible window, ``headless=False``),
    distributes the links to at most ``max_count`` ``worker`` tasks through a
    queue, prints every result and returns them.

    Args:
        short_urls: Links to resolve.

    Returns:
        list: One entry per input link, in completion order (not input
        order). An empty input returns ``[]`` without starting a browser.
    """
    if not short_urls:
        return []

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            task_queue = asyncio.Queue()
            for url in short_urls:
                task_queue.put_nowait(url)

            results = []
            # No point starting more workers than there are links.
            worker_count = min(max_count, len(short_urls))
            workers = [
                asyncio.create_task(worker(context, task_queue, results))
                for _ in range(worker_count)
            ]

            await task_queue.join()
            # One stop sentinel per worker so each of them leaves its loop.
            for _ in workers:
                task_queue.put_nowait(None)
            await asyncio.gather(*workers)
        finally:
            # Close the browser even when something above raised.
            await browser.close()

        for url in results:
            print(url)
        return results
if __name__ == '__main__':
    # Manual run: resolve a fixed sample of share texts. The sample is the
    # same ten-entry sequence repeated four times (40 entries in total).
    share_texts = (
        ("https://v.douyin.com/i6Eah69H/ c@N.Wm 06/13 BgB:/ ", 3),
        ("https://v.douyin.com/i6E51g7r/ LWM:/ A@T.yT 07/01 ", 2),
        ("https://v.douyin.com/i6E5kPtv/ HIi:/ 11/27 q@R.KJ ", 1),
        ("https://v.douyin.com/i6EPYWnt/ 09/05 CHv:/ T@Y.zT ", 3),
        ("https://v.douyin.com/i6EPfUEb/ LWM:/ A@T.yT 07/01 ", 1),
    )
    one_round = [text for text, repeat in share_texts for _ in range(repeat)]
    url_list = one_round * 4
    asyncio.run(transform(url_list))