New file |
| | |
| | | import json |
| | | import traceback |
| | | import urllib |
| | | from typing import List |
| | | |
| | | import scrapy |
| | | from scrapy.http import Response |
| | | |
| | | from aijuke_spider.config.db_config import matrix_session |
| | | from aijuke_spider.items import CommentGot, V |
| | | from aijuke_spider.spiders.douyin.utils import gen_abogus |
| | | from aijuke_spider.spiders.kuaishou.utils import RandomUserAgent |
| | | |
| | | |
class KuaiShouVUserProfileSpider(scrapy.Spider):
    """Fetch Kuaishou profile data for stored ``V`` rows that have no nickname.

    For every row returned by :meth:`get_v_user`, the spider POSTs the
    ``visionProfile`` GraphQL query to ``{base_url}/graphql`` and yields a dict
    with the returned ``userProfile`` payload and the originating row.

    Spider arguments:
        tenant_id: Optional. When given, only rows whose ``tenant_id`` equals
            this value are queried.
    """

    platform = "KS"
    name = "kuaishou_v_user_profile_spider"
    allowed_domains = ["kuaishou.com"]
    base_url = "https://www.kuaishou.com"
    # Header set sent with every GraphQL request.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        'Accept-Encoding': "gzip, deflate, br, zstd",
        'Content-Type': "application/json",
        'sec-ch-ua': "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Microsoft Edge\";v=\"126\"",
        'sec-ch-ua-mobile': "?0",
        'sec-ch-ua-platform': "\"Windows\"",
        'Origin': "https://www.kuaishou.com",
        'Sec-Fetch-Site': "same-origin",
        'Sec-Fetch-Mode': "cors",
        'Sec-Fetch-Dest': "empty",
        'Referer': "https://www.kuaishou.com/profile/3xwpkd9patqbvic",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    }
    # Request payload template. It is shared by all instances and must never
    # be mutated; start_requests() derives a fresh payload from it per request.
    data = {
        "operationName": "visionProfile",
        "variables": {
            "userId": ''
        },
        "query": "query visionProfile($userId: String) {\n visionProfile(userId: $userId) {\n result\n hostName\n userProfile {\n ownerCount {\n fan\n photo\n follow\n photo_public\n __typename\n }\n profile {\n gender\n user_name\n user_id\n headurl\n user_text\n user_profile_bg_url\n __typename\n }\n isFollowing\n __typename\n }\n __typename\n }\n}\n"
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and read the optional ``tenant_id`` argument."""
        super().__init__(*args, **kwargs)
        # Pull the custom argument out of kwargs; None means "all tenants".
        self.tenant_id = kwargs.get('tenant_id', None)

    def start_requests(self):
        """Yield one GraphQL POST request per row returned by get_v_user()."""
        for v_user in self.get_v_user():
            # dict.copy() is shallow, so updating the nested "variables" dict
            # of a copy would write into the class-level template and make
            # every request's meta['data'] share one dict. Build a new nested
            # dict for each request instead.
            data = {
                **self.data,
                'variables': {**self.data['variables'], 'userId': v_user.unique_no},
            }
            yield scrapy.Request(
                url=f'{self.base_url}/graphql',
                method='POST',
                body=json.dumps(data),
                headers=dict(self.headers),
                meta={'v_user': v_user, 'data': data},
                callback=self.parse,
            )

    def parse(self, response: Response, *args, **kwargs):
        """Extract ``userProfile`` from the GraphQL response and yield it.

        Yields:
            dict: ``{'user_info': <userProfile payload>, 'v_user': <row from meta>}``
            when the response carries a non-empty ``userProfile``.

        Responses that are not HTTP 200, have an empty body, or carry no
        ``userProfile`` are logged at debug level and produce no item.
        Errors while decoding or processing are logged and swallowed.
        """
        try:
            if response.status != 200 or not response.text:
                self.logger.debug(response.body)
                return

            # A JSON null at any level is returned by .get() as None (the
            # default only applies to *missing* keys), so fall back to {}
            # explicitly at each step.
            payload = response.json() or {}
            vision_profile = (payload.get('data') or {}).get('visionProfile') or {}
            user_info = vision_profile.get('userProfile') or {}
            if not user_info:
                # No profile in an otherwise valid response: keep a trace of it.
                self.logger.debug(response.body)
                return

            yield {
                'user_info': user_info,
                'v_user': response.meta.get('v_user'),
            }
            # "profile" may be absent or null; do not let logging raise.
            profile = user_info.get("profile") or {}
            self.logger.debug(f'抓取用户,{profile.get("user_name")},'
                              f'{profile.get("user_id")}')
        except json.JSONDecodeError:
            self.logger.error("响应体不是有效的 JSON 格式")
            self.logger.error(response.body)
        except Exception as e:
            self.logger.error(f"处理响应出现问题{e}")
            self.logger.error(response.body)

    def get_v_user(self) -> List[V]:
        """Return the ``V`` rows that still need profile data.

        Selects rows for this spider's ``platform`` whose ``url`` is not NULL
        and whose ``nickname`` is NULL, newest ``id`` first, restricted to
        ``self.tenant_id`` when that is set.

        Returns:
            The matching rows, or an empty list if the query raises.
        """
        with matrix_session() as session:
            try:
                query = session.query(V).filter(
                    V.platform == self.platform,
                    V.url.isnot(None),
                    V.nickname.is_(None)
                )
                if self.tenant_id is not None:
                    query = query.filter(V.tenant_id == self.tenant_id)
                result = query.order_by(V.id.desc()).all()
                self.logger.info(f'查询到{len(result)}条数据')
                # NOTE(review): rows are returned after the session context
                # exits; confirm matrix_session leaves them usable (detached
                # but loaded) for start_requests().
                return result
            except Exception as e:
                # logger.exception records the traceback through the spider's
                # logger instead of printing it to stderr.
                self.logger.exception(e)
                return []