使用scrapy采集社交媒体的公开信息
yellowDog
2024-09-10 f56ca406a1ddecd4abb59b3f4abb433d736b78c7
feat:增加回调拓展
2 files modified
32 ■■■■■ changed files
aijuke_spider/extensions.py 29 ●●●●● patch | view | raw | blame | history
aijuke_spider/settings.py 3 ●●●● patch | view | raw | blame | history
aijuke_spider/extensions.py
@@ -1,3 +1,4 @@
import requests
from scrapy import signals
from aijuke_spider.config.db_config import matrix_session
@@ -36,4 +37,30 @@
            spider.logger.error(f"Error while logging task data: {e}")
            self.session.rollback()
        finally:
            self.session.close()
            self.session.close()
class TaskCallBackExtension:
    """Scrapy extension that calls back the task engine when a spider closes.

    When the ``spider_closed`` signal fires, a GET request is sent to
    ``{base_url}/engine/api/task/callback`` with the ``UUID`` and ``SEC_ID``
    settings as query parameters and the ``TENANT_ID`` setting as a header.
    The outcome is written to the spider's logger.
    """

    # Seconds to wait for the callback endpoint; without a timeout a hung
    # server would block spider shutdown indefinitely.
    CALLBACK_TIMEOUT = 10

    def __init__(self, settings):
        """Keep the settings object and the base URL of the callback service.

        Args:
            settings: Object exposing ``get(name)``; read for ``UUID``,
                ``SEC_ID`` and ``TENANT_ID`` when the spider closes.
        """
        self.settings = settings
        self.base_url = 'http://matrix.uj345.cc'

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and subscribe it to ``spider_closed``.

        Args:
            crawler: Crawler providing ``settings`` and ``signals``.

        Returns:
            The newly created extension instance.
        """
        # Instantiate the extension.
        extension = cls(crawler.settings)
        crawler.signals.connect(extension.spider_closed, signal=signals.spider_closed)
        return extension

    def spider_closed(self, spider):
        """Send the task callback request and log whether it succeeded.

        Network errors are caught and logged rather than raised, so a failing
        callback cannot break spider shutdown.

        Args:
            spider: The spider that was closed; its ``logger`` receives the
                outcome message.
        """
        params = {
            'uuid': self.settings.get('UUID'),
            'sec_id': self.settings.get('SEC_ID'),
        }
        headers = {
            # NOTE(review): credential is hard-coded in source — consider
            # moving it to configuration / a secret store.
            'token': 'jwjk2024@!',
            'tenant_id': self.settings.get('TENANT_ID'),
        }
        url = f'{self.base_url}/engine/api/task/callback'
        try:
            response = requests.get(
                url=url,
                params=params,
                headers=headers,
                timeout=self.CALLBACK_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Connection errors, timeouts, etc. are reported, not propagated.
            spider.logger.error("回调失败: %s", exc)
            return

        if response.status_code == 200:
            spider.logger.info("回调成功")
        else:
            # A non-200 answer is a failure, so report it at error level.
            spider.logger.error("回调失败")
aijuke_spider/settings.py
@@ -14,8 +14,9 @@
# NOTE(review): API key is committed in source — consider loading it from
# the environment or a secret store instead.
SCRAPEOPS_API_KEY = 'a63cf18b-df14-410c-914e-84fcc902b730'
# Extensions to enable, mapped to their order value. Each dotted path is
# listed exactly once: a repeated key in a dict literal is silently
# collapsed, so the duplicate entry was redundant.
EXTENSIONS = {
    'aijuke_spider.extensions.TaskLogExtension': 500,
    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
    'aijuke_spider.extensions.TaskCallBackExtension': 500,
}
RETRY_ENABLED = True