Collecting public social-media information with Scrapy
yellowDog
2024-09-10 bd2144f37c2b1b51742c0759c5db92aa015637dc
refactor: optimize project structure
16 files modified
2 files added
99 lines changed in 18 files:

aijuke_spider/items.py | 24
aijuke_spider/spiders/base/__init__.py | 0
aijuke_spider/spiders/base/base_spider.py | 28
aijuke_spider/spiders/douyin/douyin_comment_spider.py | 3
aijuke_spider/spiders/douyin/douyin_hot_post_comment_spider.py | 3
aijuke_spider/spiders/douyin/douyin_search_spider.py | 3
aijuke_spider/spiders/douyin/douyin_user_profile_spider.py | 4
aijuke_spider/spiders/douyin/douyin_v_user.py | 3
aijuke_spider/spiders/douyin/douyin_v_user_profile_spider.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_comment_spider.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_hot_post_comment_spider.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_search_spider.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_user_profile_spider.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_v_user.py | 3
aijuke_spider/spiders/kuaishou/kuaishou_v_user_profile_spider.py | 3
aijuke_spider/spiders/xhs/xhs_comment_spider.py | 4
aijuke_spider/spiders/xhs/xhs_search_spider.py | 3
aijuke_spider/spiders/xhs/xhs_user_profile_spider.py | 3
aijuke_spider/items.py
@@ -14,31 +14,7 @@
 Base = declarative_base()
-class BaseSpider(scrapy.Spider):
-    def __init__(self, *args, **kwargs):
-        super(BaseSpider, self).__init__(*args, **kwargs)
-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)
-        spider.task_id = kwargs.get('task_id', None)
-        spider.job_unique_no = kwargs.get('job_unique_no', None)
-        spider.tenant_id = kwargs.get('tenant_id', None)
-        spider.task_name = kwargs.get('task_name', None)
-        spider.sec_id = kwargs.get('sec_id', None)
-        spider.uuid = kwargs.get('uuid', None)
-        spider.demand_id = kwargs.get('demand_id', None)
-        # store the task arguments in settings
-        crawler.settings.set('TASK_ID', spider.task_id)
-        crawler.settings.set('JOB_UNIQUE_NO', spider.job_unique_no)
-        crawler.settings.set('TENANT_ID', spider.tenant_id)
-        crawler.settings.set('TASK_NAME', spider.task_name)
-        crawler.settings.set('SEC_ID', spider.sec_id)
-        crawler.settings.set('UUID', spider.uuid)
-        crawler.settings.set('DEMAND_ID', spider.demand_id)
-        return spider
 class Aweme(Base):
aijuke_spider/spiders/base/__init__.py
New file (empty)
aijuke_spider/spiders/base/base_spider.py
New file
@@ -0,0 +1,28 @@
+import scrapy
+class BaseSpider(scrapy.Spider):
+    def __init__(self, *args, **kwargs):
+        super(BaseSpider, self).__init__(*args, **kwargs)
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)
+        spider.task_id = kwargs.get('task_id', None)
+        spider.job_unique_no = kwargs.get('job_unique_no', None)
+        spider.tenant_id = kwargs.get('tenant_id', None)
+        spider.task_name = kwargs.get('task_name', None)
+        spider.sec_id = kwargs.get('sec_id', None)
+        spider.uuid = kwargs.get('uuid', None)
+        spider.demand_id = kwargs.get('demand_id', None)
+        # store the task arguments in settings
+        crawler.settings.set('TASK_ID', spider.task_id)
+        crawler.settings.set('JOB_UNIQUE_NO', spider.job_unique_no)
+        crawler.settings.set('TENANT_ID', spider.tenant_id)
+        crawler.settings.set('TASK_NAME', spider.task_name)
+        crawler.settings.set('SEC_ID', spider.sec_id)
+        crawler.settings.set('UUID', spider.uuid)
+        crawler.settings.set('DEMAND_ID', spider.demand_id)
+        return spider
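For reference, the task metadata that BaseSpider.from_crawler reads from its kwargs is normally supplied as spider arguments when a crawl is started. The launch sketch below illustrates this; the spider class name and all argument values are assumptions for illustration, not part of this commit.

# Launch sketch: pass the task metadata as spider kwargs so that
# BaseSpider.from_crawler can pick them up (all values are made up).
# CLI equivalent (spider name assumed):
#   scrapy crawl douyin_search -a task_id=1001 -a tenant_id=42 ...
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from aijuke_spider.spiders.douyin.douyin_search_spider import DouyinSearchSpider  # assumed class name

process = CrawlerProcess(get_project_settings())
process.crawl(
    DouyinSearchSpider,
    task_id='1001',
    job_unique_no='job-20240910-001',
    tenant_id='42',
    task_name='douyin keyword search',
    sec_id='sec-abc',
    uuid='3f2c9b',
    demand_id='7',
)
process.start()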
aijuke_spider/spiders/douyin/douyin_comment_spider.py
@@ -8,7 +8,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Aweme, Demand, Task, BaseSpider
+from aijuke_spider.items import Aweme, Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.douyin.utils import get_web_id, gen_abogus
aijuke_spider/spiders/douyin/douyin_hot_post_comment_spider.py
@@ -8,7 +8,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Aweme, Demand, Task, BaseSpider
+from aijuke_spider.spiders.base.base_spider import BaseSpider
+from aijuke_spider.items import Aweme, Demand, Task
 from aijuke_spider.spiders.douyin.utils import get_web_id, gen_abogus
aijuke_spider/spiders/douyin/douyin_search_spider.py
@@ -10,7 +10,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Demand, Task, BaseSpider
+from aijuke_spider.items import Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.douyin.utils import gen_abogus
aijuke_spider/spiders/douyin/douyin_user_profile_spider.py
@@ -5,7 +5,9 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import CommentGot, BaseSpider
+from aijuke_spider.items import CommentGot
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.douyin.utils import gen_abogus
aijuke_spider/spiders/douyin/douyin_v_user.py
@@ -6,7 +6,8 @@
 import scrapy
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import V, BaseSpider
+from aijuke_spider.items import V
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.douyin.utils import gen_abogus
aijuke_spider/spiders/douyin/douyin_v_user_profile_spider.py
@@ -6,7 +6,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import V, BaseSpider
+from aijuke_spider.items import V
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.douyin.utils import gen_abogus
aijuke_spider/spiders/kuaishou/kuaishou_comment_spider.py
@@ -7,7 +7,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Aweme, Demand, Task, BaseSpider
+from aijuke_spider.items import Aweme, Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouCommentSpider(BaseSpider):
aijuke_spider/spiders/kuaishou/kuaishou_hot_post_comment_spider.py
@@ -6,7 +6,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Aweme, Demand, BaseSpider
+from aijuke_spider.items import Aweme, Demand
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouHotPostCommentSpider(BaseSpider):
aijuke_spider/spiders/kuaishou/kuaishou_search_spider.py
@@ -6,7 +6,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Demand, Task, BaseSpider
+from aijuke_spider.items import Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouSearchSpider(BaseSpider):
aijuke_spider/spiders/kuaishou/kuaishou_user_profile_spider.py
@@ -5,7 +5,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import CommentGot, BaseSpider
+from aijuke_spider.items import CommentGot
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouUserProfileSpider(BaseSpider):
aijuke_spider/spiders/kuaishou/kuaishou_v_user.py
@@ -5,7 +5,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import V, BaseSpider
+from aijuke_spider.items import V
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouVUserSpider(BaseSpider):
aijuke_spider/spiders/kuaishou/kuaishou_v_user_profile_spider.py
@@ -6,7 +6,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import V, BaseSpider
+from aijuke_spider.items import V
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class KuaiShouVUserProfileSpider(BaseSpider):
aijuke_spider/spiders/xhs/xhs_comment_spider.py
@@ -7,8 +7,8 @@
 from sqlalchemy.dialects import mysql
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Aweme, Demand, Task, BaseSpider
+from aijuke_spider.items import Aweme, Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class XHSCommentSpider(BaseSpider):
     platform = "XHS"
aijuke_spider/spiders/xhs/xhs_search_spider.py
@@ -6,7 +6,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import Demand, Task, BaseSpider
+from aijuke_spider.items import Demand, Task
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 from aijuke_spider.spiders.xhs.utils import get_search_id
aijuke_spider/spiders/xhs/xhs_user_profile_spider.py
@@ -4,7 +4,8 @@
 from scrapy.http import Response
 from aijuke_spider.config.db_config import matrix_session
-from aijuke_spider.items import CommentGot, BaseSpider
+from aijuke_spider.items import CommentGot
+from aijuke_spider.spiders.base.base_spider import BaseSpider
 class XHSUserProfileSpider(BaseSpider):
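Downstream components can read this task metadata back either from the spider attributes set in BaseSpider.from_crawler or from the settings keys it writes. A minimal item-pipeline sketch under that assumption follows; the TaskMetadataPipeline class is hypothetical and not part of this commit.

# Hypothetical pipeline: read the task metadata exposed by BaseSpider.from_crawler.
class TaskMetadataPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        # settings keys written by BaseSpider.from_crawler
        return cls(task_id=crawler.settings.get('TASK_ID'),
                   tenant_id=crawler.settings.get('TENANT_ID'))

    def __init__(self, task_id=None, tenant_id=None):
        self.task_id = task_id
        self.tenant_id = tenant_id

    def process_item(self, item, spider):
        # the same values are also available as spider attributes
        task_id = self.task_id or getattr(spider, 'task_id', None)
        tenant_id = self.tenant_id or getattr(spider, 'tenant_id', None)
        spider.logger.debug("item for task %s (tenant %s)", task_id, tenant_id)
        return item

Such a pipeline would be enabled through the project's ITEM_PIPELINES setting in the usual Scrapy way.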