V2EX  ›  Python

Trying to use Flask to invoke Scrapy's command line to run a spider; getting "signal only works in main thread"

    996635 · 2016-12-28 13:32:33 +08:00 · 8047 views

    I can understand the error itself, but how do I get around it? I've only just started with Python and have no clue. Traceback below:

        Traceback (most recent call last):
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1994, in __call__
            return self.wsgi_app(environ, start_response)
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1985, in wsgi_app
            response = self.handle_exception(e)
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1540, in handle_exception
            reraise(exc_type, exc_value, tb)
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1982, in wsgi_app
            response = self.full_dispatch_request()
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1614, in full_dispatch_request
            rv = self.handle_user_exception(e)
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1517, in handle_user_exception
            reraise(exc_type, exc_value, tb)
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1612, in full_dispatch_request
            rv = self.dispatch_request()
          File "/usr/local/lib/python2.7/site-packages/flask/app.py", line 1598, in dispatch_request
            return self.view_functions[rule.endpoint](**req.view_args)
          File "/Users/chenchen/code/flask_scrapy/webapp/run.py", line 91, in run
            crawler = CrawlerProcess(settings)
          File "/usr/local/lib/python2.7/site-packages/scrapy/crawler.py", line 239, in __init__
            install_shutdown_handlers(self._signal_shutdown)
          File "/usr/local/lib/python2.7/site-packages/scrapy/utils/ossignal.py", line 21, in install_shutdown_handlers
            reactor._handleSignals()
          File "/usr/local/lib/python2.7/site-packages/twisted/internet/posixbase.py", line 295, in _handleSignals
            _SignalReactorMixin._handleSignals(self)
          File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1154, in _handleSignals
            signal.signal(signal.SIGINT, self.sigInt)
        ValueError: signal only works in main thread
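
    The error can be reproduced with nothing but the standard library. Flask's dev server dispatches each request on a worker thread, and `signal.signal()` refuses to run anywhere but the process's main thread, which is exactly what Twisted's reactor attempts when `CrawlerProcess` is constructed. A minimal repro (not the poster's code):

```python
import signal
import threading

def install_handler(errors):
    # Twisted's reactor does the equivalent of this call while setting up
    # CrawlerProcess; Python rejects it outside the main thread.
    try:
        signal.signal(signal.SIGINT, signal.default_int_handler)
    except ValueError as e:  # "signal only works in main thread ..."
        errors.append(str(e))

errors = []
t = threading.Thread(target=install_handler, args=(errors,))
t.start()
t.join()
print(errors[0])
```

    The same call succeeds when made from the main thread, which is why running the spider from a plain script works while running it inside a Flask view does not.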
    

    Here's the core code:

    # Imports needed by this snippet; "app" is the Flask application object,
    # and ProjectLoader / CategoryItem are defined elsewhere in the project.
    import inspect

    import scrapy
    import six
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    @app.route('/run')
    def run():
        project = dict()
        project['name'] = 'test'
        project['mod'] = 'debug'
        project['script'] = """
    # -*- coding: utf-8 -*-
    
    import scrapy
    
    
    class TiebaCategorySpider(scrapy.Spider):
        name = "tieba_category"
    
        start_url = 'http://tieba.baidu.com/f/index/forumclass'
    
        def start_requests(self):
            yield scrapy.Request(self.start_url)
    
        def parse(self, response):
            try:
                links = response.xpath('//ul[@class="item-list-ul clearfix"]/li/a')
                for i in links:
                    a = i.xpath('@href').extract_first()
                    name = i.xpath('text()').extract_first()
                    yield scrapy.Request(self.repair_url(a), callback=self.parse_category, meta={'sub_category': name})
            except Exception as e:
                print e
                return
    
        def parse_category(self, response):
            a_list = response.xpath('//a[@class="ba_href clearfix"]')
            category = response.xpath('//div[@class="ba_class_title"]/text()').extract_first()
            for i in a_list:
                item = CategoryItem()
    
                item['img'] = i.xpath('img[@class="ba_pic"]/@src').extract_first()
                item['name'] = i.xpath('div[@class="ba_content"]/p[@class="ba_name"]/text()').extract_first()
                item['member_count'] = i.xpath('div[@class="ba_content"]//span[@class="ba_m_num"]/text()').extract_first()
                item['post_count'] = i.xpath('div[@class="ba_content"]//span[@class="ba_p_num"]/text()').extract_first()
                item['sub_category'] = response.meta.get('sub_category')
                item['desc'] = i.xpath('div[@class="ba_content"]//p[@class="ba_desc"]/text()').extract_first()
                item['category'] = category
    
                yield item
    
            next_url = response.xpath('//div[@class="pagination"]/a[@class="next"]/@href').extract_first()
            if next_url:
                yield scrapy.Request(self.repair_url(next_url), callback=self.parse_category,
                                     meta={'sub_category': response.meta.get('sub_category')})
    
        @staticmethod
        def repair_url(url):
            if url.startswith('http'):
                pass
            else:
                url = ''. join(['http://tieba.baidu.com', url])
            return url
    
        """
        loader = ProjectLoader(project)
        module = loader.load_module('test_spider')
        a = module.__dict__
        for each in list(six.itervalues(module.__dict__)):
            if inspect.isclass(each) and issubclass(each, scrapy.Spider):
                module.__dict__['__handler_cls__'] = each
        _class = module.__dict__.get('__handler_cls__')
        assert _class is not None, "need BaseHandler in project module"
    
        spider = _class()
    
        settings = get_project_settings()
        crawler = CrawlerProcess(settings)
        crawler.crawl(spider)
        # crawler.start()
        return repr(module.__dict__)
    
    2 replies · last reply at 2016-12-28 14:34:15 +08:00
    wwqgtxx · #1 · 2016-12-28 13:55:25 +08:00
    It means this line of yours has to run in the main thread:
    crawler = CrawlerProcess(settings)
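
    One common way to satisfy that main-thread requirement is to hand the crawl to a child process, whose own main thread is free to install signal handlers. A rough sketch with hypothetical helper names, assuming a Unix host (not the poster's code):

```python
import multiprocessing
import signal

def crawl_job(queue):
    # Inside the child process this IS the main thread, so the handler
    # installation that fails under a Flask worker thread succeeds here.
    # A real version would build CrawlerProcess(settings), call
    # crawler.crawl(spider) and then crawler.start() at this point.
    signal.signal(signal.SIGINT, signal.default_int_handler)
    queue.put("crawl finished")

def run_crawl_detached():
    # "fork" keeps the child from re-importing this module; on Windows use
    # the default "spawn" context plus an `if __name__ == "__main__"` guard.
    ctx = multiprocessing.get_context("fork")
    queue = ctx.Queue()
    proc = ctx.Process(target=crawl_job, args=(queue,))
    proc.start()
    proc.join()
    return queue.get()
```

    The Flask view would then call `run_crawl_detached()` instead of constructing `CrawlerProcess` directly. Alternatives worth looking at: `scrapy.crawler.CrawlerRunner`, which installs no shutdown signal handlers, or delegating crawls to scrapyd.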
    onlyice · #2 · 2016-12-28 14:34:15 +08:00
    Scrapy depends on Twisted; this looks like a requirement of Twisted's asynchronous networking model.