I have written a Scrapy CrawlSpider:
    import tldextract

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy_fastcrawler.items import FastcrawlerItem


    class SiteCrawlerSpider(CrawlSpider):
        name = 'site_crawler'

        def __init__(self, start_url, **kw):
            super(SiteCrawlerSpider, self).__init__(**kw)
            self.rules = (
                Rule(LinkExtractor(allow=()), callback='parse_start_url', follow=True),
            )
            self.start_urls = [start_url]
            self.allowed_domains = tldextract.extract(start_url).registered_domain

        def parse_start_url(self, response):
            # collect links that point outside the crawled domain
            external_links = LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response)
            for link in external_links:
                i = FastcrawlerItem()
                i['pageurl'] = response.url
                i['ext_link'] = link.url
                i['ext_domain'] = tldextract.extract(link.url).registered_domain
                yield i
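FastcrawlerItem is just a plain Item declaring the three fields the spider fills in, roughly like this (the module path is my project's items file):

    # scrapy_fastcrawler/items.py
    from scrapy.item import Item, Field

    class FastcrawlerItem(Item):
        pageurl = Field()     # page the external link was found on
        ext_link = Field()    # the external link itself
        ext_domain = Field()  # registered domain of the external link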
Now I am trying to run this spider from another Python script, as follows:
    from twisted.internet import reactor
    from scrapy.crawler import Crawler
    from scrapy import log, signals
    from scrapy_fastcrawler.spiders.site_crawler import SiteCrawlerSpider
    from scrapy.utils.project import get_project_settings

    spider = SiteCrawlerSpider(start_url='http://www.health.com/')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
Problem: The script runs without errors, but it processes only the start_url and then stops. It does not follow any of the links found on the start page, so no further pages are crawled or processed. I have also set up an item pipeline, and the items extracted from the start_url are saved through it correctly.
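For reference, the pipeline itself is trivial. A simplified sketch of it (storage details stripped out, class name illustrative) and the settings.py entry that enables it:

    # scrapy_fastcrawler/pipelines.py (simplified)
    import json

    class FastcrawlerPipeline(object):
        def open_spider(self, spider):
            self.file = open('items.jl', 'w')

        def process_item(self, item, spider):
            # one JSON object per line; items from the start_url do arrive here
            self.file.write(json.dumps(dict(item)) + '\n')
            return item

        def close_spider(self, spider):
            self.file.close()

    # settings.py
    ITEM_PIPELINES = {
        'scrapy_fastcrawler.pipelines.FastcrawlerPipeline': 300,
    }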
Any help is greatly appreciated.