from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader

from mirror.items import Attributes
from mirror.utils import join


class Attr(CrawlSpider):
    """Follow every link under localhost and record per-page attributes."""

    name = "attr"
    rules = [Rule(LinkExtractor(), callback='parse_page')]
    start_urls = ["http://localhost"]
    allowed_domains = ["localhost"]

    def parse_page(self, resp):
        # Populate an Attributes item from the response via an ItemLoader.
        ldr = ItemLoader(item=Attributes(), response=resp)
        ldr.add_value('url', resp.url)
        # Title text, page text, outgoing links, and every element tag.
        ldr.add_xpath('keywords', 'string(//title)')
        ldr.add_xpath('words', "string(//*[name()!='title'])")
        ldr.add_xpath('links', '//a/@href')
        ldr.add_xpath('tags', '//*')
        # Tag groups come from the project settings; join() turns each list
        # of tag names into a single XPath expression.
        ldr.add_xpath('medias', join(self.settings.get('MEDIA_TAGS')))
        ldr.add_xpath('semantics', join(self.settings.get('SEMANTIC_TAGS')))
        ldr.add_xpath('injections', join(self.settings.get('INJECT_TAGS')))
        return ldr.load_item()
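
# Usage note (assumptions: this module lives in the mirror project's spiders
# package, and MEDIA_TAGS, SEMANTIC_TAGS, and INJECT_TAGS are defined as lists
# of tag names in the project's settings.py, as implied by the self.settings
# lookups above). The spider then runs with the standard Scrapy CLI:
#
#   scrapy crawl attr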