# Source code for HuaweiCrawler.core.core

# -*- coding: utf-8 -*-
"""
core

:example:

    In the Docker Image ``quanpan302/huawei-crawler``:

    scrapy startproject tutorial
    
    scrapy runspider /notebooks/src/HuaweiCrawler/core/core.py -o mobile.csv -t csv
"""
import scrapy
import csv


class TmobileSpider(scrapy.Spider):
    """Crawl the T-Mobile NL phone shop and record each phone's URL and name.

    Every scraped phone is written as a row to ``tmobile_spider.csv`` via
    ``csv.DictWriter`` and also yielded as a Scrapy item, so the spider still
    works with ``-o``/``-t`` feed exports as shown in the module docstring.
    """

    name = 'tmobile_spider'
    start_urls = ['https://www.t-mobile.nl/shop/alle-telefoons?ch=es&cc=con&sc=acq']
    fieldnames = ['url', 'name']  # CSV header columns

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the output file per spider instance rather than at import time
        # (the previous class-level open() leaked the handle and ran as a
        # module-import side effect).  newline='' is required by the csv
        # module; an explicit encoding keeps output stable across platforms.
        self.file_name = open('{}.csv'.format(self.name), 'w',
                              newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file_name, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def closed(self, reason):
        """Scrapy calls this when the spider finishes; close the CSV file.

        :param reason: shutdown reason string supplied by Scrapy (unused).
        """
        self.file_name.close()

    def parse(self, response):
        """Follow links to phone detail pages, then paginate.

        :param response: listing-page response.
        :yields: requests for detail pages and for the next listing page.
        """
        # follow links to phone detail pages
        for href in response.css('a.product-list-box-link::attr(href)'):
            self.logger.debug('-> href: %s', href)
            yield response.follow(href, self.parse_detail)
        # follow pagination links
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_detail(self, response):
        """Extract the phone name from a detail page.

        Writes a ``{url, name}`` row to the CSV file and yields a
        ``{'name': ...}`` item for Scrapy's own export pipeline.

        :param response: detail-page response.
        """
        def extract_with_css(query):
            # First match for *query*, stripped; '' when the selector misses.
            return response.css(query).get(default='').strip()

        url = response.url
        name = extract_with_css('h1[data-interaction-id="aproduct-details-title"]::text')
        self.writer.writerow({'url': url, 'name': name})  # writing data into file
        yield {'name': name}
# class VodafoneSpider(scrapy.Spider):
#     name = 'vodafone_spider'
#     start_urls = ['https://www.vodafone.nl/shop/mobiel/telefoon/']
#
#     def parse(self, response):
#         # follow links to phone detail pages
#         # for href in response.css('div.product-list-box + a.product-list-box-link::attr(href)'):
#         # for href in response.css('a:text(*="Selecteer")::attr(href)'):
#         for href in response.css('div#react-devicelisting-container a.cta::attr(href)'):
#             print('-> href: {}'.format(href))
#             yield response.follow(href, self.parse_detail)
#         # follow pagination links
#         for href in response.css('li.next a::attr(href)'):
#             yield response.follow(href, self.parse)
#
#     def parse_detail(self, response):
#         def extract_with_css(query):
#             return response.css(query).get(default='').strip()
#         yield {
#             'name': extract_with_css('h2[data-testid*="regular-pdp--device-title"]::text'),
#             # 'price': extract_with_css('.author-born-date::text'),
#             # 'image': extract_with_css('.author-description::text'),
#         }