# Source code for HuaweiCrawler.core.core
# -*- coding: utf-8 -*-
"""
core
:example:
    In the Docker image ``quanpan302/huawei-crawler``::

        scrapy startproject tutorial
        scrapy runspider /notebooks/src/HuaweiCrawler/core/core.py -o mobile.csv -t csv
"""
import scrapy
import csv
class TmobileSpider(scrapy.Spider):
    """Crawl the T-Mobile NL shop and collect phone detail URLs and names.

    Results go to two places:

    * ``tmobile_spider.csv``, written through a ``csv.DictWriter`` owned
      by the spider instance, and
    * scrapy's item pipeline (the ``yield`` in :meth:`parse_detail`), so
      ``-o file -t csv`` output still works.
    """

    name = 'tmobile_spider'
    start_urls = ['https://www.t-mobile.nl/shop/alle-telefoons?ch=es&cc=con&sc=acq']
    fieldnames = ['url', 'name']  # CSV header columns

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the output file per spider instance (not at import time).
        # newline='' is required by the csv module to avoid blank rows on
        # Windows; the file is closed in closed() when the spider stops.
        self.file_name = open('{}.csv'.format(self.name), 'w',
                              newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file_name, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def closed(self, reason):
        """Scrapy lifecycle hook: release the CSV file handle."""
        self.file_name.close()

    def parse(self, response):
        """Follow links to phone detail pages, then follow pagination.

        :param response: the listing-page response.
        :yields: requests for detail pages and for the next listing page.
        """
        # follow links to phone detail pages
        # for href in response.css('div.product-list-box + a.product-list-box-link::attr(href)'):
        for href in response.css('a.product-list-box-link::attr(href)'):
            self.logger.debug('-> href: %s', href)
            yield response.follow(href, self.parse_detail)
        # follow pagination links
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_detail(self, response):
        """Extract the phone's url and name from a detail page.

        Writes the row to the spider-owned CSV file and also yields the
        item (url included, consistent with ``fieldnames``) so scrapy's
        own feed export receives the same fields.
        """
        def extract_with_css(query):
            # First match for the selector, stripped; '' when absent.
            return response.css(query).get(default='').strip()

        url = response.url
        name = extract_with_css('h1[data-interaction-id="aproduct-details-title"]::text')
        self.writer.writerow(
            {
                'url': url,
                'name': name
            }
        )  # writing data into file.
        yield {
            'url': url,
            'name': name
        }
# NOTE: VodafoneSpider is intentionally disabled; kept below for reference.
# class VodafoneSpider(scrapy.Spider):
# name = 'vodafone_spider'
# start_urls = ['https://www.vodafone.nl/shop/mobiel/telefoon/']
# def parse(self, response):
# # follow links to phone detail pages
# # for href in response.css('div.product-list-box + a.product-list-box-link::attr(href)'):
# # for href in response.css('a:text(*="Selecteer")::attr(href)'):
# for href in response.css('div#react-devicelisting-container a.cta::attr(href)'):
# print('-> href: {}'.format(href))
# yield response.follow(href, self.parse_detail)
# # follow pagination links
# for href in response.css('li.next a::attr(href)'):
# yield response.follow(href, self.parse)
# def parse_detail(self, response):
# def extract_with_css(query):
# return response.css(query).get(default='').strip()
# yield {
# 'name': extract_with_css('h2[data-testid*="regular-pdp--device-title"]::text'),
# # 'price': extract_with_css('.author-born-date::text'),
# # 'image': extract_with_css('.author-description::text'),
# }