# Source code for HuaweiCrawler.core.core
# -*- coding: utf-8 -*-
"""
core
:example:
    In the Docker image ``quanpan302/huawei-crawler``::

        scrapy startproject tutorial
        scrapy runspider /notebooks/src/HuaweiCrawler/core/core.py -o mobile.csv -t csv
"""
import scrapy
import csv
class TmobileSpider(scrapy.Spider):
    """Crawl the T-Mobile NL shop and collect phone detail URLs and names.

    Results go to two places:

    * ``tmobile_spider.csv``, written through a ``csv.DictWriter`` owned
      by the spider instance, and
    * scrapy's item pipeline (the ``yield`` in :meth:`parse_detail`), so
      ``-o file -t csv`` output still works.
    """

    name = 'tmobile_spider'
    start_urls = ['https://www.t-mobile.nl/shop/alle-telefoons?ch=es&cc=con&sc=acq']
    fieldnames = ['url', 'name']  # CSV header columns

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open the output file per spider instance (not at import time).
        # newline='' is required by the csv module to avoid blank rows on
        # Windows; the file is closed in closed() when the spider stops.
        self.file_name = open('{}.csv'.format(self.name), 'w',
                              newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file_name, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def closed(self, reason):
        """Scrapy lifecycle hook: release the CSV file handle."""
        self.file_name.close()

    def parse(self, response):
        """Follow links to phone detail pages, then follow pagination.

        :param response: the listing-page response.
        :yields: requests for detail pages and for the next listing page.
        """
        # follow links to phone detail pages
        # for href in response.css('div.product-list-box + a.product-list-box-link::attr(href)'):
        for href in response.css('a.product-list-box-link::attr(href)'):
            self.logger.debug('-> href: %s', href)
            yield response.follow(href, self.parse_detail)
        # follow pagination links
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_detail(self, response):
        """Extract the phone's url and name from a detail page.

        Writes the row to the spider-owned CSV file and also yields the
        item (url included, consistent with ``fieldnames``) so scrapy's
        own feed export receives the same fields.
        """
        def extract_with_css(query):
            # First match for the selector, stripped; '' when absent.
            return response.css(query).get(default='').strip()

        url = response.url
        name = extract_with_css('h1[data-interaction-id="aproduct-details-title"]::text')
        self.writer.writerow(
            {
                'url': url,
                'name': name
            }
        )  # writing data into file.
        yield {
            'url': url,
            'name': name
        }
# NOTE: VodafoneSpider is intentionally disabled; kept below for reference.
# class VodafoneSpider(scrapy.Spider):
# name = 'vodafone_spider'
# start_urls = ['https://www.vodafone.nl/shop/mobiel/telefoon/']
# def parse(self, response):
# # follow links to phone detail pages
# # for href in response.css('div.product-list-box + a.product-list-box-link::attr(href)'):
# # for href in response.css('a:text(*="Selecteer")::attr(href)'):
# for href in response.css('div#react-devicelisting-container a.cta::attr(href)'):
# print('-> href: {}'.format(href))
# yield response.follow(href, self.parse_detail)
# # follow pagination links
# for href in response.css('li.next a::attr(href)'):
# yield response.follow(href, self.parse)
# def parse_detail(self, response):
# def extract_with_css(query):
# return response.css(query).get(default='').strip()
# yield {
# 'name': extract_with_css('h2[data-testid*="regular-pdp--device-title"]::text'),
# # 'price': extract_with_css('.author-born-date::text'),
# # 'image': extract_with_css('.author-description::text'),
# }