├── ebaykleinanzeigen
├── requirements.txt
├── ebaykleinanzeigen
│ ├── items.pyc
│ ├── __init__.pyc
│ ├── settings.pyc
│ ├── spiders
│ │ ├── __init__.pyc
│ │ ├── utilities.pyc
│ │ ├── elastic_functions.pyc
│ │ ├── ebay_kleinanzeigen.pyc
│ │ ├── __init__.py
│ │ ├── elastic_functions.py
│ │ ├── utilities.py
│ │ └── ebay_kleinanzeigen.py
│ ├── pipelines.py
│ ├── items.py
│ ├── settings.py
│ └── middlewares.py
├── start_urls.json
├── config_file.json
└── scrapy.cfg
├── README.md
└── LICENSE
/ebaykleinanzeigen/requirements.txt:
--------------------------------------------------------------------------------
1 | elasticsearch==7.0.2
2 | Scrapy==1.6.0
3 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/items.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/__init__.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/settings.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/start_urls.json:
--------------------------------------------------------------------------------
1 | {
2 | "urls":[
3 | "https://www.ebay-kleinanzeigen.de/s-berlin/l3331","https://www.ebay-kleinanzeigen.de/s-muenchen/l6411"
4 | ]
5 | }
6 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/config_file.json:
--------------------------------------------------------------------------------
1 | {
2 | "protocol":"http",
3 | "elastic_username":"elastic",
4 | "elastic_password":"changeme",
5 | "elastic_address":"127.0.0.1",
6 | "elastic_port":"9200",
7 | "elastic_index_name":"ebay_kleinanzeigen",
8 | "elastic_connection_retry":5,
9 | "scrape_next_pages": "True"
10 | }
11 |
12 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = ebaykleinanzeigen.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ebaykleinanzeigen
12 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
class EbaykleinanzeigenPipeline(object):
    """Pass-through item pipeline created by the Scrapy project template."""

    def process_item(self, item, spider):
        """Hand *item* back untouched.

        Args:
            item: The scraped item travelling through the pipeline.
            spider: The spider that produced the item (unused).

        Returns:
            The very same ``item`` object.
        """
        return item
12 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
class EbaykleinanzeigenItem(scrapy.Item):
    """Placeholder item model created by the Scrapy project template.

    No fields are declared yet; declare them as class attributes, e.g.
    ``name = scrapy.Field()``.
    """
15 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import re
3 | import json
4 | from datetime import datetime
5 | from elasticsearch import Elasticsearch
6 | import logging
7 | import time
8 |
class ElasticFunctions:
    """Stateless helpers around an ``elasticsearch.Elasticsearch`` client.

    Every method receives the client (or the data needed to build it) as an
    argument; the class itself keeps no state.
    """

    def connection_to_elastic(self, connection_request, retry_number=5, retry_delay=1.0):
        """Create a client and wait until the cluster answers a ping.

        Args:
            connection_request: Host URL handed to ``Elasticsearch`` (the
                spider builds it as ``protocol://user:password@address:port``).
            retry_number: How many additional pings are attempted after the
                first one failed.
            retry_delay: Seconds to wait between two pings; ``0`` retries
                immediately.

        Returns:
            The client, once a ping succeeded.

        Raises:
            ValueError: If the cluster never answered a ping.
        """
        es = Elasticsearch([connection_request], verify_certs=True)
        retries_left = retry_number
        while not es.ping():
            if retries_left <= 0:
                # Raise instead of returning the exception object, otherwise
                # the caller would store the error as if it were a client.
                raise ValueError("Connection failed")
            retries_left -= 1
            if retry_delay > 0:
                time.sleep(retry_delay)
        return es

    def check_article_existence(self, elastic_client, index_name, doc_id):
        """Return whether a document with id *doc_id* exists in *index_name*."""
        return elastic_client.exists(index=index_name, id=doc_id)

    def add_article_to_elastic(self, elastic_client, index_name, bulk_data, doc_id):
        """Index one article document and make it searchable.

        Args:
            elastic_client: Client used to talk to the cluster.
            index_name: Name of the target index.
            bulk_data: The document body (a dict describing the article).
            doc_id: Identifier under which the document is stored.

        Returns:
            ``None`` when the document was indexed, or the string
            ``"Not connected to Elastic."`` when the cluster did not answer
            the ping.
        """
        if not elastic_client.ping():
            return "Not connected to Elastic."
        # NOTE(review): mapping types are being phased out as of
        # Elasticsearch 7 - confirm whether ``doc_type`` can be dropped.
        res = elastic_client.index(index=index_name, doc_type="article", id=doc_id, body=bulk_data)
        print("------------------------------ ", "Article added with : ", res["_shards"])
        # Refresh only the index that was written to, not the whole cluster.
        elastic_client.indices.refresh(index=index_name)
        return None
28 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import re
3 | import json
4 | from datetime import datetime
5 | from elasticsearch import Elasticsearch
6 | import logging
7 | import time
8 |
class Utilities:
    """File-loading and type-inference helpers used by the spider."""

    def load_urls(self, filename="start_urls.json"):
        """Return the list stored under the ``"urls"`` key of the JSON file *filename*."""
        # Open the requested file; the parameter used to be ignored.
        with open(filename) as file:
            links = json.load(file)
        return links['urls']

    def load_config_file(self, filename="config_file.json"):
        """Return the parsed content of the JSON configuration file *filename*."""
        with open(filename) as file:
            config = json.load(file)
        return config

    def is_int(self, value):
        """Return True when ``str(value)`` can be parsed by ``int``."""
        try:
            int(str(value))
            return True
        except ValueError:
            return False

    def is_float(self, value):
        """Return True when ``str(value)`` can be parsed by ``float``."""
        try:
            float(str(value))
            return True
        except ValueError:
            return False

    def is_date(self, value):
        """Return True when ``str(value)`` is a date written as ``DD.MM.YYYY``."""
        try:
            datetime.strptime(str(value), '%d.%m.%Y')
            return True
        except ValueError:
            return False

    def infer_data_types(self, article):
        """Convert the values of *article* in place to int, float or datetime.

        List values are left untouched. Every other value is tried, via its
        string form, as an integer, then as a float, then as a ``DD.MM.YYYY``
        date; values matching none of these are kept as they are.

        Args:
            article: Dict mapping field names to scraped values.

        Returns:
            The same dict object, with converted values.
        """
        for key in article:
            value = article[key]
            if isinstance(value, list):
                continue
            # Convert from the same string form the predicates checked, so a
            # positive check can never be followed by a failing conversion.
            text = str(value)
            if self.is_int(value):
                article[key] = int(text)
            elif self.is_float(value):
                article[key] = float(text)
            elif self.is_date(value):
                article[key] = datetime.strptime(text, '%d.%m.%Y')
        return article
49 |
50 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
An Ebay-kleinanzeigen Web scraper using Python and Scrapy to fetch data into an ElasticSearch cluster with Kibana
2 |
3 | The aim here is to extract data from https://www.ebay-kleinanzeigen.de/ automatically and rapidly in order to store them
4 | into an ElasticSearch cluster and get fast insights with Kibana.
5 |
6 | Requirements
7 | Python 3
8 | Elasticsearch 7.0.2
9 | Scrapy 1.6.0
10 |
11 | How to set it :
12 |
13 | 1) Start your ElasticSearch cluster with Kibana installed on it. If you don't have it, a fast way to get it could be to install
14 | and use a Docker image with the following steps :
15 | ```bash
16 |
17 | git clone https://github.com/deviantony/docker-elk.git
18 | cd docker-elk
19 | docker-compose up -d
20 | ```
21 |
22 | 2) Set the URLs (like https://www.ebay-kleinanzeigen.de/s-berlin/l3331 for example) you want to scrape in JSON file : start_urls.json
23 | 3) Set the various configuration parameters you wish :
24 | ```json
25 | {
26 | "protocol": "http or https",
27 | "elastic_username": "the username to connect on your ElasticSearch cluster",
28 | "elastic_password": "the needed password to connect on your ElasticSearch cluster",
29 | "elastic_address": "the IP address your ElasticSearch cluster is bound to",
30 | "elastic_port": "the port your ElasticSearch cluster is bound to",
31 | "elastic_index_name": "the index name of your ElasticSearch cluster",
32 | "elastic_connection_retry": "the number of tries to reconnect on your ElasticSearch in case of failure",
33 | "scrape_next_pages": "True or False (as a string): whether the web scraper follows the next pages (1, 2, 3...) linked at the bottom of each page."
34 | }
35 | ```
36 | The default login and server parameters of the ElasticSearch Docker images are entered.
37 |
38 | 4) Change your current directory to the Scraper's one and start it through :
39 | ```bash
40 | cd .../ebaykleinanzeigen
41 | scrapy crawl ebay_kleinanzeigen
42 | ```
43 | 5) The results are automatically updated into ElasticSearch and Kibana as soon as the data are being scraped.
44 | Just enjoy the insights by connecting on your Kibana home page (by default in the Docker image : http://localhost:5601) !
45 |
46 | NB: The number of concurrent requests and the download delay (in seconds) are set in settings.py to 20 and 0.8 respectively by default,
47 | in order to avoid overloading Ebay-kleinanzeigen's servers.
48 |
49 | References
50 |
51 | -[Ebay-kleinanzeigen](https://www.ebay-kleinanzeigen.de/stadt/berlin/)
52 | -[Elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/master/)
53 |
54 | Credits
55 |
56 | Copyright (c) 2019, HicBoux. Work released under Apache 2.0 License.
57 |
58 | (Please contact me if you wish to use my work in specific conditions not allowed automatically by the Apache 2.0 License.)
59 |
60 | Disclaimer
61 |
62 | This solution has been made available for informational and educational purposes only. I hereby disclaim any and all
63 | liability to any party for any direct, indirect, implied, punitive, special, incidental or other consequential
64 | damages arising directly or indirectly from any use of this content, which is provided as is, and without warranties.
65 | I also disclaim all responsibility for web scraping at a disruptive rate and any damages caused by such use.
66 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for ebaykleinanzeigen project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'ebaykleinanzeigen'
13 |
14 | SPIDER_MODULES = ['ebaykleinanzeigen.spiders']
15 | NEWSPIDER_MODULE = 'ebaykleinanzeigen.spiders'
16 |
17 | # Crawl depth parameters: how deep to crawl and which depth levels to scrape first
18 | #DEPTH_LIMIT = 0
19 | #DEPTH_PRIORITY = 0
20 |
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | USER_AGENT = 'ebaykleinanzeigen (+https://www.ebay-kleinanzeigen.de)'
23 |
24 | # Obey robots.txt rules
25 | ROBOTSTXT_OBEY = True
26 |
27 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
28 | CONCURRENT_REQUESTS = 20
29 |
30 | # Configure a delay for requests for the same website (default: 0)
31 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
32 | # See also autothrottle settings and docs
33 | DOWNLOAD_DELAY = 0.8
34 | # The download delay setting will honor only one of:
35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
36 | #CONCURRENT_REQUESTS_PER_IP = 16
37 |
38 | # Disable cookies (enabled by default)
39 | #COOKIES_ENABLED = False
40 |
41 | # Disable Telnet Console (enabled by default)
42 | #TELNETCONSOLE_ENABLED = False
43 |
44 | # Override the default request headers:
45 | #DEFAULT_REQUEST_HEADERS = {
46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
47 | # 'Accept-Language': 'en',
48 | #}
49 |
50 | # Enable or disable spider middlewares
51 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
52 | #SPIDER_MIDDLEWARES = {
53 | # 'ebaykleinanzeigen.middlewares.EbaykleinanzeigenSpiderMiddleware': 543,
54 | #}
55 |
56 | # Enable or disable downloader middlewares
57 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
58 | #DOWNLOADER_MIDDLEWARES = {
59 | # 'ebaykleinanzeigen.middlewares.EbaykleinanzeigenDownloaderMiddleware': 543,
60 | #}
61 |
62 | # Enable or disable extensions
63 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
64 | #EXTENSIONS = {
65 | # 'scrapy.extensions.telnet.TelnetConsole': None,
66 | #}
67 |
68 | # Configure item pipelines
69 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
70 | #ITEM_PIPELINES = {
71 | # 'ebaykleinanzeigen.pipelines.EbaykleinanzeigenPipeline': 300,
72 | #}
73 |
74 | # Enable and configure the AutoThrottle extension (disabled by default)
75 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
76 | #AUTOTHROTTLE_ENABLED = True
77 | # The initial download delay
78 | #AUTOTHROTTLE_START_DELAY = 5
79 | # The maximum download delay to be set in case of high latencies
80 | #AUTOTHROTTLE_MAX_DELAY = 60
81 | # The average number of requests Scrapy should be sending in parallel to
82 | # each remote server
83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
84 | # Enable showing throttling stats for every response received:
85 | #AUTOTHROTTLE_DEBUG = False
86 |
87 | # Enable and configure HTTP caching (disabled by default)
88 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
89 | #HTTPCACHE_ENABLED = True
90 | #HTTPCACHE_EXPIRATION_SECS = 0
91 | #HTTPCACHE_DIR = 'httpcache'
92 | #HTTPCACHE_IGNORE_HTTP_CODES = []
93 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
94 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
class EbaykleinanzeigenSpiderMiddleware(object):
    """Pass-through spider middleware created by the Scrapy project template.

    Not every hook has to be defined: for a hook that is missing, Scrapy acts
    as if the middleware did not modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Hook called for each response on its way into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook called with what the spider returned for *response*.

        Must produce an iterable of Request, dict or Item objects; every
        element of *result* is forwarded unchanged.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Hook called when the spider or a ``process_spider_input()`` raises.

        Should return None or an iterable of Response, dict or Item objects;
        this one does nothing and therefore returns None.
        """

    def process_start_requests(self, start_requests, spider):
        """Hook called with the spider's start requests.

        Works like ``process_spider_output()`` but without a response; must
        yield requests only (no items). Every request is forwarded unchanged.
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
57 |
58 |
class EbaykleinanzeigenDownloaderMiddleware(object):
    """Pass-through downloader middleware created by the Scrapy project template.

    Not every hook has to be defined: for a hook that is missing, Scrapy acts
    as if the middleware did not modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Hook called for each request going through the downloader middleware.

        A hook of this kind must either return None (continue processing the
        request), return a Response, return a Request, or raise IgnoreRequest
        (the ``process_exception()`` methods of the installed downloader
        middlewares are then called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hook called with the response returned from the downloader.

        A hook of this kind must return a Response, return a Request, or
        raise IgnoreRequest. This one returns *response* unchanged.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook called when a download handler or a ``process_request()`` raises.

        A hook of this kind must return None (continue processing the
        exception) or a Response/Request (which stops the
        ``process_exception()`` chain). This one does nothing and therefore
        returns None.
        """

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
104 |
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import json
5 | import logging
6 | from datetime import datetime
7 |
8 | from elasticsearch import Elasticsearch
9 |
10 | from ebaykleinanzeigen.spiders.utilities import Utilities
11 | from ebaykleinanzeigen.spiders.elastic_functions import ElasticFunctions
12 |
class EbayKleinanzeigenSpider(scrapy.Spider):
    """Scrape ebay-kleinanzeigen listing pages and push each article into Elasticsearch.

    The start URLs are read from ``start_urls.json`` and the Elasticsearch
    connection parameters from ``config_file.json`` when the spider is
    instantiated. Listing pages are handled by :meth:`parse`, article pages by
    :meth:`parse_article_page`.
    """

    name = 'ebay_kleinanzeigen'
    allowed_domains = ['ebay-kleinanzeigen.de']

    # Site root prepended to the hrefs extracted from listing pages.
    _SITE_ROOT = 'https://www.ebay-kleinanzeigen.de'
    # Matches any HTML tag; used to reduce the description markup to plain text.
    _HTML_TAG = re.compile(r'<[^>]+>')
    # Number of leading characters dropped from the price text node.
    # NOTE(review): presumably the length of a "Preis: " label - confirm
    # against the page markup.
    _PRICE_PREFIX_LENGTH = 7

    def __init__(self, *args, **kwargs):
        """Load the configuration files and connect to Elasticsearch.

        Raises:
            Exception: Whatever the connection helper raises or returns as an
                exception object when the cluster cannot be reached.
        """
        # Let scrapy.Spider handle its own arguments (e.g. ``-a`` options).
        super(EbayKleinanzeigenSpider, self).__init__(*args, **kwargs)

        # Load home made classes with functions
        self.utilities = Utilities()
        self.elastic_functions = ElasticFunctions()

        # Load of configuration files
        self.start_urls = self.utilities.load_urls("start_urls.json")
        self.config = self.utilities.load_config_file("config_file.json")

        # Connection to ELK. ``format`` also copes with a numeric port.
        connection_request = "{}://{}:{}@{}:{}".format(
            self.config["protocol"],
            self.config["elastic_username"],
            self.config["elastic_password"],
            self.config["elastic_address"],
            self.config["elastic_port"],
        )
        self.es = self.elastic_functions.connection_to_elastic(
            connection_request, self.config["elastic_connection_retry"])
        # Never keep an exception object around as if it were a client.
        if isinstance(self.es, Exception):
            raise self.es

    def parse(self, response):
        """Yield one request per article of a listing page, then the next listing page.

        The next page is followed only when the page has a "next" link and
        the ``scrape_next_pages`` configuration entry enables it.
        """
        article_urls = response.xpath("//a[@class='ellipsis']/@href").extract()
        for url in article_urls:
            article_page = response.urljoin(self._SITE_ROOT + url)
            self.logger.debug("Article page: %s", article_page)
            yield scrapy.Request(url=article_page, callback=self.parse_article_page, dont_filter=True)

        next_href = response.xpath("//a[@class='pagination-next']/@href").extract_first()
        # Test the raw href: once concatenated with the site root it could
        # never be None and the last page would trigger a bogus request.
        if next_href is not None and self._should_follow_next_pages():
            next_page = response.urljoin(self._SITE_ROOT + next_href)
            self.logger.debug("Next listing page: %s", next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_article_page(self, response):
        """Extract one article, index it into Elasticsearch if unknown, and yield it.

        Yields:
            A dict with the keys ``URL``, ``Artikelstitel``, ``Preis``,
            ``Artikelsbeschreibung`` plus one key per attribute listed on the
            page. Nothing is yielded when no document id can be derived from
            the URL.
        """
        # Retrieve some data about the article
        article_url = response.url
        article_title = response.xpath("//h1[@class='articleheader--title']//text()").extract_first()
        price_text = response.xpath("//h2[@class='articleheader--price']//text()").extract_first()
        article_price = None if price_text is None else price_text[self._PRICE_PREFIX_LENGTH:]
        article_description = self._clean_description(
            response.xpath("//p[@itemprop='description']").extract_first())

        # Retrieve all data categories about the product ("type", "size", "delivery way" etc...)
        article_details_categories = [
            s.replace(":", "")
            for s in response.xpath("//dt[@class='attributelist--key']//text()").extract()
        ]

        # Retrieve the place where the article is sold
        seller_place = ''.join(
            s.strip()
            for s in response.xpath(
                "//dd[@class='attributelist--value' and @itemprop='seller']//text()").extract()
        )

        # Retrieve all data (values) about the product ("2m", "Man" etc...),
        # dropping empty cells and the bare "," separator cells.
        article_details_values = [
            s.strip()
            for s in response.xpath(
                "//dd[@class='attributelist--value' and not(@itemprop='seller')]//text()").extract()
        ]
        article_details_values = [v for v in article_details_values if v and v != ',']
        # The seller place belongs to the first category.
        article_details_values.insert(0, seller_place)
        article_details_values = self._merge_trailing_values(
            article_details_values, len(article_details_categories))

        # Definition of the dict to return to ELK
        article = {
            "URL": article_url,
            "Artikelstitel": article_title,
            "Preis": article_price,
        }
        article.update(zip(article_details_categories, article_details_values))
        article["Artikelsbeschreibung"] = article_description

        doc_id = self._extract_doc_id(article_url)
        if doc_id is None:
            self.logger.warning("No article id found in URL %s, article skipped.", article_url)
            return

        # Transform possible values into float or dates
        article = self.utilities.infer_data_types(article)

        # Push the article into ELK : if and only if the document doesn't exist already in the index
        index_name = self.config["elastic_index_name"]
        if not self.elastic_functions.check_article_existence(self.es, index_name, doc_id):
            outcome = self.elastic_functions.add_article_to_elastic(self.es, index_name, article, doc_id)
            if outcome is not None:
                # The helper reports a lost connection through its return value.
                self.logger.warning("Article %s not indexed: %s", doc_id, outcome)
        else:
            self.logger.info("Article %s already indexed in Elastic.", doc_id)

        yield article

    def _should_follow_next_pages(self):
        """Return True when ``scrape_next_pages`` is "True" (any case) or a JSON ``true``."""
        return str(self.config.get("scrape_next_pages", "")).strip().lower() == "true"

    def _clean_description(self, description_html):
        """Strip the HTML tags from *description_html* and return one block of text.

        Returns None when the page has no description element.
        """
        if description_html is None:
            return None
        text = self._HTML_TAG.sub('', description_html)
        # Collapse line breaks and runs of spaces into single spaces.
        return ' '.join(text.split())

    def _merge_trailing_values(self, values, category_count):
        """Fold the surplus trailing values into the value of the last category.

        Some fields (like "Ausstattung" or "Art") hold several values, so the
        page can expose more value cells than categories. Everything from the
        last category's position onwards is joined, stripped of commas and
        split on spaces; a single resulting word is stored as a plain string,
        otherwise the list of words is stored.

        The list is returned unchanged when there is no category or when
        there are fewer values than categories.
        """
        if category_count <= 0 or len(values) < category_count:
            return values
        head = values[:category_count - 1]
        merged = ' '.join(values[category_count - 1:]).replace(",", "")
        words = [word for word in merged.split(" ") if word]
        # In the case there is only 1 word, keep a simple string instead of a list.
        last_value = words[0] if len(words) == 1 else words
        return head + [last_value]

    def _extract_doc_id(self, article_url):
        """Return the sixth "/"-separated part of *article_url*, or None if it is missing."""
        parts = article_url.split("/")
        if len(parts) > 5 and parts[5]:
            return parts[5]
        return None
98 |
99 |
100 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------