├── ebaykleinanzeigen
│   ├── requirements.txt
│   ├── ebaykleinanzeigen
│   │   ├── items.pyc
│   │   ├── __init__.pyc
│   │   ├── settings.pyc
│   │   ├── spiders
│   │   │   ├── __init__.pyc
│   │   │   ├── utilities.pyc
│   │   │   ├── elastic_functions.pyc
│   │   │   ├── ebay_kleinanzeigen.pyc
│   │   │   ├── __init__.py
│   │   │   ├── elastic_functions.py
│   │   │   ├── utilities.py
│   │   │   └── ebay_kleinanzeigen.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   ├── start_urls.json
│   ├── config_file.json
│   └── scrapy.cfg
├── README.md
└── LICENSE
--------------------------------------------------------------------------------
/ebaykleinanzeigen/requirements.txt:
--------------------------------------------------------------------------------
elasticsearch==7.0.2
Scrapy==1.6.0
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/items.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/__init__.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/settings.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/start_urls.json:
--------------------------------------------------------------------------------
{
  "urls":[
    "https://www.ebay-kleinanzeigen.de/s-berlin/l3331",
    "https://www.ebay-kleinanzeigen.de/s-muenchen/l6411"
  ]
}
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HicBoux/Ebay-kleinanzeigen-scrapy-elastic/HEAD/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.pyc
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/ebaykleinanzeigen/config_file.json:
--------------------------------------------------------------------------------
{
  "protocol":"http",
  "elastic_username":"elastic",
  "elastic_password":"changeme",
  "elastic_address":"127.0.0.1",
  "elastic_port":"9200",
  "elastic_index_name":"ebay_kleinanzeigen",
  "elastic_connection_retry":5,
  "scrape_next_pages": "True"
}
--------------------------------------------------------------------------------
/ebaykleinanzeigen/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = ebaykleinanzeigen.settings

[deploy]
#url = http://localhost:6800/
project = ebaykleinanzeigen
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class EbaykleinanzeigenPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EbaykleinanzeigenItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/elastic_functions.py:
--------------------------------------------------------------------------------
import time

from elasticsearch import Elasticsearch


class ElasticFunctions:

    def connection_to_elastic(self, connection_request, retry_number=5):
        # We attempt to connect to Elastic several times until it works,
        # otherwise we raise an error
        es = Elasticsearch([connection_request], verify_certs=True)
        for attempt in range(retry_number):
            if es.ping():
                return es
            time.sleep(1)
        raise ValueError("Connection to Elasticsearch failed")

    def check_article_existence(self, elastic_client, index_name, doc_id):
        # True if a document with this id is already present in the index
        return elastic_client.exists(index=index_name, id=doc_id)

    def add_article_to_elastic(self, elastic_client, index_name, bulk_data, doc_id):
        # Index the article and refresh the index so it becomes searchable immediately
        if elastic_client.ping():
            res = elastic_client.index(index=index_name, doc_type="article", id=doc_id, body=bulk_data)
            print("------------------------------ Article added with:", res["_shards"])
            elastic_client.indices.refresh(index=index_name)
        else:
            return "Not connected to Elastic."
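
Taken together, these helpers implement a check-then-index round trip against Elasticsearch. A minimal usage sketch follows; the connection string matches the docker-elk defaults from config_file.json, while the document id and field values are made up for illustration:

```python
# Hypothetical round trip with ElasticFunctions; id and values are illustrative.
from ebaykleinanzeigen.spiders.elastic_functions import ElasticFunctions

ef = ElasticFunctions()
es = ef.connection_to_elastic("http://elastic:changeme@127.0.0.1:9200", retry_number=5)

doc_id = "1134567890"  # in the spider, this is the article id parsed from the URL
if not ef.check_article_existence(es, "ebay_kleinanzeigen", doc_id):
    ef.add_article_to_elastic(es, "ebay_kleinanzeigen", {"Artikelstitel": "Fahrrad", "Preis": 120}, doc_id)
```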
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/utilities.py:
--------------------------------------------------------------------------------
import json
from datetime import datetime


class Utilities:

    def load_urls(self, filename="start_urls.json"):
        # Load the list of start URLs from the given JSON file
        with open(filename) as file:
            links = json.load(file)
        return links['urls']

    def load_config_file(self, filename="config_file.json"):
        # Load the scraper and Elasticsearch configuration from the given JSON file
        with open(filename) as file:
            config = json.load(file)
        return config

    def is_int(self, value):
        try:
            int(str(value))
            return True
        except ValueError:
            return False

    def is_float(self, value):
        try:
            float(str(value))
            return True
        except ValueError:
            return False

    def is_date(self, value):
        try:
            datetime.strptime(str(value), '%d.%m.%Y')
            return True
        except ValueError:
            return False

    def infer_data_types(self, article):
        # Cast string values to int, float or datetime where possible
        for key in article:
            if isinstance(article[key], list): continue
            elif self.is_int(article[key]): article[key] = int(article[key])
            elif self.is_float(article[key]): article[key] = float(article[key])
            elif self.is_date(article[key]): article[key] = datetime.strptime(str(article[key]), '%d.%m.%Y')
        return article
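
A quick usage sketch of these helpers (hypothetical, not a file in the repository): it loads the config, builds the same `protocol://user:password@host:port` connection string the spider assembles in its `__init__`, and runs `infer_data_types` on a sample article. The field name `Erstellungsdatum` is made up for illustration:

```python
# Hypothetical usage of Utilities; config keys follow config_file.json.
from ebaykleinanzeigen.spiders.utilities import Utilities

utilities = Utilities()
config = utilities.load_config_file("config_file.json")

# Same connection string the spider builds in EbayKleinanzeigenSpider.__init__
connection_request = "{protocol}://{elastic_username}:{elastic_password}@{elastic_address}:{elastic_port}".format(**config)

# "1200" -> int, "01.05.2019" -> datetime, free text stays a string
article = {"Preis": "1200", "Erstellungsdatum": "01.05.2019", "Artikelstitel": "Fahrrad"}
print(utilities.infer_data_types(article))
```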
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# An Ebay-kleinanzeigen web scraper using Python and Scrapy to feed data into an Elasticsearch cluster with Kibana

The aim is to extract data from https://www.ebay-kleinanzeigen.de/ automatically and rapidly, store it in an Elasticsearch cluster, and get fast insights with Kibana.

## Requirements

- Python 3
- Elasticsearch 7.0.2
- Scrapy 1.6.0

## How to set it up

1) Start your Elasticsearch cluster with Kibana installed on it. If you don't have one, a fast way to get it is to install and run the docker-elk Docker images with the following steps:
```bash
git clone https://github.com/deviantony/docker-elk.git
cd docker-elk
docker-compose up -d
```

2) Set the URLs you want to scrape (like https://www.ebay-kleinanzeigen.de/s-berlin/l3331 for example) in the JSON file start_urls.json.

3) Set the various configuration parameters you wish in config_file.json:
```json
{
    "protocol": "http or https",
    "elastic_username": "the username used to connect to your Elasticsearch cluster",
    "elastic_password": "the password used to connect to your Elasticsearch cluster",
    "elastic_address": "the bound IP address of your Elasticsearch cluster",
    "elastic_port": "the bound port of your Elasticsearch cluster",
    "elastic_index_name": "the name of the Elasticsearch index to write to",
    "elastic_connection_retry": "the number of attempts to reconnect to your Elasticsearch cluster in case of failure",
    "scrape_next_pages": "boolean indicating whether the scraper follows the next pages (1, 2, 3...) listed at the bottom of each result page"
}
```
The file ships with the default login and server parameters of the docker-elk images.

4) Change into the scraper's directory and start the crawl:
```bash
cd .../ebaykleinanzeigen
scrapy crawl ebay_kleinanzeigen
```

5) The results show up in Elasticsearch and Kibana as soon as the data is scraped. Enjoy the insights by connecting to your Kibana home page (http://localhost:5601 by default in the Docker image)!

NB: To avoid overloading Ebay-kleinanzeigen's servers, settings.py caps concurrent requests at 20 and sets the delay between requests to 0.8 s by default.
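
Once the crawl is running, you can sanity-check that documents are arriving in the index. A minimal verification sketch, assuming the default docker-elk credentials and the index name from config_file.json:

```python
# Minimal verification sketch; credentials and index follow config_file.json defaults.
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://elastic:changeme@127.0.0.1:9200"], verify_certs=True)
print(es.count(index="ebay_kleinanzeigen"))  # e.g. {'count': 42, ...}

res = es.search(index="ebay_kleinanzeigen", body={"query": {"match_all": {}}, "size": 1})
for hit in res["hits"]["hits"]:
    print(hit["_id"], hit["_source"].get("Artikelstitel"), hit["_source"].get("Preis"))
```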

## References

- [Ebay-kleinanzeigen](https://www.ebay-kleinanzeigen.de/stadt/berlin/)
- [Elasticsearch-py](https://elasticsearch-py.readthedocs.io/en/master/)

## Credits

Copyright (c) 2019, HicBoux. Work released under the Apache 2.0 License.

(Please contact me if you wish to use my work under conditions not automatically allowed by the Apache 2.0 License.)

## Disclaimer

This solution has been made available for informational and educational purposes only. I hereby disclaim any and all liability to any party for any direct, indirect, implied, punitive, special, incidental or other consequential damages arising directly or indirectly from any use of this content, which is provided as is and without warranties. I also disclaim all responsibility for web scraping at a disruptive rate and for any damages caused by such use.
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for ebaykleinanzeigen project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ebaykleinanzeigen'

SPIDER_MODULES = ['ebaykleinanzeigen.spiders']
NEWSPIDER_MODULE = 'ebaykleinanzeigen.spiders'

# Crawl depth parameters: which level of depth to scrape first
#DEPTH_LIMIT = 0
#DEPTH_PRIORITY = 0

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'ebaykleinanzeigen (+https://www.ebay-kleinanzeigen.de)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 20

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.8
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ebaykleinanzeigen.middlewares.EbaykleinanzeigenSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ebaykleinanzeigen.middlewares.EbaykleinanzeigenDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ebaykleinanzeigen.pipelines.EbaykleinanzeigenPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class EbaykleinanzeigenSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class EbaykleinanzeigenDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/ebaykleinanzeigen/ebaykleinanzeigen/spiders/ebay_kleinanzeigen.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import re

import scrapy

from ebaykleinanzeigen.spiders.utilities import Utilities
from ebaykleinanzeigen.spiders.elastic_functions import ElasticFunctions


class EbayKleinanzeigenSpider(scrapy.Spider):
    name = 'ebay_kleinanzeigen'
    allowed_domains = ['ebay-kleinanzeigen.de']
    domain = 'https://www.ebay-kleinanzeigen.de'

    def __init__(self, *args, **kwargs):
        super(EbayKleinanzeigenSpider, self).__init__(*args, **kwargs)

        # Load home-made classes with helper functions
        self.utilities = Utilities()
        self.elastic_functions = ElasticFunctions()

        # Load the configuration files
        self.start_urls = self.utilities.load_urls("start_urls.json")
        self.config = self.utilities.load_config_file("config_file.json")

        # Connection to ELK
        connection_request = self.config["protocol"] + "://" + self.config["elastic_username"] + ":" + self.config["elastic_password"] + "@" + self.config["elastic_address"] + ":" + self.config["elastic_port"]
        self.es = self.elastic_functions.connection_to_elastic(connection_request, self.config["elastic_connection_retry"])

    def parse(self, response):
        # Follow every article link on the result page
        article_urls = response.xpath("//a[@class='ellipsis']/@href").extract()
        for url in article_urls:
            article_page = response.urljoin(self.domain + url)
            self.logger.info(article_page)
            yield scrapy.Request(url=article_page, callback=self.parse_article_page, dont_filter=True)

        # Follow the next result page if there is one and if it's agreed in the config
        next_page = response.xpath("//a[@class='pagination-next']/@href").extract_first()
        if next_page is not None and self.config["scrape_next_pages"] == "True":
            yield scrapy.Request(response.urljoin(self.domain + next_page), callback=self.parse)

    def parse_article_page(self, response):
        # Retrieve some data about the article
        article_url = response.url
        article_title = response.xpath("//h1[@class='articleheader--title']//text()").extract_first()
        price_texts = response.xpath("//h2[@class='articleheader--price']//text()").extract()
        article_price = price_texts[0][7:] if price_texts else None  # Strip the leading "Preis: " label
        article_description = response.xpath("//p[@itemprop='description']").extract_first()
        article_description = ' '.join(re.sub(r'<[^>]+>', ' ', article_description).split())  # Remove all HTML elements in order to have only one text block

        # Retrieve all data categories about the product ("type", "size", "delivery way" etc.)
        article_details_categories = [s.replace(":", "") for s in response.xpath("//dt[@class='attributelist--key']//text()").extract()]

        # Retrieve the place where the article is sold
        seller_place = ''.join(s.strip() for s in response.xpath("//dd[@class='attributelist--value' and @itemprop='seller']//text()").extract())

        # Retrieve all data (values) about the product ("2m", "Man" etc.)
        article_details_values = list(filter(None, [s.strip() for s in response.xpath("//dd[@class='attributelist--value' and not(@itemprop='seller')]//text()").extract()]))
        # Add the seller place to the list
        article_details_values.insert(0, seller_place)
        if ',' in article_details_values: article_details_values.remove(',')  # Remove useless cells
        # Since some fields (like "Ausstattung" or "Art") have several values, we try to merge them into a given field
        len_cat = len(article_details_categories)
        len_val = len(article_details_values)
        article_details_values[len_cat - 1] = article_details_values[len_cat - 1] + ' '  # Avoid the first ending value not being taken into account
        article_details_values[len_cat - 1:len_val] = [' '.join(article_details_values[len_cat - 1:len_val]).replace(",", "")]  # Merge all ending values
        article_details_values[-1] = list(filter(None, article_details_values[-1].split(" ")))  # Remove empty cells and split values
        # In the case there is only one value, we transform this list into a simple string
        if len(article_details_values[-1]) == 1: article_details_values[-1] = article_details_values[-1][0]

        # Definition of the dict to push to ELK
        article = []
        article.append(("URL", article_url))
        article.append(("Artikelstitel", article_title))
        article.append(("Preis", article_price))
        for i in range(len(article_details_categories)):
            article.append((article_details_categories[i], article_details_values[i]))
        article.append(("Artikelsbeschreibung", article_description))
        article = dict(article)  # Transformation into a dictionary
        doc_id = article["URL"].split("/")[5]  # Get the article's ID available in the URL

        # Cast possible values into ints, floats or dates
        article = self.utilities.infer_data_types(article)

        # Push the article into ELK, if and only if the document doesn't exist already in the index
        if not self.elastic_functions.check_article_existence(self.es, self.config["elastic_index_name"], doc_id):
            self.elastic_functions.add_article_to_elastic(self.es, self.config["elastic_index_name"], article, doc_id)
        else:
            self.logger.info("Article already indexed in Elastic.")

        yield article
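
The spider can also be launched from a plain Python script instead of the scrapy CLI. A minimal sketch, assuming it is run from the project root so that start_urls.json and config_file.json resolve:

```python
# Hypothetical launcher; equivalent to `scrapy crawl ebay_kleinanzeigen`.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from ebaykleinanzeigen.spiders.ebay_kleinanzeigen import EbayKleinanzeigenSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py via scrapy.cfg
process.crawl(EbayKleinanzeigenSpider)
process.start()  # blocks until the crawl finishes
```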
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------