├── scripts └── geckodriver │ ├── geckodriver-v0.19.1-linux64.tar.gz │ └── download.sh ├── .gitignore ├── requirements.txt ├── milanuncios ├── __init__.py ├── utils.py └── core.py ├── .travis.yml ├── COPYING ├── docs ├── install │ └── raspberrypi.md └── usage │ ├── english.ipynb │ └── spanish.ipynb ├── setup.py ├── README.md └── test └── test.py /scripts/geckodriver/geckodriver-v0.19.1-linux64.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mondeja/milanuncios/HEAD/scripts/geckodriver/geckodriver-v0.19.1-linux64.tar.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Syntax checker 2 | .pylintrc 3 | 4 | # Virtualenv 5 | env/ 6 | 7 | # Builds and cache 8 | __pycache__/ 9 | build/ 10 | 11 | # Log files 12 | geckodriver.log 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | bs4==0.0.1 3 | cachetools==2.0.1 4 | EasyProcess==0.2.3 5 | numpy==1.13.3 6 | pandas==0.20.0 7 | python-dateutil==2.6.1 8 | pytz==2017.3 9 | PyVirtualDisplay==0.2.1 10 | selenium==3.8.0 11 | six==1.11.0 12 | tqdm==4.19.5 13 | psutil==5.4.2 14 | -------------------------------------------------------------------------------- /milanuncios/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Milanuncios source""" 5 | 6 | __title__ = 'milanuncios' 7 | __version__ = '0.9.13' 8 | __author__ = 'Alvaro Mondejar Rubio ' 9 | __repo__ = 'https://github.com/mondeja/milanuncios' 10 | __license__ = 'BSD License' 11 | 12 | from .core import MilAnuncios, MilAnunciosLoginError 13 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: python 4 | 5 | python: 6 | - "3.6" 7 | #- "nightly" 8 | 9 | os: 10 | - linux 11 | 12 | addons: 13 | apt: 14 | packages: 15 | - "python3-pip" 16 | 17 | 18 | before_install: 19 | - sudo bash scripts/geckodriver/download.sh travis 20 | 21 | install: 22 | - sudo pip3 install -r requirements.txt 23 | - sudo python3 setup.py install 24 | 25 | before_script: 26 | # https://docs.travis-ci.com/user/gui-and-headless-browsers/#Using-xvfb-to-Run-Tests-That-Require-a-GUI 27 | - "export DISPLAY=:99.0" 28 | - "sh -e /etc/init.d/xvfb start" 29 | - sleep 3 # give xvfb some time to start 30 | 31 | 32 | script: 33 | - sudo python3 test/test.py 34 | 35 | branches: 36 | only: 37 | - staging -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Álvaro Mondéjar Rubio . 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms are permitted 5 | provided that the above copyright notice and this paragraph are 6 | duplicated in all such forms and that any documentation, advertising 7 | materials, and other materials related to such distribution and use 8 | acknowledge that the software was developed by Álvaro Mondéjar Rubio. The 9 | name of the Álvaro Mondéjar Rubio may not be used to endorse or promote 10 | products derived from this software without specific prior written 11 | permission. 12 | 13 | THIS SOFTWARE IS PROVIDED “AS IS” AND WITHOUT ANY EXPRESS OR IMPLIED 14 | WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 15 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
-------------------------------------------------------------------------------- /docs/install/raspberrypi.md: -------------------------------------------------------------------------------- 1 | ## Install on RaspberryPi 2 | ``` 3 | wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-arm7hf.tar.gz 4 | tar -xvzf geckodriver-v0.16.0-arm7hf.tar.gz 5 | sudo mv geckodriver /usr/local/bin/geckodriver 6 | sudo apt-get install iceweasel xvfb 7 | pip3 install bs4 cachetools pyvirtualdisplay selenium==3.3.2 tqdm psutil 8 | git clone https://github.com/mondeja/milanuncios.git 9 | cd milanuncios 10 | python3 setup.py install 11 | ``` 12 | 13 | If you want to recopile info, you need to install `pandas` also, but for autorenovate ads it's unnecesary. 14 | 15 | ### Usage tip 16 | RaspberryPi has little RAM memory, so you need to set a big delay between commands (10 seconds must be enough): 17 | ``` 18 | from milanuncios import MilAnuncios 19 | with MilAnuncios(delay=10) as ma: 20 | ... 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Packaging script for the milanuncios web scraper."""

import os
from distutils.core import setup

# Directory containing this setup.py; paths below are resolved against it.
BASEDIR = os.path.dirname(__file__)

# Read pinned dependencies. Strip newlines and skip blank lines so that
# install_requires receives clean requirement specifiers (readlines() used
# to pass entries with trailing "\n").
with open(os.path.join(BASEDIR, 'requirements.txt'), encoding='utf-8') as f:
    requirements = [line.strip() for line in f if line.strip()]

# README.md contains non-ASCII characters (e.g. "Álvaro"), so decode it
# explicitly as UTF-8 instead of relying on the platform locale.
with open(os.path.join(BASEDIR, 'README.md'), encoding='utf-8') as f:
    readme = f.read()

setup(
    name='milanuncios',
    version='0.9.13',
    url='https://github.com/mondeja/milanuncios',
    download_url='https://github.com/mondeja/milanuncios/archive/master.zip',
    author='Alvaro Mondejar ',
    author_email='mondejar1994@gmail.com',
    license='BSD License',
    packages=['milanuncios'],
    description='Python3 web scraper for milanuncios.com.',
    long_description=readme,
    keywords=['milanuncios', 'anuncios', 'segunda mano', 'scraper',
              'dinamic scraping', 'python', 'big data'],
    install_requires=requirements,
)
#!/bin/bash

# Download the geckodriver version required by the test suite across
# multiple OSes. Requires wget (installed automatically outside TravisCI).
#
# Usage: download.sh [travis]
#   Pass any first argument (conventionally "travis") to mark a TravisCI run.

travis=1
if [ -z "$1" ]  # no first argument -> not running on TravisCI
then
    travis=0
fi


# ======= URLs by OS and architecture =======
Linux86_64=https://github.com/mozilla/geckodriver/releases/download/v0.19.1/geckodriver-v0.19.1-linux64.tar.gz
# ===========================================

# Detect operating system from the shell's $OSTYPE
case "$OSTYPE" in
    linux*)   OS="linux" ;;
    darwin*)  OS="mac" ;;
    msys*)    OS="windows" ;;
    solaris*) OS="solaris" ;;
    bsd*)     OS="bsd" ;;
    *)        OS="unknown" ;;
esac


# Linux environments
if [ "$OS" == "linux" ]
then

    # If we are not on TravisCI, wget may be missing: try to install it
    if [ $travis -eq 0 ]
    then
        sudo apt-get install wget
    fi

    # 64bit architecture?
    if [ "$(uname -m)" == "x86_64" ]
    then
        # Download the driver into $HOME. Bug fix: the original ran
        # "wget -N $Linux86_64 P ~/" — the dash of the -P (directory
        # prefix) flag was missing, so "P" was parsed as an extra URL
        # and the tarball landed in the current directory.
        wget -N "$Linux86_64" -P ~/
        # Bug fix: extract into $HOME (-C ~/); plain "tar -xvf ~/..."
        # extracts into the current directory, but the TravisCI branch
        # below expects the binary at ~/geckodriver.
        tar -xvf ~/geckodriver-v0.19.1-linux64.tar.gz -C ~/
        rm ~/geckodriver-v0.19.1-linux64.tar.gz
        ls ~/
    fi

fi

# If we are on TravisCI, geckodriver needs to be on PATH
if [ $travis -eq 1 ]
then
    sudo mv -f ~/geckodriver /usr/local/share/geckodriver
    sudo chmod +x /usr/local/share/geckodriver
    sudo ln -s /usr/local/share/geckodriver /usr/local/bin/geckodriver
fi

echo "Where is geckodriver?"
whereis geckodriver
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Utils module"""

import logging
import datetime

# Log line layout shared by every handler this module configures.
DEFAULT_FORMAT = "%(asctime)s %(levelname)-8s %(name)s:%(lineno)d: %(message)s"
DEFAULT_FORMATTER = logging.Formatter(DEFAULT_FORMAT)

def create_logger(name, level=logging.INFO, handler=None,
                  propagate=True):
    """Returns a logger with given name, level and handler.

    Args:
        name (str): Logger name, passed to ``logging.getLogger``.
        level (int, optional): Logging level. As default, ``logging.INFO``.
        handler (logging.Handler, optional): Handler to attach. As default
            a new ``StreamHandler`` is created per call. (The previous
            signature used ``handler=logging.StreamHandler()``, a mutable
            default evaluated once, so all loggers created with defaults
            silently shared one handler instance.)
        propagate (bool, optional): Whether records propagate to ancestor
            loggers. As default, True.

    Returns:
        logging.Logger: The configured logger.
    """
    if handler is None:
        handler = logging.StreamHandler()
    logger = logging.getLogger(name)
    logger.setLevel(level)
    handler.setFormatter(DEFAULT_FORMATTER)
    # NOTE(review): repeated calls with the same name keep appending
    # handlers to the same logger object; callers appear to create each
    # logger once — confirm before deduplicating.
    logger.addHandler(handler)
    logger.propagate = propagate
    return logger

def extract_number(string, parse):
    """Returns a number from a string parsing it with a given type.

    Concatenates every digit character found in ``string`` (in order) and
    converts the result with ``parse``.

    Args:
        string (str): Text containing digits, e.g. ``"4 horas"``.
        parse (callable): Conversion applied to the digits, e.g. ``int``.

    Raises:
        ValueError: Propagated from ``parse`` when ``string`` holds no digits.
    """
    digits = "".join(char for char in string if char.isdigit())
    return parse(digits)

def parse_string_to_timedelta(string):
    """Convert a string in the form "4 horas" to a timedelta object.

    Args:
        string (str): Spanish duration text, e.g. ``"2 días"``, ``"30 min"``.

    Returns:
        datetime.timedelta: The equivalent duration.

    Raises:
        ValueError: If no known time unit is found in ``string``.
            (Previously an unmatched unit left ``arg`` unbound and raised
            an obscure ``NameError``.)
    """
    string_mapping = {"horas": "hours",
                      "hora": "hours",
                      "días": "days",
                      "día": "days",
                      "dia": "days",
                      "dias": "days",
                      "seg": "seconds",
                      "min": "minutes"}
    num = extract_number(string, int)
    for inp, outp in string_mapping.items():
        if inp in string:
            return datetime.timedelta(**{outp: num})
    raise ValueError("Unknown time unit in %r" % string)
renovator for [milanuncios.com](https://www.milanuncios.com). 6 | ### Scraper y autorenovador de anuncios para [milanuncios.com](https://www.milanuncios.com) en Python3. 7 | 8 | Milanuncios doesn't allow scraping their website's content by the usual methods, but it's possible to scrape the dynamic content through other techniques. In their [terms of service](https://www.milanuncios.com/condiciones/), they don't specify the conditions for this kind of scraping, but **AUTORENOVATION IS STRICTLY FORBIDDEN**, so I don't take responsibility for how you use this program; it **has been made for purely educational purposes**. 9 | 10 | I don't know if it works on Windows, Mac or Python 2; it has only been tested on Linux with Python 3. 11 | 12 | ## Requirements 13 | - Mozilla Firefox >= 57.0 14 | - [Geckodriver](https://github.com/mozilla/geckodriver/releases) 15 | 16 | ## Install 17 | From source use: 18 | ``` 19 | pip3 install https://github.com/mondeja/milanuncios/archive/master.zip 20 | ``` 21 | 22 | or 23 | 24 | ``` 25 | git clone https://github.com/mondeja/milanuncios.git 26 | cd milanuncios 27 | pip3 install -r requirements.txt 28 | python3 setup.py install 29 | ``` 30 | 31 | #### [Install on RaspberryPi](https://github.com/mondeja/milanuncios/tree/master/docs/install/raspberrypi.md) 32 | 33 | ## Usage 34 | - [Usage](https://mybinder.org/v2/gh/mondeja/milanuncios/master?filepath=docs%2Fusage%2Fenglish.ipynb) (english version) 35 | - [Uso](https://mybinder.org/v2/gh/mondeja/milanuncios/master?filepath=docs%2Fusage%2Fspanish.ipynb) (versión en español) 36 | 37 | ## Contribute 38 | 39 | - Issue Tracker: https://github.com/mondeja/milanuncios/issues 40 | - Source Code: https://github.com/mondeja/milanuncios 41 | 42 | ## Support 43 | 44 | If you are having issues, please let me know (mondejar1994@gmail.com). 45 | 46 | ## License 47 | 48 | Copyright (c) 2017 Álvaro Mondéjar Rubio. 49 | All rights reserved. 
50 | 51 | Redistribution and use in source and binary forms are permitted 52 | provided that the above copyright notice and this paragraph are 53 | duplicated in all such forms and that any documentation, advertising 54 | materials, and other materials related to such distribution and use 55 | acknowledge that the software was developed by Álvaro Mondéjar Rubio. The 56 | name of the Álvaro Mondéjar Rubio may not be used to endorse or promote 57 | products derived from this software without specific prior written 58 | permission. 59 | 60 | THIS SOFTWARE IS PROVIDED “AS IS” AND WITHOUT ANY EXPRESS OR IMPLIED 61 | WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 62 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 63 | 64 | 65 | ## Buy me a coffee? 66 | 67 | If you feel like buying me a coffee (or a beer?), donations are welcome: 68 | 69 | ``` 70 | BTC : 1LfUF4AcvH7Wd1wTc7Mmqobj4AypUbpvN5 71 | ETH : 0x7428fE875226880DaD222c726F6340eec42Db567 72 | STEEM: @mondeja 73 | ``` 74 | 75 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Test milanuncios web scraper and adverts renewer""" 5 | 6 | # To disable warnings: python3 -W ignore test.py 7 | 8 | # Standard libraries 9 | import unittest 10 | 11 | # External libraries 12 | from pandas import DataFrame 13 | 14 | # Internal modules 15 | from milanuncios import MilAnuncios 16 | 17 | # If you want to test renewer you need to config email and password 18 | config = { 19 | "email": None, 20 | "password": None, 21 | "ad_titles": None, # Used to renew ads by title name (list) 22 | "ads_number": 1, # Used to renew ads by number of them (int) 23 | 24 | "debug": False, 25 | "delay": 3, 26 | "timeout": 15, 27 | # Selenium drivers 28 | "executable_path": "geckodriver", 29 | "log_path": "geckodriver.log", 30 | 31 | # 
Test end to end 32 | "full": False 33 | } 34 | 35 | params = ["debug", "delay", "timeout", 36 | "executable_path", "log_path"] 37 | options = {param: config[param] for param in params} 38 | 39 | class TestWebScraper(unittest.TestCase): 40 | """MilAnuncios web scraper tests""" 41 | 42 | def setUp(self): 43 | self.ma = MilAnuncios(**options) 44 | self.ma.__enter__() 45 | 46 | def tearDown(self): 47 | self.ma.__exit__() 48 | if self.ma.debug: 49 | self.ma.logger.debug("Firefox processes opened: %r", 50 | self.ma.firefox_user_processes) 51 | 52 | # === INFO TESTS === 53 | def test_regions(self): 54 | collect_regions = self.ma._get_regions() 55 | hardcoded_regions = self.ma.regions 56 | self.assertEqual(collect_regions, hardcoded_regions) 57 | 58 | def test_categories(self): 59 | categories = self.ma.categories 60 | self.assertIs(type(categories), list) 61 | self.assertGreater(len(categories), 10) 62 | 63 | def test_subcategories(self): 64 | subcategories = self.ma.subcategories("motor") 65 | self.assertIs(type(subcategories), list) 66 | self.assertGreater(len(subcategories), 5) 67 | 68 | # === SEARCH TESTS === 69 | def test_search(self): 70 | # Query basic search 71 | response = self.ma.search("sofa") 72 | self.assertIn(type(response), (DataFrame, list)) 73 | 74 | def test_search_category(self): 75 | # Query basic search by category 76 | response = self.ma.search_category("motor") 77 | self.assertIs(type(response), DataFrame) 78 | self.assertEqual(response.empty, False) 79 | 80 | 81 | @unittest.skipIf(not config["full"], 82 | 'Cache testing only posible if config["full"] == True') 83 | class TestWebScraperCache(unittest.TestCase): 84 | """MilAnuncios cache tests""" 85 | def setUp(self): 86 | options["init_cache"] = True 87 | self.ma = MilAnuncios(**options) 88 | del options["init_cache"] 89 | self.ma.__enter__() 90 | 91 | def tearDown(self): 92 | self.ma.__exit__() 93 | 94 | def assert_cached(self, dictionary): 95 | self.assertIs(type(dictionary), dict) 96 | 
self.assertGreater(dictionary, 10) 97 | 98 | def test_categories_cache(self): 99 | self.assert_cached(self.ma.cache["categories"]) 100 | 101 | def test_subcategories_cache(self): 102 | self.assert_cached(self.ma.cache["subcategories"]) 103 | 104 | 105 | @unittest.skipIf(not config["email"] or not config["password"], 106 | "For account methods testing you must provide email and password in config") 107 | class TestAccount(unittest.TestCase): 108 | """MilAnuncios account tests""" 109 | def setUp(self): 110 | self.ma = MilAnuncios(**options) 111 | self.ma.__enter__() 112 | 113 | def tearDown(self): 114 | self.ma.__exit__() 115 | 116 | def test_login(self): 117 | self.ma.login(config["email"], 118 | config["password"]) 119 | self.assertEqual(self.ma.logged, True) 120 | 121 | def test_my_ads(self): 122 | # Test my ads with login 123 | ads = self.ma.my_ads(config["email"], 124 | config["password"]) 125 | self.assertIs(type(ads), DataFrame) 126 | 127 | def test_login_my_ads(self): 128 | # First login, then get my_ads 129 | if self.ma.login(config["email"], 130 | config["password"]): 131 | ads = self.ma.my_ads() 132 | self.assertIs(type(ads), DataFrame) 133 | 134 | def test_renew_ads(self): 135 | # If we are renewing by name 136 | if config["ad_titles"]: 137 | if self.ma.login(config["email"], 138 | config["password"]): 139 | renewed = self.ma.renew_ads(ads=config["ad_titles"]) 140 | else: 141 | if config["ads_number"]: 142 | # If we are renewing by number 143 | if self.ma.login(config["email"], 144 | config["password"]): 145 | renewed = self.ma.renew_ads(number=config["ads_number"]) 146 | else: 147 | if self.ma.login(config["email"], 148 | config["password"]): 149 | renewed = self.ma.renew_ads() 150 | self.assertGreater(renewed, 0) 151 | 152 | if __name__ == "__main__": 153 | unittest.main() -------------------------------------------------------------------------------- /docs/usage/english.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#
How it works
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Certain web pages try to restrict the possibility of being visited by robots creating content that shows on clients by a dinamic way (trough Javascript code), like [milanuncios.com](https://www.milanuncios.com/).\n", 15 | "\n", 16 | "### Static scraping in Python\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "None\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import requests\n", 34 | "from bs4 import BeautifulSoup\n", 35 | "\n", 36 | "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n", 37 | "\n", 38 | "soup = BeautifulSoup(requests.get(url).content, \"html.parser\")\n", 39 | "print(soup.find(class_=\"aditem-detail-title\"))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Although, is very easy avoid these limitations...\n", 47 | "\n", 48 | "### Dinamic scraping in Python" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "MASAJES Y TRATAMIENTOS FISIO DOMICILIO\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from pyvirtualdisplay import Display # pip3 install pyvirtualdisplay\n", 66 | "from selenium import webdriver # pip3 install selenium\n", 67 | "from bs4 import BeautifulSoup # pip3 install bs4\n", 68 | "import time\n", 69 | "\n", 70 | "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n", 71 | "\n", 72 | "display = Display(visible=0, size=(800, 600))\n", 73 | "display.start()\n", 74 | "browser = webdriver.Firefox() # You need geckodriver and Mozilla Firefox\n", 75 | "browser.get(url)\n", 76 | "time.sleep(.8)\n", 77 | "soup = BeautifulSoup(browser.page_source, 
\"html.parser\")\n", 78 | "print(soup.find(class_=\"aditem-detail-title\"))\n", 79 | "browser.quit()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "________________________________________________\n", 87 | "\n", 88 | "#
Documentation
\n", 89 | "\n", 90 | "## Basic usage\n", 91 | "The 4 main methods to realize most of queries in are the next:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from milanuncios import MilAnuncios\n", 101 | "\n", 102 | "# We must enter to milanuncios like context (with statement)\n", 103 | "with MilAnuncios() as ma:\n", 104 | "\n", 105 | " # Obtain main categories of home page:\n", 106 | " print(ma.categories) \n", 107 | "\n", 108 | " # Obtain subcategories (and subsubcategories recursively) from a main category:\n", 109 | " print(ma.subcategories(\"servicios\"))\n", 110 | "\n", 111 | " # Realize a query (we indicates number of pages, 1 page as default):\n", 112 | " print(ma.search(\"gatos\", pages=3)) # Returns a pandas' DataFrame\n", 113 | "\n", 114 | " # Realize a search by category/sucategory:\n", 115 | " print(ma.search_category(\"juegos\", subcategory=\"videoconsolas\").tail())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Ads renewal" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from milanuncios import MilAnunciosLoginError\n", 132 | "\n", 133 | "# delay parameter indicates how many seconds to wait loading pages and before perform actions\n", 134 | "# Is 1.5 as default, if you have runtime troubles try to increase it\n", 135 | "with MilAnuncios(delay=3) as ma:\n", 136 | " # Login in milanuncios\n", 137 | " ma.login(\"tu_email@proveedor.com\", \"tu_contraseña\") # If login fails MilAnunciosLoginError will be raised\n", 138 | " assert ma.logged == True\n", 139 | " \n", 140 | " # Obtain your ads\n", 141 | " ma.my_ads(dataframe=False) # As default returns a pandas' DataFrame, but you can retrieve a list also\n", 142 | " \n", 143 | " # Renew your ads\n", 144 | " # You can renew by title of by number of 
adverts\n", 145 | " # The program will ignore that adverts wich can't be renewed yet\n", 146 | " ma.renew_ads(title=[\"Título de mi anuncio\", \"Otro, da igual si es en minúscula o mayúscula\"]) # Por nombre\n", 147 | " \n", 148 | " ma.renew_ads(number=3) # First 3 that can be renewed in your adverts list\n", 149 | " \n", 150 | " # This method returns the number of adverts renewed" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "You also can obtain or renew your ads without execute `login()` method, only pass it your email and password to ` my_ads()` o `renew_ads()` as two first parameters:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "options = {}\n", 167 | "#options = dict(debug=True) # If you want to display on screen renovation process activate debug\n", 168 | "\n", 169 | "with MilAnuncios(**options) as ma:\n", 170 | " ma.my_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")\n", 171 | " \n", 172 | " # If you don't pass number or titles list parameters will be renewed all adverts\n", 173 | " ma.renew_ads(\"tu_email@proveedor.com\", \"tu_contraseña\") " 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "\n", 181 | "_____________________________\n", 182 | "## Advanced usage\n", 183 | "### Filters" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# We can filter by regions in both methods:\n", 193 | "print(ma.search(\"guitarra flamenca\", region=\"cadiz\").head())\n", 194 | "# To consult all possible regions (provinces and autonomous communities):\n", 195 | "print(ma.regions)\n", 196 | "\n", 197 | "# If you want to search only by regions (https://www.milanuncios.com/anuncios-en-sevilla/)\n", 198 | "print(ma.search(\"\", region=\"sevilla\").head()) # Void 
string in query parameter\n", 199 | "\n", 200 | "# We can filter by offer and demand. As default, offer and demand paramters are True:\n", 201 | "print(ma.search_category(\"inmobiliaria\", subcategory=\"alquiler de casas\", offer=False))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Caching\n", 209 | "If you are going to make a lot of queries in only one session, it is convenient to caching all subcategories when instantiating the scraper class. It will take some time storing in memory all subcategories, but then the performance increases:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "from pprint import pprint\n", 219 | "\n", 220 | "ma = MilAnuncios(init_cache=True)\n", 221 | "pprint(ma.cache[\"categories\"])\n", 222 | "pprint(ma.cache[\"subcategories\"])" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.5.3" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /docs/usage/spanish.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#
Cómo funciona
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Ciertas páginas web intentan restringir la posibilidad de ser visitadas por robots creando el contenido que muestran en el cliente de forma dinámica (por medio de código Javascript), como es el caso de [milanuncios.com](https://www.milanuncios.com/).\n", 15 | "\n", 16 | "### Scraping estático en Python\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "None\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import requests\n", 34 | "from bs4 import BeautifulSoup\n", 35 | "\n", 36 | "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n", 37 | "\n", 38 | "soup = BeautifulSoup(requests.get(url).content, \"html.parser\")\n", 39 | "print(soup.find(class_=\"aditem-detail-title\"))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Sin embargo, es muy fácil saltarse estas limitaciones...\n", 47 | "\n", 48 | "### Scraping dinámico en Python" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "MASAJES Y TRATAMIENTOS FISIO DOMICILIO\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from pyvirtualdisplay import Display # pip3 install pyvirtualdisplay\n", 66 | "from selenium import webdriver # pip3 install selenium\n", 67 | "from bs4 import BeautifulSoup # pip3 install bs4\n", 68 | "import time\n", 69 | "\n", 70 | "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n", 71 | "\n", 72 | "display = Display(visible=0, size=(800, 600))\n", 73 | "display.start()\n", 74 | "browser = webdriver.Firefox() # Necesitas geckodriver y Mozilla Firefox\n", 75 | "browser.get(url)\n", 76 | "time.sleep(.8)\n", 77 | "soup 
= BeautifulSoup(browser.page_source, \"html.parser\")\n", 78 | "print(soup.find(class_=\"aditem-detail-title\"))\n", 79 | "browser.quit()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "________________________________________________\n", 87 | "\n", 88 | "#
Documentación
\n", 89 | "\n", 90 | "## Uso básico\n", 91 | "Los 4 métodos principales para realizar la gran mayoría de consultas en la página de [milanuncios.com](https://www.milanuncios.com/) son los siguientes:" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from milanuncios import MilAnuncios\n", 101 | "\n", 102 | "# Debemos entrar a MilAnuncios como contexto (con la sentencia with)\n", 103 | "with MilAnuncios() as ma:\n", 104 | "\n", 105 | " # Obtener las familias de categorías de la página principal:\n", 106 | " print(ma.categories) \n", 107 | "\n", 108 | " # Obtener las subcategorías de una categoría:\n", 109 | " print(ma.subcategories(\"servicios\"))\n", 110 | "\n", 111 | " # Realizar una consulta (indicamos el número de páginas, por defecto 1 página):\n", 112 | " print(ma.search(\"gatos\", pages=3)) # Devuelve un DataFrame de pandas\n", 113 | "\n", 114 | " # Realizar una búsqueda por categoría/subcategoría:\n", 115 | " print(ma.search_category(\"juegos\", subcategory=\"videoconsolas\").tail())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Renovación de anuncios" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from milanuncios import MilAnunciosLoginError\n", 132 | "\n", 133 | "# El parámetro delay indica los segundos que esperamos en cargar las páginas y antes de ejecutar las acciones.\n", 134 | "# Por defecto es 1.5, si tienes problemas de ejecución prueba a aumentarlo\n", 135 | "with MilAnuncios(delay=3) as ma:\n", 136 | " # Loguearte en milanuncios\n", 137 | " ma.login(\"tu_email@proveedor.com\", \"tu_contraseña\") # Si falla el login se levantará MilAnunciosLoginError\n", 138 | " assert ma.logged == True\n", 139 | " \n", 140 | " # Obtener tus anuncios\n", 141 | " ma.my_ads(dataframe=False) # Por defecto 
devuelve un DataFrame de pandas, pero así devuelve una lista\n", 142 | " \n", 143 | " # Renovar tus anuncios\n", 144 | " # Puedes hacerlo por títulos o por número de anuncios\n", 145 | " # El programa ignorará los anuncios que no se pueden renovar aún\n", 146 | " ma.renew_ads(title=[\"Título de mi anuncio\", \"Otro, da igual si es en minúscula o mayúscula\"]) # Por nombre\n", 147 | " \n", 148 | " ma.renew_ads(number=3) # Los primeros 3 que se puedan renovar de tu lista de anuncios\n", 149 | " \n", 150 | " # Este método devuelve el número de anuncios renovados" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "También puedes obtener tus anuncios o renovarlos sin pasar por el método `login()`, sólo tienes que proveer tu email y contraseña a los métodos `my_ads()` o `renew_ads()` como primeros dos parámetros:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "options = {}\n", 167 | "#options = dict(debug=True) # Si quieres mostrar en pantalla el proceso de renovación activa el debug\n", 168 | "\n", 169 | "with MilAnuncios(**options) as ma:\n", 170 | " ma.my_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")\n", 171 | " \n", 172 | " # Si no pasas número ni lista de títulos se renovarán todos los anuncios:\n", 173 | " ma.renew_ads(\"tu_email@proveedor.com\", \"tu_contraseña\") " 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "___________________________________\n", 181 | "\n", 182 | "## Uso avanzado\n", 183 | "### Filtros" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "ma = MilAnuncios()\n", 193 | "ma.__enter__() # Así también podemos entrar como contexto\n", 194 | "\n", 195 | "# Podemos filtrar por regiones en ambos métodos:\n", 196 | "print(ma.search(\"guitarra 
flamenca\", region=\"cadiz\").head())\n", 197 | "# Para consultar todas las regiones posibles (provincias y comunidades autónomas):\n", 198 | "print(ma.regions)\n", 199 | "\n", 200 | "# Si quieres buscar sólo por regiones (https://www.milanuncios.com/anuncios-en-sevilla/)\n", 201 | "print(ma.search(\"\", region=\"sevilla\").head()) # Cadena vacía en el parámetro query\n", 202 | "\n", 203 | "# Podemos filtrar por oferta y demanda. Por defecto, los parámetros offer y demand equivalen a True:\n", 204 | "print(ma.search_category(\"inmobiliaria\", subcategory=\"alquiler de casas\", offer=False))\n", 205 | "\n", 206 | "ma.__exit__() # No te olvides de salir o acumularás procesos de Firefox en memoria" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Cache\n", 214 | "Si vas a realizar muchas consultas en una sola sesión, es conveniente cachear todas las subcategorías al instanciar el scraper. Tardará un tiempo en guardar en memoria todas las subcategorías pero luego el rendimiento aumentará considerablemente:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "from pprint import pprint\n", 224 | "\n", 225 | "with MilAnuncios(init_cache=True) as ma:\n", 226 | " pprint(ma.cache[\"categories\"])\n", 227 | " pprint(ma.cache[\"subcategories\"])" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.5.3" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 2 252 | } 253 | 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: milanuncios/core.py

"""Core module: a Selenium/Firefox based scraper for milanuncios.com."""

# Standard libraries
import os
import signal
import time
import re
import random
import logging
import datetime
import platform
from uuid import uuid4
from subprocess import Popen, PIPE  # noqa: F401 (kept: existing file surface)

# External libraries
import psutil
from pyvirtualdisplay import Display
from cachetools import Cache
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Internal modules
from milanuncios.utils import (create_logger,
                               parse_string_to_timedelta)


class MilAnunciosLoginError(RuntimeError):
    """Raised when login against milanuncios.com fails after all attempts."""
    pass


class MilAnuncios:
    """Main scraper class, used as a context manager.

    Args:
        delay (float, optional): Time to wait until a page is loaded
            before scraping it (in seconds). As default 1.5.
        timeout (float, optional): Timeout for requests. As default 15.
        init_cache (bool, optional): Cache the whole categories tree on
            ``__enter__`` (slow once, much faster searches later).
            As default False.
        executable_path (str, optional): Geckodriver executable path.
            As default "geckodriver" (needs to be in PATH).
        log_path (str, optional): Geckodriver log path. As default
            "geckodriver.log".
        cache (cachetools.Cache, optional): Cache instance; a fresh
            ``Cache(24)`` is created per instance when not given.
        logger (logging.Logger, optional): Logger to use; one is created
            when not given.
        debug (bool, optional): Verbose logging; also keeps the browser
            alive on ``__exit__`` for inspection. As default False.
        firefox_binary (str, optional): Firefox binary path (used if you
            are running on RaspberryPi). As default "/usr/bin/firefox".
        display (bool, optional): Display web browser navigation
            in real time, useful for debug (doesn't work on RaspberryPi).
            As default False.
    """
    def __init__(self, delay=1.5, timeout=15, init_cache=False,
                 executable_path="geckodriver", log_path="geckodriver.log",
                 cache=None, logger=None,
                 debug=False, firefox_binary="/usr/bin/firefox",
                 display=False):
        self.main_url = "https://www.milanuncios.com"

        self.timeout = timeout
        self.delay = delay
        self.debug = debug
        self.init_cache = init_cache

        # FIX: cache and logger were mutable default arguments evaluated
        # once at import time and shared by every instance; build them
        # per instance instead (callers passing their own are unaffected).
        self.logger = logger if logger is not None else create_logger("milanuncios")
        if self.debug:
            self.logger.setLevel(logging.DEBUG)
        self.cache = cache if cache is not None else Cache(24)

        self._executable_path = executable_path
        self._log_path = log_path
        self._firefox_binary = firefox_binary

        # Attributes defined on __enter__
        self.session = None
        self.firefox_user_processes = None
        self.browser = None
        self.browser_pid = None

        self.display = display

        # Account state
        self.logged = False
        self._logged_soup = None

    def __enter__(self):
        """Start the browser session (and optionally warm the cache)."""
        self._start_session()
        if self.init_cache:
            self._initialize_cache()
        return self

    def __exit__(self, *excs):
        """Close the session, logging any exception raised in the context.

        ``excs`` is the usual ``(exc_type, exc_value, traceback)`` triple;
        all three are None when the context exits cleanly.
        """
        # FIX: 'if excs:' was always true (a 3-tuple is truthy); only log
        # when an actual exception happened, and log the exception value.
        if excs and excs[0] is not None:
            self.logger.error(excs[1], exc_info=True)
        if not self.debug:  # in debug mode keep the browser open
            self._end_session()
        return False  # never swallow the exception

    def _initialize_cache(self):
        """Cache the whole categories/subcategories tree."""
        self.logger.info("Caching categories tree, please wait...")
        for category in tqdm(self.categories):
            self.subcategories(category)

    @staticmethod
    def _get_firefox_processes():
        """Return the PIDs of the currently running Firefox processes."""
        response = []
        for proc in psutil.process_iter():
            try:
                if "firefox" in proc.name():
                    response.append(int(proc.pid))  # FIX: public .pid, not ._pid
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue  # process vanished (or is restricted) mid-iteration
        return response

    def _start_in_raspberry(self):
        """Start the driver on a RaspberryPi. You need to install iceweasel
        and download geckodriver version 0.16.0."""
        msg = "Initializing driver for RaspberryPi. Firefox binary path: %s"
        self.logger.debug(msg, self._firefox_binary)
        caps = webdriver.DesiredCapabilities().FIREFOX
        caps["marionette"] = False  # old geckodriver can't speak marionette
        binary = webdriver.firefox.firefox_binary.FirefoxBinary(self._firefox_binary)
        # FIX: the capabilities were built but never handed to the driver
        return webdriver.Firefox(firefox_binary=binary, capabilities=caps)

    def _start_session(self):
        """Start a virtual-display + selenium browser session."""
        self.session = uuid4()
        self.logger.debug("Starting session %s...", self.session)

        # Snapshot the user's firefox processes so the one spawned by
        # selenium can be identified (and killed) later.
        self.firefox_user_processes = self._get_firefox_processes()

        # pyvirtualdisplay magic happens here
        visible = 1 if self.display and platform.node() != "raspberrypi" else 0
        display = Display(visible=visible, size=(1024, 768))
        display.start()

        # selenium browser
        if platform.node() == "raspberrypi":
            self.browser = self._start_in_raspberry()
        else:
            self.browser = webdriver.Firefox(executable_path=self._executable_path,
                                             log_path=self._log_path)
        self.browser.set_script_timeout(self.timeout)
        self.browser.set_page_load_timeout(self.timeout)

        # Save the PID of the newly spawned browser process
        for pid in self._get_firefox_processes():
            if pid not in self.firefox_user_processes:
                self.browser_pid = int(pid)

    def _end_session(self):
        """End the scraper session, killing the browser process."""
        self.logged = False
        if self.browser_pid is not None:  # FIX: guard against no PID found
            os.kill(self.browser_pid, signal.SIGKILL)

    def kill_firefox(self):
        """Kill all Firefox processes. Useful for development or if you
        experience errors in requests; a fresh session is started after."""
        for pid in self._get_firefox_processes():
            os.kill(int(pid), signal.SIGKILL)
        self._start_session()  # we need a live browser to keep working

    def _get_regions(self):
        """Scrape every region from milanuncios.com
        (use the ``regions`` property for a faster, hardcoded response)."""
        def parser(soup):
            """Regions parser"""
            response = []
            for prov in soup.find(id="protmp").find_all("option"):
                value = prov["value"]
                if value != "":
                    response.append(value)
            return response
        url = "https://www.milanuncios.com/ofertas-de-empleo/"
        return self.__call__(url, parser)

    @property
    def regions(self):
        """All possible regions hardcoded, for filtering responses."""
        return [
            'alava', 'albacete', 'alicante', 'almeria', 'andalucia', 'aragon',
            'asturias', 'avila', 'badajoz', 'baleares', 'barcelona', 'burgos',
            'caceres', 'cadiz', 'cantabria', 'canarias', 'castellon',
            'castilla_la_mancha', 'castilla_y_leon', 'catalunya', 'ceuta',
            'ciudad_real', 'cordoba', 'cuenca', 'extremadura', 'galicia',
            'girona', 'granada', 'guadalajara', 'guipuzcoa', 'huelva',
            'huesca', 'jaen', 'la_coruna', 'la_rioja', 'las_palmas', 'leon',
            'lleida', 'lugo', 'madrid', 'malaga', 'melilla', 'murcia', 'navarra',
            'ourense', 'pais_vasco', 'palencia', 'pontevedra', 'salamanca',
            'segovia', 'sevilla', 'soria', 'tarragona', 'tenerife', 'teruel',
            'toledo', 'valencia', 'comunidad_valenciana', 'valladolid', 'vizcaya',
            'zamora', 'zaragoza'
        ]

    @staticmethod
    def _offer_demand_parser(offer, demand):
        """Map offer/demand flags to the 'demanda' query-string value.

        Returns "n" (hide demands) when only offers are wanted, "s" when
        only demands are wanted, and None when both (or neither) filter
        is active, in which case no parameter is sent.
        """
        demand_param = None
        if offer != demand:  # exactly one of the two filters is active
            demand_param = "n" if offer else "s"
        return demand_param

    @property
    def current_soup(self):
        """BeautifulSoup of the page currently displayed in the browser."""
        return BeautifulSoup(self.browser.page_source, "html.parser")

    def __call__(self, url, callback):
        """Main internal entry point for every request of the scraper.

        Args:
            url (str): Endpoint to retrieve.
            callback (function): Parser that receives the page soup.

        Returns:
            callback(soup)
        """
        self.browser.get(url)
        time.sleep(self.delay)  # give the page time to render
        return callback(self.current_soup)

    @property
    def categories(self):
        """All main categories from the home page (cached after first use).

        Returns: list
        """
        self.logger.debug("Obtaining main categories...")

        def parser(soup):
            """Categories parser"""
            response = {}
            for categoria in soup.find_all(class_="catIcono"):
                link = categoria.find("a")
                response[link["title"].lower()] = self.main_url + link["href"]
            # Populate the cache only on first retrieval.
            # FIX: the original did this in a try/finally whose 'return'
            # inside 'finally' silently swallowed any exception.
            if "categories" not in self.cache:
                self.cache["categories"] = response
            return list(response.keys())

        try:
            # FIX: return the key list here too (the original returned the
            # cached *dict* but a fresh *list*, an inconsistent type).
            return list(self.cache["categories"])
        except KeyError:
            return self.__call__(self.main_url, parser)

    def subcategories(self, category):
        """Obtain all subcategories (and sub-subcategories recursively)
        from a given main category.

        Args:
            category (str): Category for which to obtain subcategories.

        Returns: list

        Raises:
            ValueError: if the category does not exist on milanuncios.com.
        """
        self.logger.debug("Obtaining subcategories for %s category", category)

        def parser(soup):
            """Subcategories parser"""
            response = {}
            classes = ["smoMainCat", "smoL2Cat", "smoL3Cat", "smoL4Cat", "smoL5Cat"]
            for cls in classes:
                for subcategory in soup.find_all(class_=cls):
                    # FIX: 'name[-1] == " "' crashed on empty names
                    name = subcategory.string.lower().rstrip()
                    href = subcategory.find("a")["href"]
                    response[name] = href
                    # setdefault replaces the original try/KeyError dance
                    self.cache.setdefault("subcategories", {})[name] = \
                        self.main_url + href
            return list(response.keys())

        if "categories" not in self.cache:
            _ = self.categories  # accessed for its cache-filling side effect

        try:
            url = self.cache["categories"][category]
        except KeyError:
            raise ValueError("Category %s not found in milanuncios.com" % category)
        return self.__call__(url, parser)

    def _ads_parser(self, soup):
        """Parse every ad (title, desc, price, href) found in a page."""
        response = []
        for anuncio in soup.find_all(class_="aditem-detail"):
            _title = anuncio.find(class_="aditem-detail-title")
            title = _title.string
            href = self.main_url + _title["href"]
            # Strip the HTML tags out of the description markup
            desc = re.sub(r"<.*?>", "", repr(anuncio.find(class_="tx")))
            try:
                price = anuncio.find(class_="aditem-price").next_element
            except AttributeError:  # some ads carry no price at all
                price = None
            response.append({"title": title, "desc": desc,
                             "price": price, "href": href})
        return response

    def search(self, query, pages=1, region=None, offer=True, demand=True):
        """Search adverts by query string.

        Args:
            query (str): String to search in milanuncios. An empty string
                searches by region only.
            pages (int, optional): Number of pages retrieved. As default 1.
            region (str, optional): Restrict results to a region
                (see ``regions``). As default None.
            offer (bool, optional): Include offers. As default True.
            demand (bool, optional): Include demands. As default True.

        Returns:
            pandas.DataFrame (or [] if nothing was found)

        Raises:
            ValueError: if ``region`` is not a valid region.
        """
        from pandas import DataFrame
        self.logger.info("Searching all adverts that contain %s", query)

        query = query.replace(" ", "-")
        response = []
        endpoint = "/anuncios/"

        # Region filter
        if region:
            region = region.replace(" ", "_").lower()
            if region not in self.regions:
                raise ValueError(
                    "Region %s is not a valid region, see self.regions" % region)
            # FIX: the original produced "/anuncios/-en-X/"; the site uses
            # "/anuncios-en-X/" (e.g. milanuncios.com/anuncios-en-sevilla/)
            endpoint = "/anuncios-en-%s/" % region

        demand_param = self._offer_demand_parser(offer, demand)

        for page in tqdm(range(1, pages + 1)):
            url = self.main_url + endpoint
            if query:
                url += "%s.htm" % query
            url += "?pagina=%d&" % page
            if demand_param:
                url += "demanda=%s&" % demand_param
            new_ads = self.__call__(url, self._ads_parser)
            response += new_ads
            if not new_ads:  # an empty page means we ran out of results
                self.logger.info("%d pages found", page - 1)
                break

        if response:
            return DataFrame(response, columns=list(response[0].keys()))
        return []

    def search_category(self, category, subcategory=None, pages=1,
                        region=None, offer=True, demand=True):
        """Search by category (and optional subcategory).

        Args:
            category (str): Category to search.
            subcategory (str, optional): Optional subcategory for a more
                precise search. As default None.
            pages (int, optional): Maximum number of pages to retrieve.
                As default 1.
            region (str, optional): Restrict results to a region
                (see ``regions``). As default None.
            offer (bool, optional): Include offers. As default True.
            demand (bool, optional): Include demands. As default True.

        Returns:
            pandas.DataFrame (or [] if nothing was found)

        Raises:
            ValueError: if ``region`` is not a valid region.
        """
        from pandas import DataFrame
        self.logger.info("Searching by category: %s", category)

        if subcategory:
            try:
                endpoint = self.cache["subcategories"][subcategory.lower()]
            except KeyError:  # not cached yet: load them from the parent
                self.subcategories(category)
                endpoint = self.cache["subcategories"][subcategory.lower()]
        else:
            try:
                endpoint = self.cache["categories"][category.lower()]
            except KeyError:  # not cached yet: reload the categories
                _ = self.categories  # accessed for its cache-filling side effect
                endpoint = self.cache["categories"][category.lower()]

        if region:
            region = region.replace(" ", "_").lower()
            if region in self.regions:
                endpoint = endpoint[:-1] + "-en-%s" % region
            else:
                # FIX: an invalid region was silently ignored here; fail
                # loudly, consistent with search()
                raise ValueError(
                    "Region %s is not a valid region, see self.regions" % region)

        demand_param = self._offer_demand_parser(offer, demand)

        response = []
        for page in tqdm(range(1, pages + 1)):
            _url = endpoint + "/?pagina=%d&" % page
            if demand_param:
                _url += "demanda=%s&" % demand_param
            new_ads = self.__call__(_url, self._ads_parser)
            response += new_ads
            if not new_ads:  # an empty page means we ran out of results
                self.logger.info("%d pages found", page - 1)
                break

        if response:
            return DataFrame(response, columns=list(response[0].keys()))
        return []

    def login(self, email, password, remember=False, attempts=5):
        """Login in milanuncios to perform actions on your account.

        Args:
            email (str): Email of your milanuncios account.
            password (str): Password of your milanuncios account.
            remember (bool, optional): Tick the "remember me" checkbox.
                As default False.
            attempts (int, optional): Maximum number of login attempts
                before giving up. As default 5.

        Returns:
            bool: True on success.

        Raises:
            MilAnunciosLoginError: if all attempts fail.
        """
        max_attempts = attempts  # preserved for the final error message
        self.logger.info("Trying to login in milanuncios.com... Email: %s", email)

        def _login():
            """Fill and submit the login form once."""
            # Input fields
            email_input = self.browser.find_element_by_id("email")
            password_input = self.browser.find_element_by_id("contra")
            remember_input = self.browser.find_element_by_id("rememberme")
            # Type with small random pauses (presumably to look human)
            email_input.send_keys(email)
            time.sleep(random.uniform(1., 1.8))
            password_input.send_keys(password)
            time.sleep(random.uniform(1.5, 1.8))
            if remember_input.is_selected() != remember:
                remember_input.click()
            # Submit button
            submit = self.browser.find_element_by_class_name("submit")
            submit.click()
            return True

        def check_login():
            """Return (logged?, soup) for the current page."""
            soup = self.current_soup
            return (soup.find(class_="cat1") is not None, soup)

        # Go to my ads page
        self.browser.get(self.main_url + "/mis-anuncios/")
        time.sleep(self.delay)

        # Check if we are already logged in
        self.logger.debug("Checking login...")
        logged, soup = check_login()
        self.logger.debug("Logged? -> %r", logged)

        # If we aren't logged in, try to login X times (attempts param)
        while not logged and attempts > 0:
            time.sleep(self.delay)
            try:
                login_passed = _login()
            except NoSuchElementException:  # form is gone: we are logging in
                login_passed = True
            if login_passed:
                logged, soup = check_login()
                self.logger.debug("Logged? -> %r", logged)
            else:
                msg = "Login error, if persists send a mail to mondejar1994@gmail.com"
                self.logger.warning(msg)
            if logged:
                break
            attempts -= 1

        if attempts == 0:  # all attempts failed
            # FIX: the '%d' placeholder was never filled in by the original
            msg = ("Login not possible after %d attempts. "
                   "Please, check your credentials." % max_attempts)
            self.logger.error(msg)
            raise MilAnunciosLoginError(msg)

        self.logger.info("Login successful.")
        self.logged = True
        self._logged_soup = soup
        return True

    def my_ads(self, *args, dataframe=True, _container=False, **kwargs):
        """Get your published adverts.

        Args:
            email (str): Email of your milanuncios account (first
                positional argument, only needed if not logged in).
            password (str): Password of your account (second positional).
            remember (bool, optional): Be remembered in login.
                False as default.
            dataframe (bool, optional): If True, returns a pandas.DataFrame,
                otherwise a list of dictionaries. As default True.
            _container (bool, optional): Internal — attach the raw HTML
                container of every ad (used by renew_ads). As default False.

        Returns: pandas.DataFrame / list
        """
        if dataframe:
            from pandas import DataFrame
        if not self.logged:
            if len(args) < 2:  # FIX: clearer error than a bare IndexError
                raise TypeError("Not logged in: pass email and password as "
                                "the first two positional arguments.")
            self.login(args[0], args[1], **kwargs)
        soup = self._logged_soup

        self.logger.info("Retrieving your ads")

        def get_ad_info(container):
            """Extract one advert's fields from its HTML container."""
            response = {"renovable": False}

            content = container.find(class_="aditem-detail")

            # Title
            title_link = content.find(class_="aditem-detail-title")
            response["title"] = title_link.string

            # Description and time-to-expire share one text node,
            # separated by "Caduca en "
            desc_expire = re.sub(r"<.*?>", "",
                                 repr(content.find(class_="tx")))
            desc, expire_string = desc_expire.split("Caduca en ")
            response["desc"] = desc

            response["href"] = self.main_url + title_link["href"]

            # Ad's expire time
            response["expire"] = parse_string_to_timedelta(expire_string)

            # Time since last renewal
            last_renew_string = container.find(class_="x6").string
            response["last_renew"] = parse_string_to_timedelta(last_renew_string)

            # Has photos?
            response["has_photos"] = content.find(class_="vef") is not None

            # When renewing ads we need the raw container too
            if _container:
                response["container"] = container

            return response

        ads = [get_ad_info(container)
               for container in soup.find_all(class_="aditem")]

        self.logger.debug("%d ads published in your account", len(ads))

        if ads:
            if dataframe:
                return DataFrame(ads, columns=list(ads[0].keys()))
            return ads
        return []

    def renew_ads(self, *args, ads=None, number=None, **kwargs):
        """Renew your adverts.

        Args:
            email (str): Email of your milanuncios account (first
                positional argument, only needed if not logged in).
            password (str): Password of your account (second positional).
            remember (bool, optional): Be remembered in login.
                False as default.
            ads (list, optional): Titles of the ads you want to renew.
                If None, every renewable ad will be renewed.
            number (int, optional): Maximum number of ads to renew;
                ignored when ad titles are given. As default None.
                NOTE(review): currently unused by the implementation —
                confirm before relying on it.

        Returns (int):
            Number of ads that were renewed.
        """
        # Get all the ads of the account
        if not self.logged:
            all_ads = self.my_ads(args[0], args[1], dataframe=False,
                                  _container=True, **kwargs)
        else:
            all_ads = self.my_ads(dataframe=False, _container=True, **kwargs)

        if not all_ads:
            # FIX: message typos, and return the documented int (was None)
            self.logger.warning("0 ads found. Maybe you don't have ads published?")
            return 0

        if ads:
            self.logger.debug("Renewing %d ads: %s", len(ads), ads)
        else:
            self.logger.debug("Renewing all ads (%d)", len(all_ads))

        def renew(container):
            """Click through the renew flow of a single ad."""
            footer = container.find(class_="aditem-footer").find("div")
            # Renew button
            renew_button_href = footer.find(class_="icon-renew").parent["href"]
            renew_button = self.browser.find_element_by_xpath(
                '//a[@href="%s"]' % renew_button_href)
            renew_button.click()
            time.sleep(self.delay)

            # The confirmation lives inside an iframe
            iframe = self.browser.find_element_by_id("ifrw")
            self.browser.switch_to.frame(iframe)
            confirm_renew_button = self.browser.find_element_by_id("lren")
            confirm_renew_button.click()  # confirm the renew
            time.sleep(1)  # back to my ads page
            return True

        def count_new_ad(stats, advert):
            """Account one matched ad in the stats; always returns True."""
            stats["ads_found"]["n"] += 1
            stats["ads_found"]["ads"].append(advert["title"])
            return True

        # milanuncios only allows renewing an ad every 24 hours
        minimum_time_between_renews = datetime.timedelta(hours=24)

        stats = {
            "n_renews": 0,
            "ads_to_renew": {
                "n": len(all_ads) if not ads else len(ads),
                "ads": all_ads if not ads else ads,
            },
            "ads_found": {  # used to detect wrong titles in the ads param
                "n": 0,
                "ads": []
            }
        }

        for advert in all_ads:
            to_renew = False
            if ads:
                if advert["title"] in ads or advert["title"].upper() in ads:
                    # FIX: the original counted the ad twice here (once in
                    # count_new_ad and once again inline)
                    to_renew = count_new_ad(stats, advert)
            else:
                to_renew = count_new_ad(stats, advert)

            if to_renew and advert["last_renew"] > minimum_time_between_renews:
                if renew(advert["container"]):
                    stats["n_renews"] += 1

        self.logger.info("%d adverts renovated", stats["n_renews"])

        if ads:
            # Check that every title in the ads param was found
            not_found = stats["ads_to_renew"]["n"] - stats["ads_found"]["n"]
            if not_found > 0:
                self.logger.warning("%d ads not found:", not_found)
                for ad in ads:
                    if ad not in stats["ads_found"]["ads"]:
                        self.logger.warning(ad)

        # Number of ads not renewed (only for debug)
        if stats["n_renews"] < stats["ads_to_renew"]["n"]:
            self.logger.debug("%d adverts were not renovated",
                              stats["ads_to_renew"]["n"] - stats["n_renews"])

        return stats["n_renews"]