├── scripts
    └── geckodriver
    │   ├── geckodriver-v0.19.1-linux64.tar.gz
    │   └── download.sh
├── .gitignore
├── requirements.txt
├── milanuncios
    ├── __init__.py
    ├── utils.py
    └── core.py
├── .travis.yml
├── COPYING
├── docs
    ├── install
    │   └── raspberrypi.md
    └── usage
    │   ├── english.ipynb
    │   └── spanish.ipynb
├── setup.py
├── README.md
└── test
    └── test.py


/scripts/geckodriver/geckodriver-v0.19.1-linux64.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mondeja/milanuncios/HEAD/scripts/geckodriver/geckodriver-v0.19.1-linux64.tar.gz


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Syntax checker
 2 | .pylintrc
 3 | 
 4 | # Virtualenv
 5 | env/
 6 | 
 7 | # Builds and cache
 8 | __pycache__/
 9 | build/
10 | 
11 | # Log files
12 | geckodriver.log
13 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | beautifulsoup4==4.6.0
 2 | bs4==0.0.1
 3 | cachetools==2.0.1
 4 | EasyProcess==0.2.3
 5 | numpy==1.13.3
 6 | pandas==0.20.0
 7 | python-dateutil==2.6.1
 8 | pytz==2017.3
 9 | PyVirtualDisplay==0.2.1
10 | selenium==3.8.0
11 | six==1.11.0
12 | tqdm==4.19.5
13 | psutil==5.4.2
14 | 


--------------------------------------------------------------------------------
/milanuncios/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Milanuncios source"""
 5 | 
 6 | __title__ = 'milanuncios'
 7 | __version__ = '0.9.13'
 8 | __author__ = 'Alvaro Mondejar Rubio <mondejar1994@gmail.com>'
 9 | __repo__ = 'https://github.com/mondeja/milanuncios'
10 | __license__ = 'BSD License'
11 | 
12 | from .core import MilAnuncios, MilAnunciosLoginError
13 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | 
 3 | language: python
 4 | 
 5 | python:
 6 |   - "3.6"
 7 |   #- "nightly"
 8 | 
 9 | os:
10 |   - linux
11 | 
12 | addons:
13 |   apt:
14 |     packages:
15 |       - "python3-pip" 
16 | 
17 | 
18 | before_install:
19 |   - sudo bash scripts/geckodriver/download.sh travis
20 | 
21 | install:
22 |   - sudo pip3 install -r requirements.txt
23 |   - sudo python3 setup.py install
24 | 
25 | before_script:
26 |   # https://docs.travis-ci.com/user/gui-and-headless-browsers/#Using-xvfb-to-Run-Tests-That-Require-a-GUI
27 |   - "export DISPLAY=:99.0"
28 |   - "sh -e /etc/init.d/xvfb start"
29 |   - sleep 3 # give xvfb some time to start
30 | 
31 | 
32 | script:
33 |   - sudo python3 test/test.py
34 | 
35 | branches:
36 |   only:
37 |     - staging


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017 Álvaro Mondéjar Rubio <mondejar1994@gmail.com>.
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms are permitted
 5 | provided that the above copyright notice and this paragraph are
 6 | duplicated in all such forms and that any documentation, advertising
 7 | materials, and other materials related to such distribution and use
 8 | acknowledge that the software was developed by Álvaro Mondéjar Rubio. The
 9 | name of the Álvaro Mondéjar Rubio may not be used to endorse or promote
10 | products derived from this software without specific prior written
11 | permission.
12 | 
13 | THIS SOFTWARE IS PROVIDED “AS IS” AND WITHOUT ANY EXPRESS OR IMPLIED
14 | WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
15 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.


--------------------------------------------------------------------------------
/docs/install/raspberrypi.md:
--------------------------------------------------------------------------------
 1 | ## Install on RaspberryPi
 2 | ```
 3 | wget https://github.com/mozilla/geckodriver/releases/download/v0.16.0/geckodriver-v0.16.0-arm7hf.tar.gz
 4 | tar -xvzf geckodriver-v0.16.0-arm7hf.tar.gz
 5 | sudo mv geckodriver /usr/local/bin/geckodriver
 6 | sudo apt-get install iceweasel xvfb
 7 | pip3 install bs4 cachetools pyvirtualdisplay selenium==3.3.2 tqdm psutil
 8 | git clone https://github.com/mondeja/milanuncios.git
 9 | cd milanuncios
10 | python3 setup.py install
11 | ```
12 | 
13 | If you want to recopile info, you need to install `pandas` also, but for autorenovate ads it's unnecesary.
14 | 
15 | ### Usage tip
16 | RaspberryPi has little RAM memory, so you need to set a big delay between commands (10 seconds must be enough):
17 | ```
18 | from milanuncios import MilAnuncios
19 | with MilAnuncios(delay=10) as ma:
20 |    ...
21 | ```


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | from distutils.core import setup
 6 | 
 7 | BASEDIR = os.path.dirname(__file__)
 8 | 
 9 | with open(os.path.join(BASEDIR, 'requirements.txt')) as f:
10 |     requirements = f.readlines()
11 | 
12 | with open(os.path.join(BASEDIR, 'README.md')) as f:
13 |     readme = f.read()
14 | 
15 | setup(
16 |     name = 'milanuncios',
17 |     version = '0.9.13',
18 |     url = 'https://github.com/mondeja/milanuncios',
19 |     download_url = 'https://github.com/mondeja/milanuncios/archive/master.zip',
20 |     author = 'Alvaro Mondejar <mondejar1994@gmail.com>',
21 |     author_email = 'mondejar1994@gmail.com',
22 |     license = 'BSD License',
23 |     packages = ['milanuncios'],
24 |     description = 'Python3 web scraper for milanuncios.com.',
25 |     long_description = readme,
26 |     keywords = ['milanuncios', 'anuncios', 'segunda mano', 'scraper', 'dinamic scraping', 'python', 'big data'],
27 |     install_requires = requirements
28 | )
29 | 


--------------------------------------------------------------------------------
/scripts/geckodriver/download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Script to get neccesary version of geckodriver
 4 | # across multiples OS. Need wget installed yet
 5 | 
 6 | travis=1
 7 | if [ -z $1 ]  # travis is not first command?
 8 | then
 9 |   travis=0
10 | fi
11 | 
12 | 
13 | # =======   URLs by OS and architecture   =======
14 | Linux86_64=https://github.com/mozilla/geckodriver/releases/download/v0.19.1/geckodriver-v0.19.1-linux64.tar.gz
15 | # ===============================================
16 | 
17 | # Get OS
18 | case "$OSTYPE" in
19 |   linux*)   OS="linux" ;;
20 |   darwin*)  OS="mac" ;;
21 |   msys*)    OS="windows" ;;
22 |   solaris*) OS="solaris" ;;
23 |   bsd*)     OS="bsd" ;;
24 |   *)        OS="unknown" ;;
25 | esac
26 | 
27 | 
28 | 
29 | # Linux environments
30 | if [ $OS == "linux" ]
31 | then
32 | 
33 |   # Install wget?
34 |   if [ $travis -eq 0 ]  # If we are not in travis, try it
35 |   then
36 |     sudo apt-get install wget
37 |   fi
38 | 
39 |   # 64bit architecture?
40 |   if [ `uname -m` == "x86_64" ]
41 |   then
42 |     wget -N $Linux86_64 P ~/  # Download driver and store at home
43 |     tar -xvf ~/geckodriver-v0.19.1-linux64.tar.gz
44 |     rm ~/geckodriver-v0.19.1-linux64.tar.gz
45 |     ls ~/
46 |   fi
47 | 
48 | fi
49 | 
50 | # If we are in TravisCI, geckodriver needs to be in path
51 | if [ $travis -eq 1 ]
52 | then
53 |   sudo mv -f ~/geckodriver /usr/local/share/geckodriver
54 |   sudo chmod +x /usr/local/share/geckodriver
55 |   sudo ln -s /usr/local/share/geckodriver /usr/local/bin/geckodriver
56 | fi
57 | 
58 | echo "Where is geckodriver?"
59 | whereis geckodriver
60 | 
61 | 


--------------------------------------------------------------------------------
/milanuncios/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Utils module"""
 5 | 
 6 | import logging
 7 | import datetime
 8 | 
 9 | DEFAULT_FORMAT = "%(asctime)s %(levelname)-8s %(name)s:%(lineno)d: %(message)s"
10 | DEFAULT_FORMATTER = logging.Formatter(DEFAULT_FORMAT)
11 | 
12 | def create_logger(name, level=logging.INFO, handler=logging.StreamHandler(),
13 |                   propagate=True):
14 |     """Returns a logger with given name, level and handler."""
15 |     logger = logging.getLogger(name)
16 |     logger.setLevel(level)
17 |     handler.setFormatter(DEFAULT_FORMATTER)
18 |     logger.addHandler(handler)
19 |     logger.propagate = propagate
20 |     return logger
21 | 
22 | def extract_number(string, parse):
23 |     """Returns a number from a string parsing it with a given type"""
24 |     response = ""
25 |     for char in string:
26 |         if char.isdigit():
27 |             response += char
28 |     return parse(response)
29 | 
30 | def parse_string_to_timedelta(string):
31 |     """Convert a string in the form "4 horas" to timedelta object"""
32 |     string_mapping = {"horas": "hours",
33 |                       "hora": "hours",
34 |                       "días": "days",
35 |                       "día": "days",
36 |                       "dia": "days",
37 |                       "dias": "days",
38 |                       "seg": "seconds",
39 |                       "min": "minutes"}
40 |     num = extract_number(string, int)
41 |     for inp, outp in string_mapping.items():
42 |         if inp in string:
43 |             arg = outp
44 |             break
45 |     kwarg = {arg: num}
46 |     return datetime.timedelta(**kwarg)
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # milanuncios
 2 | 
 3 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/mondeja/milanuncios/master?filepath=docs%2Fusage%2Fenglish.ipynb)
 4 | 
 5 | ### Python3 web scraper and automatic ad renovator for [milanuncios.com](https.//www.milanuncios.com).
 6 | ### Scraper y autorenovador de anuncios para [milanuncios.com](https.//www.milanuncios.com) en Python3.
 7 | 
 8 | Milanuncios doesn't allow to scrap content for their website by usual methods, but it's posible to scrap dinamic content through other formulas. In their [terms of service](https://www.milanuncios.com/condiciones/), they don't specifies what are the conditions for this kind of scraping, but **AUTORENOVATION IS STRICTLY FORBBIDEN** so I don't take responsability about how you use this program, this **has been made for purely educational purposes**.
 9 | 
10 | I'dont know if works in Windows Mac or Python2, only tested in Linux and Python3.
11 | 
12 | ## Requirements
13 | - Mozilla Firefox >= 57.0
14 | - [Geckodriver](https://github.com/mozilla/geckodriver/releases)
15 | 
16 | ## Install
17 | From source use:
18 | ```
19 | pip3 install https://github.com/mondeja/milanuncios/archive/master.zip
20 | ```
21 | 
22 | or
23 | 
24 | ```
25 | git clone https://github.com/mondeja/milanuncios.git
26 | cd milanuncios
27 | pip3 install -r requirements.txt
28 | python3 setup.py install
29 | ```
30 | 
31 | #### [Install on RaspberryPi](https://github.com/mondeja/milanuncios/tree/master/docs/install/raspberrypi.md)
32 | 
33 | ## Usage
34 | - [Usage](https://mybinder.org/v2/gh/mondeja/milanuncios/master?filepath=docs%2Fusage%2Fenglish.ipynb) (english version)
35 | - [Uso](https://mybinder.org/v2/gh/mondeja/milanuncios/master?filepath=docs%2Fusage%2Fspanish.ipynb) (versión en español)
36 | 
37 | ## Contribute
38 | 
39 | - Issue Tracker: https://github.com/mondeja/milanuncios/issues
40 | - Source Code: https://github.com/mondeja/milanuncios
41 | 
42 | ## Support
43 | 
44 | If you are having issues, please let me know (mondejar1994@gmail.com).
45 | 
46 | ## License
47 | 
48 | Copyright (c) 2017 Álvaro Mondéjar Rubio.
49 | All rights reserved.
50 | 
51 | Redistribution and use in source and binary forms are permitted
52 | provided that the above copyright notice and this paragraph are
53 | duplicated in all such forms and that any documentation, advertising
54 | materials, and other materials related to such distribution and use
55 | acknowledge that the software was developed by Álvaro Mondéjar Rubio. The
56 | name of the Álvaro Mondéjar Rubio may not be used to endorse or promote
57 | products derived from this software without specific prior written
58 | permission.
59 | 
60 | THIS SOFTWARE IS PROVIDED “AS IS” AND WITHOUT ANY EXPRESS OR IMPLIED
61 | WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
62 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
63 | 
64 | 
65 | ## Buy me a coffee?
66 | 
67 | If you feel like buying me a coffee (or a beer?), donations are welcome:
68 | 
69 | ```
70 | BTC : 1LfUF4AcvH7Wd1wTc7Mmqobj4AypUbpvN5
71 | ETH : 0x7428fE875226880DaD222c726F6340eec42Db567
72 | STEEM: @mondeja
73 | ```
74 | 
75 | 


--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """Test milanuncios web scraper and adverts renewer"""
  5 | 
  6 | # To disable warnings: python3 -W ignore test.py
  7 | 
  8 | # Standard libraries
  9 | import unittest
 10 | 
 11 | # External libraries
 12 | from pandas import DataFrame
 13 | 
 14 | # Internal modules
 15 | from milanuncios import MilAnuncios
 16 | 
 17 | # If you want to test renewer you need to config email and password
 18 | config = {
 19 |     "email": None,
 20 |     "password": None,
 21 |     "ad_titles": None,  # Used to renew ads by title name (list)
 22 |     "ads_number": 1, # Used to renew ads by number of them (int)
 23 | 
 24 |     "debug": False,
 25 |     "delay": 3,
 26 |     "timeout": 15,
 27 |     # Selenium drivers
 28 |     "executable_path": "geckodriver",
 29 |     "log_path": "geckodriver.log",
 30 | 
 31 |     # Test end to end
 32 |     "full": False
 33 | }
 34 | 
 35 | params = ["debug", "delay", "timeout",
 36 |               "executable_path", "log_path"]
 37 | options = {param: config[param] for param in params}
 38 | 
 39 | class TestWebScraper(unittest.TestCase):
 40 |     """MilAnuncios web scraper tests"""
 41 | 
 42 |     def setUp(self):
 43 |         self.ma = MilAnuncios(**options)
 44 |         self.ma.__enter__()
 45 | 
 46 |     def tearDown(self):
 47 |         self.ma.__exit__()
 48 |         if self.ma.debug:
 49 |             self.ma.logger.debug("Firefox processes opened: %r",
 50 |                                  self.ma.firefox_user_processes)
 51 | 
 52 |     # ===   INFO TESTS   ===
 53 |     def test_regions(self):
 54 |         collect_regions = self.ma._get_regions()
 55 |         hardcoded_regions = self.ma.regions
 56 |         self.assertEqual(collect_regions, hardcoded_regions)
 57 | 
 58 |     def test_categories(self):
 59 |         categories = self.ma.categories
 60 |         self.assertIs(type(categories), list)
 61 |         self.assertGreater(len(categories), 10)
 62 | 
 63 |     def test_subcategories(self):
 64 |         subcategories = self.ma.subcategories("motor")
 65 |         self.assertIs(type(subcategories), list)
 66 |         self.assertGreater(len(subcategories), 5)
 67 | 
 68 |     # ===   SEARCH TESTS   ===
 69 |     def test_search(self):
 70 |         # Query basic search
 71 |         response = self.ma.search("sofa")
 72 |         self.assertIn(type(response), (DataFrame, list))
 73 | 
 74 |     def test_search_category(self):
 75 |         # Query basic search by category
 76 |         response = self.ma.search_category("motor")
 77 |         self.assertIs(type(response), DataFrame)
 78 |         self.assertEqual(response.empty, False)
 79 | 
 80 | 
 81 | @unittest.skipIf(not config["full"],
 82 |     'Cache testing only posible if config["full"] == True')
 83 | class TestWebScraperCache(unittest.TestCase):
 84 |     """MilAnuncios cache tests"""
 85 |     def setUp(self):
 86 |         options["init_cache"] = True
 87 |         self.ma = MilAnuncios(**options)
 88 |         del options["init_cache"]
 89 |         self.ma.__enter__()
 90 | 
 91 |     def tearDown(self):
 92 |         self.ma.__exit__()
 93 | 
 94 |     def assert_cached(self, dictionary):
 95 |         self.assertIs(type(dictionary), dict)
 96 |         self.assertGreater(dictionary, 10)
 97 | 
 98 |     def test_categories_cache(self):
 99 |         self.assert_cached(self.ma.cache["categories"])
100 | 
101 |     def test_subcategories_cache(self):
102 |         self.assert_cached(self.ma.cache["subcategories"])
103 | 
104 | 
105 | @unittest.skipIf(not config["email"] or not config["password"],
106 |     "For account methods testing you must provide email and password in config")
107 | class TestAccount(unittest.TestCase):
108 |     """MilAnuncios account tests"""
109 |     def setUp(self):
110 |         self.ma = MilAnuncios(**options)
111 |         self.ma.__enter__()
112 | 
113 |     def tearDown(self):
114 |         self.ma.__exit__()
115 | 
116 |     def test_login(self):
117 |         self.ma.login(config["email"],
118 |                       config["password"])
119 |         self.assertEqual(self.ma.logged, True)
120 | 
121 |     def test_my_ads(self):
122 |         # Test my ads with login
123 |         ads = self.ma.my_ads(config["email"],
124 |                              config["password"])
125 |         self.assertIs(type(ads), DataFrame)
126 | 
127 |     def test_login_my_ads(self):
128 |         # First login, then get my_ads
129 |         if self.ma.login(config["email"],
130 |                          config["password"]):
131 |             ads = self.ma.my_ads()
132 |             self.assertIs(type(ads), DataFrame)
133 | 
134 |     def test_renew_ads(self):
135 |         # If we are renewing by name
136 |         if config["ad_titles"]:
137 |             if self.ma.login(config["email"],
138 |                              config["password"]):
139 |                 renewed = self.ma.renew_ads(ads=config["ad_titles"])
140 |         else:
141 |             if config["ads_number"]:
142 |                 # If we are renewing by number
143 |                 if self.ma.login(config["email"],
144 |                                  config["password"]):
145 |                     renewed = self.ma.renew_ads(number=config["ads_number"])
146 |             else:
147 |                 if self.ma.login(config["email"],
148 |                                  config["password"]):
149 |                     renewed = self.ma.renew_ads()
150 |         self.assertGreater(renewed, 0)
151 | 
152 | if __name__ == "__main__":
153 |     unittest.main()


--------------------------------------------------------------------------------
/docs/usage/english.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# <center>How it works</center>"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Certain web pages try to restrict the possibility of being visited by robots creating content that shows on clients by a dinamic way (trough Javascript code), like [milanuncios.com](https://www.milanuncios.com/).\n",
 15 |     "\n",
 16 |     "### Static scraping in Python\n"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "metadata": {},
 23 |    "outputs": [
 24 |     {
 25 |      "name": "stdout",
 26 |      "output_type": "stream",
 27 |      "text": [
 28 |       "None\n"
 29 |      ]
 30 |     }
 31 |    ],
 32 |    "source": [
 33 |     "import requests\n",
 34 |     "from bs4 import BeautifulSoup\n",
 35 |     "\n",
 36 |     "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n",
 37 |     "\n",
 38 |     "soup = BeautifulSoup(requests.get(url).content, \"html.parser\")\n",
 39 |     "print(soup.find(class_=\"aditem-detail-title\"))"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "Although, is very easy avoid these limitations...\n",
 47 |     "\n",
 48 |     "### Dinamic scraping in Python"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 2,
 54 |    "metadata": {},
 55 |    "outputs": [
 56 |     {
 57 |      "name": "stdout",
 58 |      "output_type": "stream",
 59 |      "text": [
 60 |       "<a class=\"aditem-detail-title\" href=\"/masajistas/masajes-y-tratamientos-fisio-domicilio-246582919.htm\" target=\"_blank\"><b class=\"sub2\">MASAJES</b> Y TRATAMIENTOS FISIO DOMICILIO</a>\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "from pyvirtualdisplay import Display  # pip3 install pyvirtualdisplay\n",
 66 |     "from selenium import webdriver        # pip3 install selenium\n",
 67 |     "from bs4 import BeautifulSoup         # pip3 install bs4\n",
 68 |     "import time\n",
 69 |     "\n",
 70 |     "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n",
 71 |     "\n",
 72 |     "display = Display(visible=0, size=(800, 600))\n",
 73 |     "display.start()\n",
 74 |     "browser = webdriver.Firefox()  # You need geckodriver and Mozilla Firefox\n",
 75 |     "browser.get(url)\n",
 76 |     "time.sleep(.8)\n",
 77 |     "soup = BeautifulSoup(browser.page_source, \"html.parser\")\n",
 78 |     "print(soup.find(class_=\"aditem-detail-title\"))\n",
 79 |     "browser.quit()"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "________________________________________________\n",
 87 |     "\n",
 88 |     "# <center>Documentation</center>\n",
 89 |     "\n",
 90 |     "## Basic usage\n",
 91 |     "The 4 main methods to realize most of queries in are the next:"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "from milanuncios import MilAnuncios\n",
101 |     "\n",
102 |     "# We must enter to milanuncios like context (with statement)\n",
103 |     "with MilAnuncios() as ma:\n",
104 |     "\n",
105 |     "    # Obtain main categories of home page:\n",
106 |     "    print(ma.categories) \n",
107 |     "\n",
108 |     "    # Obtain subcategories (and subsubcategories recursively) from a main category:\n",
109 |     "    print(ma.subcategories(\"servicios\"))\n",
110 |     "\n",
111 |     "    # Realize a query (we indicates number of pages, 1 page as default):\n",
112 |     "    print(ma.search(\"gatos\", pages=3)) # Returns a pandas' DataFrame\n",
113 |     "\n",
114 |     "    # Realize a search by category/sucategory:\n",
115 |     "    print(ma.search_category(\"juegos\", subcategory=\"videoconsolas\").tail())"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "### Ads renewal"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "from milanuncios import MilAnunciosLoginError\n",
132 |     "\n",
133 |     "# delay parameter indicates how many seconds to wait loading pages and before perform actions\n",
134 |     "# Is 1.5 as default, if you have runtime troubles try to increase it\n",
135 |     "with MilAnuncios(delay=3) as ma:\n",
136 |     "    # Login in milanuncios\n",
137 |     "    ma.login(\"tu_email@proveedor.com\", \"tu_contraseña\")  # If login fails MilAnunciosLoginError will be raised\n",
138 |     "    assert ma.logged == True\n",
139 |     "    \n",
140 |     "    # Obtain your ads\n",
141 |     "    ma.my_ads(dataframe=False) # As default returns a pandas' DataFrame, but you can retrieve a list also\n",
142 |     "    \n",
143 |     "    # Renew your ads\n",
144 |     "        # You can renew by title of by number of adverts\n",
145 |     "        # The program will ignore that adverts wich can't be renewed yet\n",
146 |     "    ma.renew_ads(title=[\"Título de mi anuncio\", \"Otro, da igual si es en minúscula o mayúscula\"])  # Por nombre\n",
147 |     "    \n",
148 |     "    ma.renew_ads(number=3)  # First 3 that can be renewed in your adverts list\n",
149 |     "    \n",
150 |     "    # This method returns the number of adverts renewed"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "You also can obtain or renew your ads without execute `login()` method, only pass it your email and password to ` my_ads()` o `renew_ads()` as two first parameters:"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "options = {}\n",
167 |     "#options = dict(debug=True)  # If you want to display on screen renovation process activate debug\n",
168 |     "\n",
169 |     "with MilAnuncios(**options) as ma:\n",
170 |     "    ma.my_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")\n",
171 |     "    \n",
172 |     "    # If you don't pass number or titles list parameters will be renewed all adverts\n",
173 |     "    ma.renew_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")       "
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "\n",
181 |     "_____________________________\n",
182 |     "## Advanced usage\n",
183 |     "### Filters"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "# We can filter by regions in both methods:\n",
193 |     "print(ma.search(\"guitarra flamenca\", region=\"cadiz\").head())\n",
194 |     "# To consult all possible regions (provinces and autonomous communities):\n",
195 |     "print(ma.regions)\n",
196 |     "\n",
197 |     "# If you want to search only by regions (https://www.milanuncios.com/anuncios-en-sevilla/)\n",
198 |     "print(ma.search(\"\", region=\"sevilla\").head())  # Void string in query parameter\n",
199 |     "\n",
200 |     "# We can filter by offer and demand. As default, offer and demand paramters are True:\n",
201 |     "print(ma.search_category(\"inmobiliaria\", subcategory=\"alquiler de casas\", offer=False))"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "### Caching\n",
209 |     "If you are going to make a lot of queries in only one session, it is convenient to caching all subcategories when instantiating the scraper class. It will take some time storing in memory all subcategories, but then the performance increases:"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "from pprint import pprint\n",
219 |     "\n",
220 |     "ma = MilAnuncios(init_cache=True)\n",
221 |     "pprint(ma.cache[\"categories\"])\n",
222 |     "pprint(ma.cache[\"subcategories\"])"
223 |    ]
224 |   }
225 |  ],
226 |  "metadata": {
227 |   "kernelspec": {
228 |    "display_name": "Python 3",
229 |    "language": "python",
230 |    "name": "python3"
231 |   },
232 |   "language_info": {
233 |    "codemirror_mode": {
234 |     "name": "ipython",
235 |     "version": 3
236 |    },
237 |    "file_extension": ".py",
238 |    "mimetype": "text/x-python",
239 |    "name": "python",
240 |    "nbconvert_exporter": "python",
241 |    "pygments_lexer": "ipython3",
242 |    "version": "3.5.3"
243 |   }
244 |  },
245 |  "nbformat": 4,
246 |  "nbformat_minor": 2
247 | }
248 | 


--------------------------------------------------------------------------------
/docs/usage/spanish.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# <center>Cómo funciona</center>"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Ciertas páginas web intentan restringir la posibilidad de ser visitadas por robots creando el contenido que muestran en el cliente de forma dinámica (por medio de código Javascript), como es el caso de [milanuncios.com](https://www.milanuncios.com/).\n",
 15 |     "\n",
 16 |     "### Scraping estático en Python\n"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "metadata": {},
 23 |    "outputs": [
 24 |     {
 25 |      "name": "stdout",
 26 |      "output_type": "stream",
 27 |      "text": [
 28 |       "None\n"
 29 |      ]
 30 |     }
 31 |    ],
 32 |    "source": [
 33 |     "import requests\n",
 34 |     "from bs4 import BeautifulSoup\n",
 35 |     "\n",
 36 |     "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n",
 37 |     "\n",
 38 |     "soup = BeautifulSoup(requests.get(url).content, \"html.parser\")\n",
 39 |     "print(soup.find(class_=\"aditem-detail-title\"))"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "Sin embargo, es muy fácil saltarse estas limitaciones...\n",
 47 |     "\n",
 48 |     "### Scraping dinámico en Python"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 2,
 54 |    "metadata": {},
 55 |    "outputs": [
 56 |     {
 57 |      "name": "stdout",
 58 |      "output_type": "stream",
 59 |      "text": [
 60 |       "<a class=\"aditem-detail-title\" href=\"/masajistas/masajes-y-tratamientos-fisio-domicilio-246582919.htm\" target=\"_blank\"><b class=\"sub2\">MASAJES</b> Y TRATAMIENTOS FISIO DOMICILIO</a>\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "from pyvirtualdisplay import Display  # pip3 install pyvirtualdisplay\n",
 66 |     "from selenium import webdriver        # pip3 install selenium\n",
 67 |     "from bs4 import BeautifulSoup         # pip3 install bs4\n",
 68 |     "import time\n",
 69 |     "\n",
 70 |     "url = \"https://www.milanuncios.com/anuncios/masajes-relajantes-en-jerez.htm\"\n",
 71 |     "\n",
 72 |     "display = Display(visible=0, size=(800, 600))\n",
 73 |     "display.start()\n",
 74 |     "browser = webdriver.Firefox()  # Necesitas geckodriver y Mozilla Firefox\n",
 75 |     "browser.get(url)\n",
 76 |     "time.sleep(.8)\n",
 77 |     "soup = BeautifulSoup(browser.page_source, \"html.parser\")\n",
 78 |     "print(soup.find(class_=\"aditem-detail-title\"))\n",
 79 |     "browser.quit()"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "________________________________________________\n",
 87 |     "\n",
 88 |     "# <center>Documentación</center>\n",
 89 |     "\n",
 90 |     "## Uso básico\n",
 91 |     "Los 4 métodos principales para realizar la gran mayoría de consultas en la página de [milanuncios.com](https://www.milanuncios.com/) son los siguientes:"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "from milanuncios import MilAnuncios\n",
101 |     "\n",
102 |     "# Debemos entrar a MilAnuncios como contexto (con la sentencia with)\n",
103 |     "with MilAnuncios() as ma:\n",
104 |     "\n",
105 |     "    # Obtener las familias de categorías de la página principal:\n",
106 |     "    print(ma.categories) \n",
107 |     "\n",
108 |     "    # Obtener las subcategorías de una categoría:\n",
109 |     "    print(ma.subcategories(\"servicios\"))\n",
110 |     "\n",
111 |     "    # Realizar una consulta (indicamos el número de páginas, por defecto 1 página):\n",
112 |     "    print(ma.search(\"gatos\", pages=3)) # Devuelve un DataFrame de pandas\n",
113 |     "\n",
114 |     "    # Realizar una búsqueda por categoría/subcategoría:\n",
115 |     "    print(ma.search_category(\"juegos\", subcategory=\"videoconsolas\").tail())"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "### Renovación de anuncios"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "from milanuncios import MilAnunciosLoginError\n",
132 |     "\n",
133 |     "# El parámetro delay indica los segundos que esperamos en cargar las páginas y antes de ejecutar las acciones.\n",
134 |     "# Por defecto es 1.5, si tienes problemas de ejecución prueba a aumentarlo\n",
135 |     "with MilAnuncios(delay=3) as ma:\n",
136 |     "    # Loguearte en milanuncios\n",
137 |     "    ma.login(\"tu_email@proveedor.com\", \"tu_contraseña\")  # Si falla el login se levantará MilAnunciosLoginError\n",
138 |     "    assert ma.logged == True\n",
139 |     "    \n",
140 |     "    # Obtener tus anuncios\n",
141 |     "    ma.my_ads(dataframe=False) # Por defecto devuelve un DataFrame de pandas, pero así devuelve una lista\n",
142 |     "    \n",
143 |     "    # Renovar tus anuncios\n",
144 |     "        # Puedes hacerlo por títulos o por número de anuncios\n",
145 |     "        # El programa ignorará los anuncios que no se pueden renovar aún\n",
146 |     "    ma.renew_ads(title=[\"Título de mi anuncio\", \"Otro, da igual si es en minúscula o mayúscula\"])  # Por nombre\n",
147 |     "    \n",
148 |     "    ma.renew_ads(number=3)  # Los primeros 3 que se puedan renovar de tu lista de anuncios\n",
149 |     "    \n",
150 |     "    # Este método devuelve el número de anuncios renovados"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "También puedes obtener tus anuncios o renovarlos sin pasar por el método `login()`, sólo tienes que proveer tu email y contraseña a los métodos `my_ads()` o `renew_ads()` como primeros dos parámetros:"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "options = {}\n",
167 |     "#options = dict(debug=True)  # Si quieres mostrar en pantalla el proceso de renovación activa el debug\n",
168 |     "\n",
169 |     "with MilAnuncios(**options) as ma:\n",
170 |     "    ma.my_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")\n",
171 |     "    \n",
172 |     "    # Si no pasas número ni lista de títulos se renovarán todos los anuncios:\n",
173 |     "    ma.renew_ads(\"tu_email@proveedor.com\", \"tu_contraseña\")                 "
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "___________________________________\n",
181 |     "\n",
182 |     "## Uso avanzado\n",
183 |     "### Filtros"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "ma = MilAnuncios()\n",
193 |     "ma.__enter__()  # Así también podemos entrar como contexto\n",
194 |     "\n",
195 |     "# Podemos filtrar por regiones en ambos métodos:\n",
196 |     "print(ma.search(\"guitarra flamenca\", region=\"cadiz\").head())\n",
197 |     "# Para consultar todas las regiones posibles (provincias y comunidades autónomas):\n",
198 |     "print(ma.regions)\n",
199 |     "\n",
200 |     "# Si quieres buscar sólo por regiones (https://www.milanuncios.com/anuncios-en-sevilla/)\n",
201 |     "print(ma.search(\"\", region=\"sevilla\").head())  # Cadena vacía en el parámetro query\n",
202 |     "\n",
203 |     "# Podemos filtrar por oferta y demanda. Por defecto, los parámetros offer y demand equivalen a True:\n",
204 |     "print(ma.search_category(\"inmobiliaria\", subcategory=\"alquiler de casas\", offer=False))\n",
205 |     "\n",
206 |     "ma.__exit__()   # No te olvides de salir o acumularás procesos de Firefox en memoria"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "### Cache\n",
214 |     "Si vas a realizar muchas consultas en una sola sesión, es conveniente cachear todas las subcategorías al instanciar el scraper. Tardará un tiempo en guardar en memoria todas las subcategorías pero luego el rendimiento aumentará considerablemente:"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "from pprint import pprint\n",
224 |     "\n",
225 |     "with MilAnuncios(init_cache=True) as ma:\n",
226 |     "    pprint(ma.cache[\"categories\"])\n",
227 |     "    pprint(ma.cache[\"subcategories\"])"
228 |    ]
229 |   }
230 |  ],
231 |  "metadata": {
232 |   "kernelspec": {
233 |    "display_name": "Python 3",
234 |    "language": "python",
235 |    "name": "python3"
236 |   },
237 |   "language_info": {
238 |    "codemirror_mode": {
239 |     "name": "ipython",
240 |     "version": 3
241 |    },
242 |    "file_extension": ".py",
243 |    "mimetype": "text/x-python",
244 |    "name": "python",
245 |    "nbconvert_exporter": "python",
246 |    "pygments_lexer": "ipython3",
247 |    "version": "3.5.3"
248 |   }
249 |  },
250 |  "nbformat": 4,
251 |  "nbformat_minor": 2
252 | }
253 | 


--------------------------------------------------------------------------------
/milanuncios/core.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """Core module"""
  5 | 
  6 | # Standard libraries
  7 | import os
  8 | import signal
  9 | import psutil
 10 | import time
 11 | import re
 12 | import random
 13 | import logging
 14 | import datetime
 15 | import platform
 16 | from uuid import uuid4
 17 | from subprocess import Popen, PIPE
 18 | 
 19 | # External libraries
 20 | from pyvirtualdisplay import Display
 21 | from cachetools import Cache
 22 | from bs4 import BeautifulSoup
 23 | from tqdm import tqdm
 24 | from selenium import webdriver
 25 | from selenium.common.exceptions import NoSuchElementException
 26 | 
 27 | # Internal modules
 28 | from milanuncios.utils import (create_logger,
 29 |                                parse_string_to_timedelta)
 30 | 
 31 | class MilAnunciosLoginError(RuntimeError):
 32 |     """Exception raised if login fails"""
 33 |     pass
 34 | 
 35 | class MilAnuncios:
 36 |     """Main Scraper class, used as context
 37 | 
 38 |     Args:
 39 |         delay (float, optional): Time to wait until the page is loaded
 40 |             before scrap it (in seconds). As default, 1.5
 41 |         timeout (float, optional): Timeout for requests. As default 15
 42 |         executable_path (str, optional): Geckodriver executable path.
 43 |             As default, "geckodriver" (needs to be in sys.path)
 44 |         log_path (str, optional): Geckodriver log path. As default,
 45 |             "geckodriver.log"
 46 |         firefox_binary (str, optional): Firefox binary path (used if you
 47 |             are running on RaspberryPi). As default "/usr/bin/firefox"
 48 |         display (bool, optional): Display web browser navigation
 49 |             on real time, useful for debug (doesn't work on RaspberryPi).
 50 |             As default False
 51 |     """
 52 |     def __init__(self, delay=1.5, timeout=15, init_cache=False,
 53 |                  executable_path="geckodriver", log_path="geckodriver.log",
 54 |                  cache=Cache(24), logger=create_logger("milanuncios"),
 55 |                  debug=False, firefox_binary="/usr/bin/firefox",
 56 |                  display=False):
 57 |         self.main_url = "https://www.milanuncios.com"
 58 | 
 59 |         self.timeout = timeout
 60 |         self.delay = delay
 61 |         self.debug = debug
 62 |         self.init_cache = init_cache
 63 | 
 64 |         self.logger = logger
 65 |         if self.debug:
 66 |             self.logger.setLevel(logging.DEBUG)
 67 |         self.cache = cache
 68 | 
 69 |         self._executable_path = executable_path
 70 |         self._log_path = log_path
 71 |         self._firefox_binary = firefox_binary
 72 | 
 73 |         # Attributes defined on __enter__
 74 |         self.session = None
 75 |         self.firefox_user_processes = None
 76 |         self.browser = None
 77 |         self.browser_pid = None
 78 | 
 79 |         self.display = display
 80 | 
 81 |         # Account methods
 82 |         self.logged = False
 83 |         self._logged_soup = None
 84 | 
 85 |     def __enter__(self):
 86 |         self._start_session()
 87 |         if self.init_cache:
 88 |             self._initialize_cache()
 89 |         return self
 90 | 
 91 |     def __exit__(self, *excs):
 92 |         if excs:   # We are calling like...? ->  MilAnuncios().__exit__()
 93 |             if None not in excs:
 94 |                 self.logger.error(excs[2], exc_info=True)
 95 |         if not self.debug:
 96 |             self._end_session()
 97 |         return False
 98 | 
 99 |     def _initialize_cache(self):
100 |         """Internal function to initialize cache"""
101 |         self.logger.info("Caching categories tree, please wait...")
102 |         for category in tqdm(self.categories):
103 |             self.subcategories(category)
104 | 
105 |     @staticmethod
106 |     def _get_firefox_processes():
107 |         """Internal function to get already opened user firefox processes"""
108 |         response = []
109 |         for proc in psutil.process_iter():
110 |             if "firefox" in proc.name():
111 |                 response.append(int(proc._pid))
112 |         return response
113 | 
114 |     def _start_in_raspberry(self):
115 |         """Internal function to start session if we are running
116 |         on RaspberryPi. You need to install iceweasel and download
117 |         geckodriver version 0.16.0"""
118 |         msg = "Initializing driver for RaspberryPi. Firefox binary path: %s"
119 |         self.logger.debug(msg, self._firefox_binary)
120 |         caps = webdriver.DesiredCapabilities().FIREFOX
121 |         caps["marionette"] = False
122 |         binary = webdriver.firefox.firefox_binary.FirefoxBinary(self._firefox_binary)
123 |         return webdriver.Firefox(firefox_binary=binary)
124 | 
125 |     def _start_session(self):
126 |         """Internal function to start a virtual session"""
127 |         self.session = uuid4()
128 |         self.logger.debug("Starting session %s...", self.session)
129 | 
130 |         # Obtain user processes
131 |         self.firefox_user_processes = self._get_firefox_processes()
132 | 
133 |         # pyvirtualdisplay magic happens here
134 |         visible = 1 if self.display == True and platform.node() != "raspberrypi" else 0
135 |         display = Display(visible=visible, size=(1024, 768))
136 |         display.start()
137 | 
138 |         # selenium browser
139 |         if platform.node() == "raspberrypi":
140 |             self.browser = self._start_in_raspberry()
141 |         else:
142 |             self.browser = webdriver.Firefox(executable_path=self._executable_path,
143 |                                              log_path=self._log_path)
144 |         self.browser.set_script_timeout(self.timeout)
145 |         self.browser.set_page_load_timeout(self.timeout)
146 | 
147 |         # Save new process
148 |         for pid in self._get_firefox_processes():
149 |             if pid not in self.firefox_user_processes:
150 |                 self.browser_pid = int(pid)
151 | 
152 |     def _end_session(self):
153 |         """End scraper session"""
154 |         self.logged = False
155 |         os.kill(self.browser_pid, signal.SIGKILL)
156 | 
157 |     def kill_firefox(self):
158 |         """Function to kill all firefox processes. Util for development
159 |         or if you experiments errors in requests."""
160 |         for pid in self._get_firefox_processes():
161 |             os.kill(int(pid), signal.SIGKILL)
162 |         self._start_session()  # We need to restart session
163 | 
164 |     def _get_regions(self):
165 |         """Search in milanuncios.com all the regions
166 |         (use regions property for a faster response)"""
167 |         def parser(soup):
168 |             """Regions parser"""
169 |             response = []
170 |             for prov in soup.find(id="protmp").find_all("option"):
171 |                 prov = prov["value"]
172 |                 if prov != "":
173 |                     response.append(prov)
174 |             return response
175 |         url = "https://www.milanuncios.com/ofertas-de-empleo/"
176 |         response = self.__call__(url, parser)
177 |         return response
178 | 
179 |     @property
180 |     def regions(self):
181 |         """Returns all posible regions hardcoded for filter responses"""
182 |         return [
183 |             'alava', 'albacete', 'alicante', 'almeria', 'andalucia', 'aragon',
184 |             'asturias', 'avila', 'badajoz', 'baleares', 'barcelona', 'burgos',
185 |             'caceres', 'cadiz', 'cantabria', 'canarias', 'castellon',
186 |             'castilla_la_mancha', 'castilla_y_leon', 'catalunya', 'ceuta',
187 |             'ciudad_real', 'cordoba', 'cuenca', 'extremadura', 'galicia',
188 |             'girona', 'granada', 'guadalajara', 'guipuzcoa', 'huelva',
189 |             'huesca', 'jaen', 'la_coruna', 'la_rioja', 'las_palmas', 'leon',
190 |             'lleida', 'lugo', 'madrid', 'malaga', 'melilla', 'murcia', 'navarra',
191 |             'ourense', 'pais_vasco', 'palencia', 'pontevedra', 'salamanca',
192 |             'segovia', 'sevilla', 'soria', 'tarragona', 'tenerife', 'teruel',
193 |             'toledo', 'valencia', 'comunidad_valenciana', 'valladolid', 'vizcaya',
194 |             'zamora', 'zaragoza'
195 |         ]
196 | 
197 |     @staticmethod
198 |     def _offer_demand_parser(offer, demand):
199 |         """Internal function filter for offer/demand parameters"""
200 |         demand_param = None
201 |         if not (offer and demand) and not (not offer and not demand):
202 |             if offer:
203 |                 demand_param = "n"
204 |             else:
205 |                 demand_param = "s"
206 |         return demand_param
207 | 
208 |     @property
209 |     def current_soup(self):
210 |         """Function to get current page source code displaying on browser"""
211 |         return BeautifulSoup(self.browser.page_source, "html.parser")
212 | 
213 |     def __call__(self, url, callback):
214 |         """Main internal function to call all the requests of the scraper
215 | 
216 |         Args:
217 |             url (str): Endpoint to use in the method
218 |             callback (function): Callback that returns a string
219 |                 with the whole page html.
220 | 
221 |         Returns (function):
222 |              callback(soup)
223 |         """
224 |         self.browser.get(url)
225 |         time.sleep(self.delay)
226 |         response = callback(self.current_soup)
227 |         return response
228 | 
229 |     @property
230 |     def categories(self):
231 |         """Obtains all main categories from home page
232 | 
233 |         Returns: list"""
234 |         self.logger.debug("Obtaining main categories...")
235 |         def parser(soup):
236 |             """Categories parser"""
237 |             response = {}
238 |             categorias = soup.find_all(class_="catIcono")
239 |             for categoria in categorias:
240 |                 categoria = categoria.find("a")
241 |                 response[categoria["title"].lower()] = self.main_url + categoria["href"]
242 |             try:
243 |                 self.cache["categories"]
244 |             except KeyError:
245 |                 self.cache["categories"] = response
246 |             finally:
247 |                 return list(response.keys())
248 |         try:
249 |             response = self.cache["categories"]
250 |         except KeyError:
251 |             response = self.__call__(self.main_url, parser)
252 |             return response
253 | 
254 |     def subcategories(self, category):
255 |         """Obtain all subcategories (and sub-subcategories recursively)
256 |         from a given main category
257 | 
258 |         Args:
259 |             category (str): Category for obtain all nested subcategories
260 | 
261 |         Returns: list
262 |         """
263 |         self.logger.debug("Obtaining subcategories for %s category", category)
264 |         def parser(soup):
265 |             """Subcategories parser"""
266 |             response = {}
267 |             classes = ["smoMainCat", "smoL2Cat", "smoL3Cat", "smoL4Cat", "smoL5Cat"]
268 |             for cls in classes:
269 |                 subcategories = soup.find_all(class_=cls)
270 |                 for subcategory in subcategories:
271 |                     name = subcategory.string.lower()
272 |                     if name[-1] == " ":
273 |                         name = name[:-1]
274 |                     href = subcategory.find("a")["href"]
275 |                     response[name] = href
276 |                     try:
277 |                         self.cache["subcategories"][name] = self.main_url + href
278 |                     except KeyError:
279 |                         self.cache["subcategories"] = {}
280 |                         self.cache["subcategories"][name] = self.main_url + href
281 |             return list(response.keys())
282 | 
283 |         try:
284 |             self.cache["categories"]
285 |         except KeyError:
286 |             self.categories
287 | 
288 |         try:
289 |             url = self.cache["categories"][category]
290 |         except KeyError:
291 |             raise ValueError("Category %s not found in milanuncios.com" % category)
292 |         else:
293 |             return self.__call__(url, parser)
294 | 
295 |     def _ads_parser(self, soup):
296 |         """Internal parser function for get all ads in every page"""
297 |         response = []
298 |         for anuncio in soup.find_all(class_="aditem-detail"):
299 |             _title = anuncio.find(class_="aditem-detail-title")
300 |             title = _title.string
301 |             href = self.main_url + _title["href"]
302 |             desc = re.sub(r"<.*?>", "", repr(anuncio.find(class_="tx")))
303 |             try:
304 |                 price = anuncio.find(class_="aditem-price").next_element
305 |             except AttributeError:
306 |                 price = None
307 |             response.append({"title": title, "desc": desc,
308 |                              "price": price, "href": href})
309 |         return response
310 | 
311 |     def search(self, query, pages=1, region=None, offer=True, demand=True):
312 |         """Search by query
313 | 
314 |         Args:
315 |             query (str): String to search in milanuncios
316 |             pages: (int): Number of pages retieved in the search
317 | 
318 |         Returns:
319 |             pandas.DataFrame
320 |         """
321 |         from pandas import DataFrame
322 |         self.logger.info("Searching all adverts that contain %s", query)
323 | 
324 |         query = query.replace(" ", "-")
325 |         response = []
326 |         endpoint = "/anuncios/"
327 | 
328 |         # Region filter
329 |         if region:
330 |             region = region.replace(" ", "_").lower()
331 |             if region in self.regions:
332 |                 endpoint += "-en-%s/" % region
333 |             else:
334 |                 raise ValueError("Region %s is not a valid region, see self.regions" % region)
335 | 
336 |         demand_param = self._offer_demand_parser(offer, demand)
337 | 
338 |         for page in tqdm(range(1, pages + 1)):
339 |             url = self.main_url + "%s" % endpoint
340 |             if query != "":
341 |                 url += "%s.htm" % query
342 |             url += "?pagina=%d&" % page
343 |             if demand_param:
344 |                 url += "demanda=%s&" % demand_param
345 |             new_ads = self.__call__(url, self._ads_parser)
346 |             response += new_ads
347 |             if not new_ads:
348 |                 self.logger.info("%d pages found", (page - 1))
349 |                 break
350 | 
351 |         if response:
352 |             return DataFrame(response, columns=response[0].keys())
353 |         return []
354 | 
355 |     def search_category(self, category, subcategory=None, pages=1,
356 |                         region=None, offer=True, demand=True):
357 |         """Search by category (and optional subcategory)
358 | 
359 |         Args:
360 |             category (str): Category to search.
361 |             subcategory (str, optional): You can select an optional
362 |                 subcategory for a more precise search. As default None.
363 |             pages (int, optional): Maximun number of pages to retrieve.
364 |                 As default 1.
365 | 
366 |         Returns:
367 |             pandas.DataFrame
368 |         """
369 |         from pandas import DataFrame
370 |         self.logger.info("Searching by category: %s", category)
371 | 
372 |         if subcategory:
373 |             try:
374 |                 endpoint = self.cache["subcategories"][subcategory.lower()]
375 |             except KeyError:  # If fails, get subcategories from parent
376 |                 self.subcategories(category)
377 |                 endpoint = self.cache["subcategories"][subcategory.lower()]
378 |         else:
379 |             try:
380 |                 endpoint = self.cache["categories"][category.lower()]
381 |             except KeyError:  # If fails, reload categories
382 |                 self.categories
383 |                 endpoint = self.cache["categories"][category.lower()]
384 | 
385 |         if region:
386 |             region = region.replace(" ", "_").lower()
387 |             if region in self.regions:
388 |                 endpoint = endpoint[:-1] + "-en-%s" % region
389 | 
390 |         demand_param = self._offer_demand_parser(offer, demand)
391 | 
392 |         response = []
393 |         for page in tqdm(range(1, pages + 1)):
394 |             _url = endpoint + "/?pagina=%d&" % page
395 |             if demand_param:
396 |                 _url += "demanda=%s&" % demand_param
397 |             new_ads = self.__call__(_url, self._ads_parser)
398 |             response += new_ads
399 |             if not new_ads:
400 |                 self.logger.info("%d pages found", (page - 1))
401 |                 break
402 | 
403 |         if response:
404 |             return DataFrame(response, columns=response[0].keys())
405 |         return []
406 | 
407 |     def login(self, email, password, remember=False, attempts=5):
408 |         """Login in milanuncios to perform actions on your account
409 | 
410 | 
411 |         """
412 |         self.logger.info("Trying to login in milanuncios.com... Email: %s", email)
413 | 
414 |         def _login():
415 |             # Input fields
416 |             email_input = self.browser.find_element_by_id("email")
417 |             password_input = self.browser.find_element_by_id("contra")
418 |             remember_input = self.browser.find_element_by_id("rememberme")
419 |             # Perform actions
420 |             email_input.send_keys(email)
421 |             time.sleep(random.uniform(1., 1.8))
422 |             password_input.send_keys(password)
423 |             time.sleep(random.uniform(1.5, 1.8))
424 |             selected = remember_input.is_selected()
425 |             if selected != remember:
426 |                 remember_input.click()
427 |             # Submit button
428 |             submit = self.browser.find_element_by_class_name("submit")
429 |             submit.click()
430 |             return True
431 | 
432 |         def check_login():
433 |             """Check if login passed"""
434 |             soup = self.current_soup
435 |             return (soup.find(class_="cat1") != None, soup)
436 | 
437 |         # Go to my ads page
438 |         self.browser.get(self.main_url + "/mis-anuncios/")
439 |         time.sleep(self.delay)
440 | 
441 |         # Check if we are logged
442 |         self.logger.debug("Checking login...")
443 |         logged, soup = check_login()
444 |         self.logger.debug("Logged? -> %r", logged)
445 | 
446 |         # If we aren't logged, try to login X times (attempts param)
447 |         login_passed = False
448 |         while not logged and attempts > 0:
449 |             time.sleep(self.delay)
450 |             try:
451 |                 login_passed = _login()
452 |             except NoSuchElementException:  # Hey! We are logging in
453 |                 login_passed = True
454 |             if login_passed:
455 |                 logged, soup = check_login()
456 |                 self.logger.debug("Logged? -> %r", logged)
457 |             else:  # This is not secure yet?
458 |                 msg = "Login error, if persists send a mail to mondejar1994@gmail.com"
459 |                 self.logger.warning(msg)
460 |             if logged:
461 |                 break
462 |             attempts -= 1
463 | 
464 |         if attempts == 0:  # If all attempts fails
465 |             msg = "Login not posible after %d attemps. Please, check your credentials."
466 |             self.logger.error(msg)
467 |             raise MilAnunciosLoginError(msg)
468 | 
469 |         self.logger.info("Login successfully.")
470 |         self.logged = True
471 |         self._logged_soup = soup
472 |         return True
473 | 
474 |     def my_ads(self, *args, dataframe=True, _container=False, **kwargs):
475 |         """Get your adverts
476 | 
477 |         Args:
478 |             email (str): Email of your milanuncios account
479 |             password (str): Password of your milanuncios account
480 |             remember (bool, optional): Do you want to be remembered
481 |                 in login? False as default
482 |             dataframe (bool, optional): If True, returns a pandas.DataFrame,
483 |                 otherwise returns a list of dictionaries. As default True
484 | 
485 |         Returns: pandas.DataFrame / list
486 |         """
487 |         if dataframe:
488 |             from pandas import DataFrame
489 |         if not self.logged:
490 |             self.login(args[0], args[1], **kwargs)
491 |         soup = self._logged_soup
492 | 
493 |         self.logger.info("Retrieving your ads")
494 | 
495 |         def get_ad_info(container):
496 |             """Get advert info"""
497 |             response = {"renovable": False}
498 | 
499 |             content = container.find(class_="aditem-detail")
500 | 
501 |             # Get title
502 |             title_link = content.find(class_="aditem-detail-title")
503 |             response["title"] = title_link.string
504 | 
505 |             # Get description and time to expire
506 |             desc_expire = re.sub(r"<.*?>", "",
507 |                                  repr(content.find(class_="tx")))
508 |             desc, expire_string = desc_expire.split("Caduca en ")
509 |             response["desc"] = desc
510 | 
511 |             response["href"] = self.main_url + title_link["href"]
512 | 
513 |             # Get ad's expire time
514 |             expire = parse_string_to_timedelta(expire_string)
515 |             response["expire"] = expire
516 | 
517 |             # Last renew
518 |             last_renew_string = container.find(class_="x6").string
519 |             last_renew = parse_string_to_timedelta(last_renew_string)
520 |             response["last_renew"] = last_renew
521 | 
522 |             # Has photos?
523 |             view_photos_div = content.find(class_="vef")
524 |             if view_photos_div:
525 |                 response["has_photos"] = True
526 |             else:
527 |                 response["has_photos"] = False
528 | 
529 |             # If we are renewing ads we need the container
530 |             if _container:
531 |                 response["container"] = container
532 | 
533 |             return response
534 | 
535 |         ads = []
536 |         for container in soup.find_all(class_="aditem"):
537 |             # Get ad info
538 |             ads.append(get_ad_info(container))
539 | 
540 |         self.logger.debug("%d ads published in your account", len(ads))
541 | 
542 |         if ads:
543 |             if dataframe:
544 |                 return DataFrame(ads, columns=ads[0].keys())
545 |             return ads
546 |         return []
547 | 
548 | 
549 |     def renew_ads(self, *args, ads=None, number=None, **kwargs):
550 |         """Renew ads
551 | 
552 |         Args:
553 |             email (str): Email of your milanuncios account
554 |             password (str): Password of your milanuncios account
555 |             remember (bool, optional): Do you want to be remembered
556 |                 in login? False as default
557 |             ads (list, optional): List with all ads title that you want to renew.
558 |                 If None, automatically will be renewed all of these
559 |                 wich can be renovated.
560 |             number (int, optional): Number of ads maximun to renovate.
561 |                 If you specifies ad titles in ads param, this param
562 |                 will be ignored. As default None.
563 | 
564 |         Returns (int):
565 |             Number of ads that were renewed
566 |         """
567 |         # Get all ads of my account
568 |         if not self.logged:
569 |             all_ads = self.my_ads(args[0], args[1], dataframe=False,
570 |                                   _container=True, **kwargs)
571 |         else:
572 |             all_ads = self.my_ads(dataframe=False, _container=True, **kwargs)
573 | 
574 |         if not all_ads:
575 |             self.logger.warning("0 ads found. Maybe yo don't have ads pusblished?")
576 |             return
577 | 
578 |         if ads:
579 |             self.logger.debug("Renewing %d ads: %s" % (len(ads), str(ads)))
580 |         else:
581 |             self.logger.debug("Renewing all ads (%d)", len(all_ads))
582 | 
583 |         def renew(container):
584 |             """Internal function to renew an ad"""
585 |             footer = container.find(class_="aditem-footer").find("div")
586 |             # Get renew button
587 |             renew_button_href = footer.find(class_="icon-renew").parent["href"]
588 |             renew_button = self.browser.find_element_by_xpath(
589 |                 '//a[@href="%s"]' % renew_button_href)
590 |             renew_button.click()  # Click renew button
591 |             time.sleep(self.delay)
592 | 
593 |             # Change to internal renew iframe
594 |             iframe = self.browser.find_element_by_id("ifrw")
595 |             self.browser.switch_to.frame(iframe)
596 |             # Get confirm renew button
597 |             confirm_renew_button = self.browser.find_element_by_id("lren")
598 |             confirm_renew_button.click()  # Click renew
599 |             time.sleep(1)  # Go to my ads page again
600 |             return True
601 | 
602 |         def count_new_ad(stats):
603 |             stats["ads_found"]["n"] += 1
604 |             stats["ads_found"]["ads"].append(advert["title"])
605 |             return True
606 | 
607 |         minimun_time_between_renews = datetime.timedelta(hours=24)
608 | 
609 |         stats = {
610 |             "n_renews": 0,
611 |             "ads_to_renew": {
612 |                 "n": len(all_ads) if not ads else len(ads),
613 |                 "ads": all_ads if not ads else ads,
614 |             },
615 |             "ads_found": {  # Check if there are title errors in ads param
616 |                 "n": 0,
617 |                 "ads": []
618 |             }
619 |         }
620 | 
621 |         for advert in all_ads:
622 |             to_renew, renovated = (False, False)
623 |             if ads:
624 |                 if advert["title"] in ads or advert["title"].upper() in ads:
625 |                     to_renew = count_new_ad(stats)
626 |                     stats["ads_found"]["n"] += 1
627 |                     stats["ads_found"]["ads"].append(advert["title"])
628 |             else:
629 |                 to_renew = count_new_ad(stats)
630 | 
631 |             if to_renew and advert["last_renew"] > minimun_time_between_renews:
632 |                 renovated = renew(advert["container"])
633 |             if renovated:
634 |                 stats["n_renews"] += 1
635 | 
636 |         self.logger.info("%d adverts renovated",  stats["n_renews"])
637 | 
638 |         if ads:
639 |             # Check if all titles on ads list param were found
640 |             if stats["ads_found"]["n"] < stats["ads_to_renew"]["n"]:
641 |                 self.logger.warning("%d ads not found:",
642 |                     stats["ads_to_renew"]["n"] - stats["ads_found"]["n"])
643 |                 for ad in ads:
644 |                     if ad not in stats["ads_found"]["ads"]:
645 |                         self.logger.warning(ad)
646 | 
647 |         # Check number of ads not renewed (only for debug)
648 |         if stats["n_renews"] < stats["ads_to_renew"]["n"]:
649 |             self.logger.debug("%d adverts were not renovated",
650 |                                stats["ads_to_renew"]["n"] - stats["n_renews"])
651 | 
652 |         return stats["n_renews"]
653 | 


--------------------------------------------------------------------------------