├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── LICENSE
├── README.md
├── autoscraper
│   ├── __init__.py
│   ├── auto_scraper.py
│   └── utils.py
└── setup.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [created]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .idea/
6 | .vscode/
7 | 
8 | # C extensions
9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | 
51 | # Translations
52 | *.mo
53 | *.pot
54 | 
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # dotenv
85 | .env
86 | 
87 | # virtualenv
88 | .venv
89 | venv/
90 | ENV/
91 | 
92 | # Spyder project settings
93 | .spyderproject
94 | .spyproject
95 | 
96 | # Rope project settings
97 | .ropeproject
98 | 
99 | # mkdocs documentation
100 | /site
101 | 
102 | # mypy
103 | .mypy_cache/
104 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Alireza Mika
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoScraper: A Smart, Automatic, Fast and Lightweight Web Scraper for Python
2 | 
3 | ![img](https://user-images.githubusercontent.com/17881612/91968083-5ee92080-ed29-11ea-82ec-d99ec85367a5.png)
4 | 
5 | This project makes automatic web scraping easy.
6 | It takes a URL or the HTML content of a web page, along with a list of sample data that we want to scrape from that page. **This data can be text, a URL, or the value of any HTML tag on that page.** It learns the scraping rules and returns similar elements. You can then use the learned object with new URLs to get similar content or the exact same elements from those new pages.
7 | 
8 | 
9 | ## Installation
10 | 
11 | It's compatible with Python 3.
12 | 
13 | - Install the latest version from the git repository using pip:
14 | ```bash
15 | $ pip install git+https://github.com/alirezamika/autoscraper.git
16 | ```
17 | 
18 | - Install from PyPI:
19 | ```bash
20 | $ pip install autoscraper
21 | ```
22 | 
23 | - Install from source:
24 | ```bash
25 | $ python setup.py install
26 | ```
27 | 
28 | ## How to use
29 | 
30 | ### Getting similar results
31 | 
32 | Say we want to fetch all related post titles on a Stack Overflow page:
33 | 
34 | ```python
35 | from autoscraper import AutoScraper
36 | 
37 | url = 'https://stackoverflow.com/questions/2081586/web-scraping-with-python'
38 | 
39 | # We can add one or multiple candidates here.
40 | # You can also put URLs here to retrieve URLs.
41 | wanted_list = ["What are metaclasses in Python?"]
42 | 
43 | scraper = AutoScraper()
44 | result = scraper.build(url, wanted_list)
45 | print(result)
46 | ```
47 | 
48 | Here's the output:
49 | ```python
50 | [
51 |     'How do I merge two dictionaries in a single expression in Python (taking union of dictionaries)?',
52 |     'How to call an external command?',
53 |     'What are metaclasses in Python?',
54 |     'Does Python have a ternary conditional operator?',
55 |     'How do you remove duplicates from a list whilst preserving order?',
56 |     'Convert bytes to a string',
57 |     'How to get line count of a large file cheaply in Python?',
58 |     "Does Python have a string 'contains' substring method?",
59 |     'Why is “1000000000000000 in range(1000000000000001)” so fast in Python 3?'
60 | ]
61 | ```
62 | Now you can use the `scraper` object to get related topics of any Stack Overflow page:
63 | ```python
64 | scraper.get_result_similar('https://stackoverflow.com/questions/606191/convert-bytes-to-a-string')
65 | ```
66 | 
67 | ### Getting exact result
68 | 
69 | Say we want to scrape live stock prices from Yahoo Finance:
70 | 
71 | ```python
72 | from autoscraper import AutoScraper
73 | 
74 | url = 'https://finance.yahoo.com/quote/AAPL/'
75 | 
76 | wanted_list = ["124.81"]
77 | 
78 | scraper = AutoScraper()
79 | 
80 | # Here we can also pass html content via the html parameter instead of the url (html=html_content)
81 | result = scraper.build(url, wanted_list)
82 | print(result)
83 | ```
84 | Note that you should update the `wanted_list` if you want to copy this code, as the content of the page changes dynamically.
85 | 
86 | You can also pass any custom `requests` module parameters. For example, you may want to use proxies or custom headers:
87 | 
88 | ```python
89 | proxies = {
90 |     "http": 'http://127.0.0.1:8001',
91 |     "https": 'https://127.0.0.1:8001',
92 | }
93 | 
94 | result = scraper.build(url, wanted_list, request_args=dict(proxies=proxies))
95 | ```
96 | 
97 | Now we can get the price of any symbol:
98 | 
99 | ```python
100 | scraper.get_result_exact('https://finance.yahoo.com/quote/MSFT/')
101 | ```
102 | 
103 | **You may want to get other info as well.** For example, if you want to get the market cap too, you can just append it to the wanted list. The `get_result_exact` method will then retrieve the data in the same order as the wanted list, as sketched below.
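Here is a minimal sketch of that idea. The two sample values below are placeholders for whatever price and market cap Yahoo Finance is showing when you build the model; as with the earlier example, copy the current values from the page first.

```python
from autoscraper import AutoScraper

url = 'https://finance.yahoo.com/quote/AAPL/'

# Placeholder samples: replace these with the price and market cap
# currently displayed on the page before building the model.
wanted_list = ["124.81", "2.07T"]

scraper = AutoScraper()
scraper.build(url, wanted_list)

# get_result_exact returns the data in the same order as the wanted list,
# so the first item is the price and the second is the market cap.
result = scraper.get_result_exact('https://finance.yahoo.com/quote/MSFT/')
print(result)
```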
104 | 
105 | **Another example:** Say we want to scrape the About text, the number of stars, and the link to the issues page of a GitHub repo:
106 | 
107 | ```python
108 | from autoscraper import AutoScraper
109 | 
110 | url = 'https://github.com/alirezamika/autoscraper'
111 | 
112 | wanted_list = ['A Smart, Automatic, Fast and Lightweight Web Scraper for Python', '2.5k', 'https://github.com/alirezamika/autoscraper/issues']
113 | 
114 | scraper = AutoScraper()
115 | scraper.build(url, wanted_list)
116 | ```
117 | 
118 | Simple, right?
119 | 
120 | 
121 | ### Saving the model
122 | 
123 | We can now save the built model to use it later. To save:
124 | 
125 | ```python
126 | # Give it a file path
127 | scraper.save('yahoo-finance')
128 | ```
129 | 
130 | And to load:
131 | 
132 | ```python
133 | scraper.load('yahoo-finance')
134 | ```
135 | 
136 | ## Tutorials
137 | 
138 | - See [this gist](https://gist.github.com/alirezamika/72083221891eecd991bbc0a2a2467673) for more advanced usages.
139 | - [AutoScraper and Flask: Create an API From Any Website in Less Than 5 Minutes](https://medium.com/better-programming/autoscraper-and-flask-create-an-api-from-any-website-in-less-than-5-minutes-3f0f176fc4a3)
140 | 
141 | ## Issues
142 | Feel free to open an issue if you have any problem using the module.
143 | 
144 | 
145 | ## Support the project
146 | 
147 | Buy Me A Coffee
148 | 
149 | 
150 | #### Happy Coding ♥️
151 | 
--------------------------------------------------------------------------------
/autoscraper/__init__.py:
--------------------------------------------------------------------------------
1 | from autoscraper.auto_scraper import AutoScraper
2 | 
--------------------------------------------------------------------------------
/autoscraper/auto_scraper.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import json
3 | from collections import defaultdict
4 | from html import unescape
5 | from urllib.parse import urljoin, urlparse
6 | 
7 | import requests
8 | from bs4 import BeautifulSoup
9 | 
10 | from autoscraper.utils import (
11 |     FuzzyText,
12 |     ResultItem,
13 |     get_non_rec_text,
14 |     get_random_str,
15 |     normalize,
16 |     text_match,
17 |     unique_hashable,
18 |     unique_stack_list,
19 | )
20 | 
21 | 
22 | class AutoScraper(object):
23 |     """
24 |     AutoScraper : A Smart, Automatic, Fast and Lightweight Web Scraper for Python.
25 |     AutoScraper automatically learns a set of rules required to extract the needed content
26 |     from a web page. So the programmer doesn't need to explicitly construct the rules.
27 | 
28 |     Attributes
29 |     ----------
30 |     stack_list: list
31 |         List of rules learned by AutoScraper
32 | 
33 |     Methods
34 |     -------
35 |     build() - Learns a set of rules represented as stack_list based on the wanted_list,
36 |         which can be reused for scraping similar elements from other web pages in the future.
37 |     get_result_similar() - Gets similar results based on the previously learned rules.
38 |     get_result_exact() - Gets exact results based on the previously learned rules.
39 |     get_result() - Gets exact and similar results based on the previously learned rules.
40 |     save() - Serializes the stack_list as JSON and saves it to disk.
41 |     load() - De-serializes the JSON representation of the stack_list and loads it back.
42 |     remove_rules() - Removes one or more learned rule[s] from the stack_list.
43 |     keep_rules() - Keeps only the specified learned rules in the stack_list and removes the others.
44 | """ 45 | 46 | request_headers = { 47 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 \ 48 | (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36" 49 | } 50 | 51 | def __init__(self, stack_list=None): 52 | self.stack_list = stack_list or [] 53 | 54 | def save(self, file_path): 55 | """ 56 | Serializes the stack_list as JSON and saves it to the disk. 57 | 58 | Parameters 59 | ---------- 60 | file_path: str 61 | Path of the JSON output 62 | 63 | Returns 64 | ------- 65 | None 66 | """ 67 | 68 | data = dict(stack_list=self.stack_list) 69 | with open(file_path, "w") as f: 70 | json.dump(data, f) 71 | 72 | def load(self, file_path): 73 | """ 74 | De-serializes the JSON representation of the stack_list and loads it back. 75 | 76 | Parameters 77 | ---------- 78 | file_path: str 79 | Path of the JSON file to load stack_list from. 80 | 81 | Returns 82 | ------- 83 | None 84 | """ 85 | 86 | with open(file_path, "r") as f: 87 | data = json.load(f) 88 | 89 | # for backward compatibility 90 | if isinstance(data, list): 91 | self.stack_list = data 92 | return 93 | 94 | self.stack_list = data["stack_list"] 95 | 96 | @classmethod 97 | def _fetch_html(cls, url, request_args=None): 98 | request_args = request_args or {} 99 | headers = dict(cls.request_headers) 100 | if url: 101 | headers["Host"] = urlparse(url).netloc 102 | 103 | user_headers = request_args.pop("headers", {}) 104 | headers.update(user_headers) 105 | res = requests.get(url, headers=headers, **request_args) 106 | if res.encoding == "ISO-8859-1" and not "ISO-8859-1" in res.headers.get( 107 | "Content-Type", "" 108 | ): 109 | res.encoding = res.apparent_encoding 110 | html = res.text 111 | return html 112 | 113 | @classmethod 114 | def _get_soup(cls, url=None, html=None, request_args=None): 115 | if html: 116 | html = normalize(unescape(html)) 117 | return BeautifulSoup(html, "lxml") 118 | 119 | html = cls._fetch_html(url, request_args) 120 | html = normalize(unescape(html)) 121 | 122 | return BeautifulSoup(html, "lxml") 123 | 124 | @staticmethod 125 | def _get_valid_attrs(item): 126 | key_attrs = {"class", "style"} 127 | attrs = { 128 | k: v if v != [] else "" for k, v in item.attrs.items() if k in key_attrs 129 | } 130 | 131 | for attr in key_attrs: 132 | if attr not in attrs: 133 | attrs[attr] = "" 134 | return attrs 135 | 136 | @staticmethod 137 | def _child_has_text(child, text, url, text_fuzz_ratio): 138 | child_text = child.getText().strip() 139 | 140 | if text_match(text, child_text, text_fuzz_ratio): 141 | parent_text = child.parent.getText().strip() 142 | if child_text == parent_text and child.parent.parent: 143 | return False 144 | 145 | child.wanted_attr = None 146 | return True 147 | 148 | if text_match(text, get_non_rec_text(child), text_fuzz_ratio): 149 | child.is_non_rec_text = True 150 | child.wanted_attr = None 151 | return True 152 | 153 | for key, value in child.attrs.items(): 154 | if not isinstance(value, str): 155 | continue 156 | 157 | value = value.strip() 158 | if text_match(text, value, text_fuzz_ratio): 159 | child.wanted_attr = key 160 | return True 161 | 162 | if key in {"href", "src"}: 163 | full_url = urljoin(url, value) 164 | if text_match(text, full_url, text_fuzz_ratio): 165 | child.wanted_attr = key 166 | child.is_full_url = True 167 | return True 168 | 169 | return False 170 | 171 | def _get_children(self, soup, text, url, text_fuzz_ratio): 172 | children = reversed(soup.findChildren()) 173 | children = [ 174 | x for x in children if self._child_has_text(x, text, 
url, text_fuzz_ratio) 175 | ] 176 | return children 177 | 178 | def build( 179 | self, 180 | url=None, 181 | wanted_list=None, 182 | wanted_dict=None, 183 | html=None, 184 | request_args=None, 185 | update=False, 186 | text_fuzz_ratio=1.0, 187 | ): 188 | """ 189 | Automatically constructs a set of rules to scrape the specified target[s] from a web page. 190 | The rules are represented as stack_list. 191 | 192 | Parameters: 193 | ---------- 194 | url: str, optional 195 | URL of the target web page. You should either pass url or html or both. 196 | 197 | wanted_list: list of strings or compiled regular expressions, optional 198 | A list of needed contents to be scraped. 199 | AutoScraper learns a set of rules to scrape these targets. If specified, 200 | wanted_dict will be ignored. 201 | 202 | wanted_dict: dict, optional 203 | A dict of needed contents to be scraped. Keys are aliases and values are list of target texts 204 | or compiled regular expressions. 205 | AutoScraper learns a set of rules to scrape these targets and sets its aliases. 206 | 207 | html: str, optional 208 | An HTML string can also be passed instead of URL. 209 | You should either pass url or html or both. 210 | 211 | request_args: dict, optional 212 | A dictionary used to specify a set of additional request parameters used by requests 213 | module. You can specify proxy URLs, custom headers etc. 214 | 215 | update: bool, optional, defaults to False 216 | If True, new learned rules will be added to the previous ones. 217 | If False, all previously learned rules will be removed. 218 | 219 | text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 220 | The fuzziness ratio threshold for matching the wanted contents. 221 | 222 | Returns: 223 | -------- 224 | List of similar results 225 | """ 226 | 227 | soup = self._get_soup(url=url, html=html, request_args=request_args) 228 | 229 | result_list = [] 230 | 231 | if update is False: 232 | self.stack_list = [] 233 | 234 | if wanted_list: 235 | wanted_dict = {"": wanted_list} 236 | 237 | wanted_list = [] 238 | 239 | for alias, wanted_items in wanted_dict.items(): 240 | wanted_items = [normalize(w) for w in wanted_items] 241 | wanted_list += wanted_items 242 | 243 | for wanted in wanted_items: 244 | children = self._get_children(soup, wanted, url, text_fuzz_ratio) 245 | 246 | for child in children: 247 | result, stack = self._get_result_for_child(child, soup, url) 248 | stack["alias"] = alias 249 | result_list += result 250 | self.stack_list.append(stack) 251 | 252 | result_list = [item.text for item in result_list] 253 | result_list = unique_hashable(result_list) 254 | 255 | self.stack_list = unique_stack_list(self.stack_list) 256 | return result_list 257 | 258 | @classmethod 259 | def _build_stack(cls, child, url): 260 | content = [(child.name, cls._get_valid_attrs(child))] 261 | 262 | parent = child 263 | while True: 264 | grand_parent = parent.findParent() 265 | if not grand_parent: 266 | break 267 | 268 | children = grand_parent.findAll( 269 | parent.name, cls._get_valid_attrs(parent), recursive=False 270 | ) 271 | for i, c in enumerate(children): 272 | if c == parent: 273 | content.insert( 274 | 0, (grand_parent.name, cls._get_valid_attrs(grand_parent), i) 275 | ) 276 | break 277 | 278 | if not grand_parent.parent: 279 | break 280 | 281 | parent = grand_parent 282 | 283 | wanted_attr = getattr(child, "wanted_attr", None) 284 | is_full_url = getattr(child, "is_full_url", False) 285 | is_non_rec_text = getattr(child, "is_non_rec_text", False) 286 | stack = dict( 287 
| content=content, 288 | wanted_attr=wanted_attr, 289 | is_full_url=is_full_url, 290 | is_non_rec_text=is_non_rec_text, 291 | ) 292 | stack["url"] = url if is_full_url else "" 293 | stack["hash"] = hashlib.sha256(str(stack).encode("utf-8")).hexdigest() 294 | stack["stack_id"] = "rule_" + get_random_str(4) 295 | return stack 296 | 297 | def _get_result_for_child(self, child, soup, url): 298 | stack = self._build_stack(child, url) 299 | result = self._get_result_with_stack(stack, soup, url, 1.0) 300 | return result, stack 301 | 302 | @staticmethod 303 | def _fetch_result_from_child(child, wanted_attr, is_full_url, url, is_non_rec_text): 304 | if wanted_attr is None: 305 | if is_non_rec_text: 306 | return get_non_rec_text(child) 307 | return child.getText().strip() 308 | 309 | if wanted_attr not in child.attrs: 310 | return None 311 | 312 | if is_full_url: 313 | return urljoin(url, child.attrs[wanted_attr]) 314 | 315 | return child.attrs[wanted_attr] 316 | 317 | @staticmethod 318 | def _get_fuzzy_attrs(attrs, attr_fuzz_ratio): 319 | attrs = dict(attrs) 320 | for key, val in attrs.items(): 321 | if isinstance(val, str) and val: 322 | val = FuzzyText(val, attr_fuzz_ratio) 323 | elif isinstance(val, (list, tuple)): 324 | val = [FuzzyText(x, attr_fuzz_ratio) if x else x for x in val] 325 | attrs[key] = val 326 | return attrs 327 | 328 | def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs): 329 | parents = [soup] 330 | stack_content = stack["content"] 331 | contain_sibling_leaves = kwargs.get("contain_sibling_leaves", False) 332 | for index, item in enumerate(stack_content): 333 | children = [] 334 | if item[0] == "[document]": 335 | continue 336 | for parent in parents: 337 | 338 | attrs = item[1] 339 | if attr_fuzz_ratio < 1.0: 340 | attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio) 341 | 342 | found = parent.findAll(item[0], attrs, recursive=False) 343 | if not found: 344 | continue 345 | 346 | if not contain_sibling_leaves and index == len(stack_content) - 1: 347 | idx = min(len(found) - 1, stack_content[index - 1][2]) 348 | found = [found[idx]] 349 | 350 | children += found 351 | 352 | parents = children 353 | 354 | wanted_attr = stack["wanted_attr"] 355 | is_full_url = stack["is_full_url"] 356 | is_non_rec_text = stack.get("is_non_rec_text", False) 357 | result = [ 358 | ResultItem( 359 | self._fetch_result_from_child( 360 | i, wanted_attr, is_full_url, url, is_non_rec_text 361 | ), 362 | getattr(i, "child_index", 0), 363 | ) 364 | for i in parents 365 | ] 366 | if not kwargs.get("keep_blank", False): 367 | result = [x for x in result if x.text] 368 | return result 369 | 370 | def _get_result_with_stack_index_based( 371 | self, stack, soup, url, attr_fuzz_ratio, **kwargs 372 | ): 373 | p = soup.findChildren(recursive=False)[0] 374 | stack_content = stack["content"] 375 | for index, item in enumerate(stack_content[:-1]): 376 | if item[0] == "[document]": 377 | continue 378 | content = stack_content[index + 1] 379 | attrs = content[1] 380 | if attr_fuzz_ratio < 1.0: 381 | attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio) 382 | p = p.findAll(content[0], attrs, recursive=False) 383 | if not p: 384 | return [] 385 | idx = min(len(p) - 1, item[2]) 386 | p = p[idx] 387 | 388 | result = [ 389 | ResultItem( 390 | self._fetch_result_from_child( 391 | p, 392 | stack["wanted_attr"], 393 | stack["is_full_url"], 394 | url, 395 | stack["is_non_rec_text"], 396 | ), 397 | getattr(p, "child_index", 0), 398 | ) 399 | ] 400 | if not kwargs.get("keep_blank", False): 401 | result 
= [x for x in result if x.text] 402 | return result 403 | 404 | def _get_result_by_func( 405 | self, 406 | func, 407 | url, 408 | html, 409 | soup, 410 | request_args, 411 | grouped, 412 | group_by_alias, 413 | unique, 414 | attr_fuzz_ratio, 415 | **kwargs 416 | ): 417 | if not soup: 418 | soup = self._get_soup(url=url, html=html, request_args=request_args) 419 | 420 | keep_order = kwargs.get("keep_order", False) 421 | 422 | if group_by_alias or (keep_order and not grouped): 423 | for index, child in enumerate(soup.findChildren()): 424 | setattr(child, "child_index", index) 425 | 426 | result_list = [] 427 | grouped_result = defaultdict(list) 428 | for stack in self.stack_list: 429 | if not url: 430 | url = stack.get("url", "") 431 | 432 | result = func(stack, soup, url, attr_fuzz_ratio, **kwargs) 433 | 434 | if not grouped and not group_by_alias: 435 | result_list += result 436 | continue 437 | 438 | group_id = stack.get("alias", "") if group_by_alias else stack["stack_id"] 439 | grouped_result[group_id] += result 440 | 441 | return self._clean_result( 442 | result_list, grouped_result, grouped, group_by_alias, unique, keep_order 443 | ) 444 | 445 | @staticmethod 446 | def _clean_result( 447 | result_list, grouped_result, grouped, grouped_by_alias, unique, keep_order 448 | ): 449 | if not grouped and not grouped_by_alias: 450 | if unique is None: 451 | unique = True 452 | if keep_order: 453 | result_list = sorted(result_list, key=lambda x: x.index) 454 | result = [x.text for x in result_list] 455 | if unique: 456 | result = unique_hashable(result) 457 | return result 458 | 459 | for k, val in grouped_result.items(): 460 | if grouped_by_alias: 461 | val = sorted(val, key=lambda x: x.index) 462 | val = [x.text for x in val] 463 | if unique: 464 | val = unique_hashable(val) 465 | grouped_result[k] = val 466 | 467 | return dict(grouped_result) 468 | 469 | def get_result_similar( 470 | self, 471 | url=None, 472 | html=None, 473 | soup=None, 474 | request_args=None, 475 | grouped=False, 476 | group_by_alias=False, 477 | unique=None, 478 | attr_fuzz_ratio=1.0, 479 | keep_blank=False, 480 | keep_order=False, 481 | contain_sibling_leaves=False, 482 | ): 483 | """ 484 | Gets similar results based on the previously learned rules. 485 | 486 | Parameters: 487 | ---------- 488 | url: str, optional 489 | URL of the target web page. You should either pass url or html or both. 490 | 491 | html: str, optional 492 | An HTML string can also be passed instead of URL. 493 | You should either pass url or html or both. 494 | 495 | request_args: dict, optional 496 | A dictionary used to specify a set of additional request parameters used by requests 497 | module. You can specify proxy URLs, custom headers etc. 498 | 499 | grouped: bool, optional, defaults to False 500 | If set to True, the result will be a dictionary with the rule_ids as keys 501 | and a list of scraped data per rule as values. 502 | 503 | group_by_alias: bool, optional, defaults to False 504 | If set to True, the result will be a dictionary with the rule alias as keys 505 | and a list of scraped data per alias as values. 506 | 507 | unique: bool, optional, defaults to True for non grouped results and 508 | False for grouped results. 509 | If set to True, will remove duplicates from returned result list. 510 | 511 | attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 512 | The fuzziness ratio threshold for matching html tag attributes. 
513 | 514 | keep_blank: bool, optional, defaults to False 515 | If set to True, missing values will be returned as empty strings. 516 | 517 | keep_order: bool, optional, defaults to False 518 | If set to True, the results will be ordered as they are present on the web page. 519 | 520 | contain_sibling_leaves: bool, optional, defaults to False 521 | If set to True, the results will also contain the sibling leaves of the wanted elements. 522 | 523 | Returns: 524 | -------- 525 | List of similar results scraped from the web page. 526 | Dictionary if grouped=True or group_by_alias=True. 527 | """ 528 | 529 | func = self._get_result_with_stack 530 | return self._get_result_by_func( 531 | func, 532 | url, 533 | html, 534 | soup, 535 | request_args, 536 | grouped, 537 | group_by_alias, 538 | unique, 539 | attr_fuzz_ratio, 540 | keep_blank=keep_blank, 541 | keep_order=keep_order, 542 | contain_sibling_leaves=contain_sibling_leaves, 543 | ) 544 | 545 | def get_result_exact( 546 | self, 547 | url=None, 548 | html=None, 549 | soup=None, 550 | request_args=None, 551 | grouped=False, 552 | group_by_alias=False, 553 | unique=None, 554 | attr_fuzz_ratio=1.0, 555 | keep_blank=False, 556 | ): 557 | """ 558 | Gets exact results based on the previously learned rules. 559 | 560 | Parameters: 561 | ---------- 562 | url: str, optional 563 | URL of the target web page. You should either pass url or html or both. 564 | 565 | html: str, optional 566 | An HTML string can also be passed instead of URL. 567 | You should either pass url or html or both. 568 | 569 | request_args: dict, optional 570 | A dictionary used to specify a set of additional request parameters used by requests 571 | module. You can specify proxy URLs, custom headers etc. 572 | 573 | grouped: bool, optional, defaults to False 574 | If set to True, the result will be a dictionary with the rule_ids as keys 575 | and a list of scraped data per rule as values. 576 | 577 | group_by_alias: bool, optional, defaults to False 578 | If set to True, the result will be a dictionary with the rule alias as keys 579 | and a list of scraped data per alias as values. 580 | 581 | unique: bool, optional, defaults to True for non grouped results and 582 | False for grouped results. 583 | If set to True, will remove duplicates from returned result list. 584 | 585 | attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 586 | The fuzziness ratio threshold for matching html tag attributes. 587 | 588 | keep_blank: bool, optional, defaults to False 589 | If set to True, missing values will be returned as empty strings. 590 | 591 | Returns: 592 | -------- 593 | List of exact results scraped from the web page. 594 | Dictionary if grouped=True or group_by_alias=True. 595 | """ 596 | 597 | func = self._get_result_with_stack_index_based 598 | return self._get_result_by_func( 599 | func, 600 | url, 601 | html, 602 | soup, 603 | request_args, 604 | grouped, 605 | group_by_alias, 606 | unique, 607 | attr_fuzz_ratio, 608 | keep_blank=keep_blank, 609 | ) 610 | 611 | def get_result( 612 | self, 613 | url=None, 614 | html=None, 615 | request_args=None, 616 | grouped=False, 617 | group_by_alias=False, 618 | unique=None, 619 | attr_fuzz_ratio=1.0, 620 | ): 621 | """ 622 | Gets similar and exact results based on the previously learned rules. 623 | 624 | Parameters: 625 | ---------- 626 | url: str, optional 627 | URL of the target web page. You should either pass url or html or both. 628 | 629 | html: str, optional 630 | An HTML string can also be passed instead of URL. 
631 | You should either pass url or html or both. 632 | 633 | request_args: dict, optional 634 | A dictionary used to specify a set of additional request parameters used by requests 635 | module. You can specify proxy URLs, custom headers etc. 636 | 637 | grouped: bool, optional, defaults to False 638 | If set to True, the result will be dictionaries with the rule_ids as keys 639 | and a list of scraped data per rule as values. 640 | 641 | group_by_alias: bool, optional, defaults to False 642 | If set to True, the result will be a dictionary with the rule alias as keys 643 | and a list of scraped data per alias as values. 644 | 645 | unique: bool, optional, defaults to True for non grouped results and 646 | False for grouped results. 647 | If set to True, will remove duplicates from returned result list. 648 | 649 | attr_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0 650 | The fuzziness ratio threshold for matching html tag attributes. 651 | 652 | Returns: 653 | -------- 654 | Pair of (similar, exact) results. 655 | See get_result_similar and get_result_exact methods. 656 | """ 657 | 658 | soup = self._get_soup(url=url, html=html, request_args=request_args) 659 | args = dict( 660 | url=url, 661 | soup=soup, 662 | grouped=grouped, 663 | group_by_alias=group_by_alias, 664 | unique=unique, 665 | attr_fuzz_ratio=attr_fuzz_ratio, 666 | ) 667 | similar = self.get_result_similar(**args) 668 | exact = self.get_result_exact(**args) 669 | return similar, exact 670 | 671 | def remove_rules(self, rules): 672 | """ 673 | Removes a list of learned rules from stack_list. 674 | 675 | Parameters: 676 | ---------- 677 | rules : list 678 | A list of rules to be removed 679 | 680 | Returns: 681 | -------- 682 | None 683 | """ 684 | 685 | self.stack_list = [x for x in self.stack_list if x["stack_id"] not in rules] 686 | 687 | def keep_rules(self, rules): 688 | """ 689 | Removes all other rules except the specified ones. 690 | 691 | Parameters: 692 | ---------- 693 | rules : list 694 | A list of rules to keep in stack_list and removing the rest. 695 | 696 | Returns: 697 | -------- 698 | None 699 | """ 700 | 701 | self.stack_list = [x for x in self.stack_list if x["stack_id"] in rules] 702 | 703 | def set_rule_aliases(self, rule_aliases): 704 | """ 705 | Sets the specified alias for each rule 706 | 707 | Parameters: 708 | ---------- 709 | rule_aliases : dict 710 | A dictionary with keys of rule_id and values of alias 711 | 712 | Returns: 713 | -------- 714 | None 715 | """ 716 | 717 | id_to_stack = {stack["stack_id"]: stack for stack in self.stack_list} 718 | for rule_id, alias in rule_aliases.items(): 719 | id_to_stack[rule_id]["alias"] = alias 720 | 721 | def generate_python_code(self): 722 | # deprecated 723 | print("This function is deprecated. Please use save() and load() instead.") 724 | -------------------------------------------------------------------------------- /autoscraper/utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import random 4 | import string 5 | import unicodedata 6 | 7 | from difflib import SequenceMatcher 8 | 9 | 10 | def unique_stack_list(stack_list): 11 | seen = set() 12 | unique_list = [] 13 | for stack in stack_list: 14 | stack_hash = stack['hash'] 15 | if stack_hash in seen: 16 | continue 17 | unique_list.append(stack) 18 | seen.add(stack_hash) 19 | return unique_list 20 | 21 | 22 | def unique_hashable(hashable_items): 23 | """Removes duplicates from the list. 
Must preserve the orders.""" 24 | return list(OrderedDict.fromkeys(hashable_items)) 25 | 26 | 27 | def get_random_str(n): 28 | chars = string.ascii_lowercase + string.digits 29 | return ''.join(random.choice(chars) for i in range(n)) 30 | 31 | 32 | def get_non_rec_text(element): 33 | return ''.join(element.find_all(text=True, recursive=False)).strip() 34 | 35 | 36 | def normalize(item): 37 | if not isinstance(item, str): 38 | return item 39 | return unicodedata.normalize("NFKD", item.strip()) 40 | 41 | 42 | def text_match(t1, t2, ratio_limit): 43 | if hasattr(t1, 'fullmatch'): 44 | return bool(t1.fullmatch(t2)) 45 | if ratio_limit >= 1: 46 | return t1 == t2 47 | return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit 48 | 49 | 50 | class ResultItem(): 51 | def __init__(self, text, index): 52 | self.text = text 53 | self.index = index 54 | 55 | def __str__(self): 56 | return self.text 57 | 58 | 59 | class FuzzyText(object): 60 | def __init__(self, text, ratio_limit): 61 | self.text = text 62 | self.ratio_limit = ratio_limit 63 | self.match = None 64 | 65 | def search(self, text): 66 | return SequenceMatcher(None, self.text, text).ratio() >= self.ratio_limit 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from codecs import open 2 | from os import path 3 | 4 | from setuptools import find_packages, setup 5 | 6 | here = path.abspath(path.dirname(__file__)) 7 | 8 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | setup( 12 | name="autoscraper", 13 | version="1.1.14", 14 | description="A Smart, Automatic, Fast and Lightweight Web Scraper for Python", 15 | long_description_content_type="text/markdown", 16 | long_description=long_description, 17 | url="https://github.com/alirezamika/autoscraper", 18 | author="Alireza Mika", 19 | author_email="alirezamika@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | "Development Status :: 4 - Beta", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | ], 26 | keywords="scraping - scraper", 27 | packages=find_packages(exclude=["contrib", "docs", "tests"]), 28 | python_requires=">=3.6", 29 | install_requires=["requests", "bs4", "lxml"], 30 | ) 31 | --------------------------------------------------------------------------------