├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── article.html
├── people_also_ask
│   ├── __init__.py
│   ├── data_collector.py
│   ├── exceptions.py
│   ├── google.py
│   ├── parser.py
│   ├── plugins
│   │   ├── __init__.py
│   │   └── article_generator
│   │       ├── __init__.py
│   │       ├── article_generators.py
│   │       └── templates
│   │           └── base.html
│   ├── request
│   │   ├── __init__.py
│   │   └── session.py
│   ├── tests
│   │   ├── fixtures
│   │   │   ├── cheetah_vs_lion.html
│   │   │   ├── gangnam_style.html
│   │   │   ├── how_to_make_a_cold_brew_coffee.html
│   │   │   ├── the_10_highest-grossing_movies_of_all_time.html
│   │   │   ├── what_are_3_basic_programming_languages.html
│   │   │   ├── what_time_is_it.html
│   │   │   ├── why_was_ho_chi_minh_a_hero.html
│   │   │   └── world_university_rankings_2019.html
│   │   ├── test_google.py
│   │   ├── test_parser.py
│   │   ├── where_is_france
│   │   └── who_is_ho_chi_minh?
│   └── tools.py
└── setup.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*__pycache__*
*.swp
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 LE Van Tuan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include people_also_ask/plugins/article_generator/templates/*.html
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# People-also-ask API

[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
[![PyPI](https://img.shields.io/pypi/v/people_also_ask.svg)](https://pypi.org/project/people-also-ask)
[![versions](https://img.shields.io/pypi/pyversions/people_also_ask.svg)](https://github.com/lagranges/people_also_ask)

People-also-ask provides APIs to easily crawl the data of Google Featured Snippets.

## ⚠ Warning

Search engines like Google do not allow any sort of automated access to their service, but from a legal point of view there is no known court case or broken law. Google does not take legal action against scraping, likely for self-protective reasons. The API has been configured not to abuse the Google search engine.
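The throttle lives in ``people_also_ask/request/session.py`` and can be tuned through environment variables (defaults: 3 retries, 25 requests per 60 seconds). A minimal sketch, with the default values shown:

```python
# set these before importing people_also_ask; the values shown are the defaults
import os

os.environ["RELATED_QUESTION_NB_TIMES_RETRY"] = "3"
os.environ["RELATED_QUESTION_NB_REQUESTS_LIMIT"] = "25"
os.environ["RELATED_QUESTION_NB_REQUESTS_DURATION_LIMIT"] = "60"  # seconds

import people_also_ask  # noqa: E402
```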
## Installation

```
pip install people_also_ask
```

## Usage

The goal of ``people_also_ask`` is to provide a simple and easy-to-use API for retrieving information from Google Featured Snippets.

### Importing

```python
import people_also_ask
```

### How to get related questions

```python
people_also_ask.get_related_questions("coffee")

['Is coffee good for your health?',
 'Why is coffee bad for you?',
 'Who invented coffee?',
 'What do u know about coffee?']
```

### How to get more questions

```python
people_also_ask.get_related_questions("coffee", 5)

['How did coffee originate?',
 'Is coffee good for your health?',
 'Who brought coffee America?',
 'Who invented coffee?',
 'Why is coffee bad for you?',
 'Why is drinking coffee bad for you?']
```

### Generate unlimited questions

```python
for question in people_also_ask.generate_related_questions("coffee"):
    print(question)

Why is coffee bad for you?
Who invented coffee?
Is coffee good for your health?
Who brought coffee America?
How did coffee originate?
Why is drinking coffee bad for you?
....
```

### Get answer for a question

```python
people_also_ask.get_answer("Why is coffee bad for you?")

{'has_answer': True,
 'question': 'Why is coffee bad for you?',
 'related_questions': ['Why is drinking coffee bad for you?',
  'Is coffee good for your health?',
  'Is coffee toxic to your body?',
  'What does coffee do to your body?'],
 'response': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018',
 'heading': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018',
 'title': 'Coffee — Good or Bad? - Healthline',
 'link': 'https://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).',
 'displayed_link': 'www.healthline.com › nutrition › coffee-good-or-bad',
 'snippet_str': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018\nwww.healthline.com › nutrition › coffee-good-or-bad\nhttps://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).\nCoffee — Good or Bad? - Healthline',
 'snippet_data': None,
 'date': None,
 'snippet_type': 'Definition Featured Snippet',
 'snippet_str_body': '',
 'raw_text': 'Featured snippet from the web\nConsuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If \nyou\n are sensitive to caffeine and tend to become overstimulated, \n may want to avoid \ncoffee\n altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).\nAug 30, 2018\nCoffee — Good or Bad? - Healthline\nwww.healthline.com\n › nutrition › coffee-good-or-bad'}
```

### Get Simple Answer for a question

```python
people_also_ask.get_simple_answer("Why is coffee bad for you?")

'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018'
```

### Generate questions and answers related to a subject

``generate_answer`` returns a generator of answer dictionaries, so iterate over it:

```python
for answer in people_also_ask.generate_answer("coffee"):
    print(answer["question"])
```

### Using proxies

```python
import people_also_ask.request.session

people_also_ask.request.session.set_proxies(
    (
        "http://1234.5.6.7:8080",
        "http://1237.5.6.7:8080",
    )
)
```

### Using a Google domain other than the global one

The default domain is ".com", but it does not always return good PAA results for keywords in languages other than English.

Example of usage:

```python
import people_also_ask

keyword = "kawa"
paa = people_also_ask.get_related_questions(text=keyword, max_nb_questions=10, domain="pl")
print(paa)
```

Output:

```python
['Na co dobra jest kawa?', 'Jakie są wady picia kawy?', 'Kiedy kawa jest zdrowa?', 'Jakie są minusy picia kawy?', 'Na co wpływa kawa?', 'Na jakie choroby pomaga kawa?', 'Jaka kawa jest najlepsza?', 'Jakie skutki uboczne ma kawa?', 'Jaka kawa ma najlepszy smak?', 'Jaka kawa smakuje dobrze?', 'Na co pomaga picie kawy?']
```
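### Generating a whole article

The package also ships an article-generator plugin (``people_also_ask/plugins/article_generator``) that collects up to ten related questions with their answers and renders them into an ``article.html`` file in the current directory; the ``article.html`` at the repository root is sample output. A minimal sketch of its use:

```python
from people_also_ask.plugins.article_generator import generate_article

# collects up to 10 related questions and their answers for the title,
# then renders templates/base.html to ./article.html
generate_article("coffee")
```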

--------------------------------------------------------------------------------
/article.html:
--------------------------------------------------------------------------------
<html>
<head>
<title>The role of ORM (Online Review Management) in digital marketing for your business</title>
</head>
<body>
<h1>The role of ORM (Online Review Management) in digital marketing for your business</h1>
Online reputation management (ORM) is about monitoring and improving how your business is viewed online. It means looking and analyzing what a potential customer, reporter, or partner will discover about your brand, your people, or your product/service when they perform a Google search.
<!-- table-of-contents markup lost in extraction -->
<div>
<h2 id="What_is_the_role_of_ORM_in_digital_marketing?">What is the role of ORM in digital marketing?</h2>
<div>
Online reputation management (ORM) is about monitoring and improving how your business is viewed online. It means looking and analyzing what a potential customer, reporter, or partner will discover about your brand, your people, or your product/service when they perform a Google search.
</div>
</div>
<div>
<h2 id="What_is_the_role_of_ORM?">What is the role of ORM?</h2>
<div>
Online reputation management (ORM) is about monitoring and improving how your business is viewed online. It means looking and analyzing what a potential customer, reporter, or partner will discover about your brand, your people, or your product/service when they perform a Google search.
</div>
</div>
<div>
<h2 id="Is_ORM_part_of_digital_marketing?">Is ORM part of digital marketing?</h2>
<div>
ORM in digital marketing assists businesses in establishing and maintaining a positive brand image on the internet.
</div>
</div>
<div>
<h2 id="What_is_the_role_of_online_reputation_management?">What is the role of online reputation management?</h2>
<div>
Online reputation management includes having accurate business listings and relevant advertisements, staying on top of engagement with consumers on your company's marketplace and social channels, understanding their pain points and responding to reviews and surveys.Feb 12, 2021
</div>
</div>
</body>
</html>
--------------------------------------------------------------------------------
/people_also_ask/__init__.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3
from people_also_ask.google import (
    get_answer,
    generate_answer,
    get_simple_answer,
    get_related_questions,
    generate_related_questions,
)
--------------------------------------------------------------------------------
/people_also_ask/data_collector.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3
import time
import json
import argparse
import traceback
from collections import OrderedDict
from people_also_ask.google import get_simple_answer
from people_also_ask.exceptions import (
    InvalidQuestionInputFileError,
    FailedToWriteOutputFileError,
)


def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--input-file", "-i", help="input file, a txt file containing a list of questions", required=True)
    parser.add_argument("--output-file", "-o", help="output file, a .json file containing a dictionary of question: answer", required=True)

    return parser.parse_args()


def read_questions(input_file):
    try:
        with open(input_file, "r") as fd:
            text = fd.read()
            return OrderedDict.fromkeys(text.strip().split("\n")).keys()
    except Exception:
        message = traceback.format_exc()
        raise InvalidQuestionInputFileError(input_file, message)


def write_question_answers(output_file, data):
    try:
        with open(output_file, "w") as fd:
            fd.write(json.dumps(data))
    except Exception:
        message = traceback.format_exc()
        raise FailedToWriteOutputFileError(output_file, message)


def collect_one_question(question):
    try:
        answer = get_simple_answer(question)
        print(f"{question}: {answer}")
    except Exception:
        traceback.print_exc()
        answer = ""
    return {question: answer}


def collect_data(input_file, output_file):
    questions = read_questions(input_file)
    data = {}

    counter = 0

    start_time = time.time()
    for question in questions:
        counter += 1
        print(f"COLLECTING {counter}/{len(questions)}")
        data.update(collect_one_question(question))
    collect_time = (time.time() - start_time) / 60  # minutes

    print(f"Collected answers for {len(questions)} questions in {collect_time} minutes")
    write_question_answers(output_file, data)


def main():
    args = parse_args()
    collect_data(args.input_file, args.output_file)


if __name__ == "__main__":
    main()
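The collector above can be driven from the command line or as a library call. A minimal sketch (``questions.txt`` and ``answers.json`` are hypothetical paths):

```python
# equivalent CLI: python -m people_also_ask.data_collector -i questions.txt -o answers.json
from people_also_ask.data_collector import collect_data

collect_data("questions.txt", "answers.json")
```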
--------------------------------------------------------------------------------
/people_also_ask/exceptions.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3
"""
Global related-questions exception and warning classes.
"""


GITHUB_LINK = "https://github.com/lagranges/people_also_ask"


class RelatedQuestionError(Exception):
    """Base Related-Questions exception class."""

    def __init__(self, error):
        self.error = error

    def __repr__(self):
        return (
            f'An unknown error occurred: {self.error}.'
            f' Please report it on {GITHUB_LINK}.'
        )


class FeaturedSnippetParserError(RelatedQuestionError):
    """
    Exception raised when the answer cannot be extracted from
    the search result page
    """

    def __init__(self, text):
        self.text = text

    def __repr__(self):
        return (
            f"Cannot parse the result page of '{self.text}'."
            f" This may be due to a format change of the result page."
            f' Please report it on {GITHUB_LINK}.'
        )


class RelatedQuestionParserError(RelatedQuestionError):
    """
    Exception raised when the related questions cannot be extracted
    from the search result page
    """

    def __init__(self, text):
        self.text = text

    def __repr__(self):
        return (
            f"Cannot parse the result page of '{self.text}'."
            f" This may be due to a format change of the result page."
            f' Please report it on {GITHUB_LINK}.'
        )


class GoogleSearchRequestFailedError(RelatedQuestionError):
    """Exception raised when a Google search request fails"""

    def __init__(self, url, keyword, message):
        self.url = url
        self.keyword = keyword
        self.message = message

    def __repr__(self):
        return (
            f"Failed to request {self.url}/{self.keyword}"
            f"\n{self.message}"
        )


class InvalidQuestionInputFileError(RelatedQuestionError):
    """Exception raised when the user passes an invalid question
    input file to the data collector"""

    def __init__(self, input_file, message):
        self.input_file = input_file
        self.message = message

    def __repr__(self):
        return (
            f"Invalid input file: {self.input_file}\n{self.message}"
        )


class FailedToWriteOutputFileError(RelatedQuestionError):
    """Exception raised when the program fails to write data to
    the output file of the data collector"""

    def __init__(self, output_file, message):
        self.output_file = output_file
        self.message = message

    def __repr__(self):
        return (
            f"Cannot write to {self.output_file}\n{self.message}"
        )


class RequestError(RelatedQuestionError):
    """Exception raised when an HTTP request fails"""

    def __init__(self, url, params, proxies, message):
        self.url = url
        self.params = params
        self.proxies = proxies
        self.message = message

    def __repr__(self):
        return (
            f"Failed to request {self.url}"
            f"\nParams = {self.params}"
            f"\nProxy = {self.proxies}"
            f"\nResp = {self.message}"
        )
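Every exception above derives from ``RelatedQuestionError``, so callers can catch the whole family with one handler. A minimal sketch:

```python
import people_also_ask
from people_also_ask.exceptions import RelatedQuestionError

try:
    answer = people_also_ask.get_answer("Why is coffee bad for you?")
except RelatedQuestionError as error:
    print(repr(error))  # each subclass formats its own report message
```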
--------------------------------------------------------------------------------
/people_also_ask/google.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python3
import sys
from collections import OrderedDict
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional, Generator

from people_also_ask.parser import (
    extract_related_questions,
    get_featured_snippet_parser,
)
from people_also_ask.exceptions import (
    RelatedQuestionParserError,
    FeaturedSnippetParserError
)
from people_also_ask.request import get
from people_also_ask.request.session import user_agent


URL = "https://www.google.com/search"


def search(keyword: str, url: str = URL) -> Optional[BeautifulSoup]:
    """return html parser of google search result"""
    browser = user_agent['browser']
    params = {"client": browser,
              "q": keyword,
              "sourceid": browser,
              "ie": "UTF-8",
              "oe": "UTF-8"}

    response = get(url, params=params)

    return BeautifulSoup(response.text, "html.parser")


def _get_related_questions(text: str, domain: str = "com") -> List[str]:
    """
    return a list of questions related to text.
    These questions come from the search result page of text

    :param str text: text to search
    :param str domain: Google domain to use, to improve searches in a native language
    """

    url = f"https://www.google.{domain}/search"
    document = search(text, url=url)
    if not document:
        return []
    try:
        return extract_related_questions(document)
    except Exception:
        raise RelatedQuestionParserError(text)


def generate_related_questions(text: str, domain: str = "com") -> Generator[str, None, None]:
    """
    generate the questions related to text;
    these questions are found recursively

    :param str text: text to search
    :param str domain: Google domain to use, to improve searches in a native language
    """
    questions = set(_get_related_questions(text, domain=domain))
    searched_text = {text}
    while questions:
        text = questions.pop()
        yield text
        searched_text.add(text)
        questions |= set(_get_related_questions(text, domain=domain))
        questions -= searched_text


def get_related_questions(text: str, max_nb_questions: Optional[int] = None, domain: str = "com"):
    """
    return a number of questions related to text.
    These questions are found recursively.

    :param str text: text to search
    :param int max_nb_questions: maximum number of questions to return
    :param str domain: Google domain to use, to improve searches in a native language
    """
    if max_nb_questions is None:
        return _get_related_questions(text, domain=domain)
    questions = []
    for question in generate_related_questions(text, domain=domain):
        if len(set(questions)) >= max_nb_questions:
            break
        questions.append(question)

    # deduplicate while preserving generation order
    return list(OrderedDict.fromkeys(questions))


def get_answer(question: str, domain: str = "com") -> Dict[str, Any]:
    """
    return a dictionary as the answer for a question.

    :param str question: asked question
    :param str domain: Google domain to use, to improve searches in a native language
    """

    url = f"https://www.google.{domain}/search"
    document = search(question, url=url)
    related_questions = extract_related_questions(document)
    featured_snippet = get_featured_snippet_parser(
        question, document)
    if not featured_snippet:
        res = dict(
            has_answer=False,
            question=question,
            related_questions=related_questions,
        )
    else:
        res = dict(
            has_answer=True,
            question=question,
            related_questions=related_questions,
        )
        try:
            res.update(featured_snippet.to_dict())
        except Exception:
            raise FeaturedSnippetParserError(question)
    return res


def generate_answer(text: str, domain: str = "com", enhance_search: bool = True) -> Generator[dict, None, None]:
    """
    generate answers to questions related to text

    :param str text: text to search
    :param str domain: Google domain to use, to improve searches in a native language
    :param bool enhance_search: retry the first search (up to 4 times)
        until an answer with a link is found
    """
    if enhance_search:
        tries = 0
        answer = {"link": False}

        while not answer.get("link") and tries < 4:
            answer = get_answer(text, domain)
            tries += 1
    else:
        answer = get_answer(text, domain)

    questions = set(answer["related_questions"])
    searched_text = {text}
    if answer["has_answer"]:
        yield answer
    while questions:
        text = questions.pop()
        answer = get_answer(text, domain)
        if answer["has_answer"]:
            yield answer
        searched_text.add(text)
        questions |= set(get_answer(text, domain)["related_questions"])
        questions -= searched_text


def get_simple_answer(question: str, depth: bool = False, domain: str = "com") -> str:
    """
    return a text as a summary answer for the question

    :param str question: asked question
    :param bool depth: return the answer to the first related question
        if no answer is found for the question
    :param str domain: Google domain to use, to improve searches in a native language
    """

    url = f"https://www.google.{domain}/search"
    document = search(question, url=url)
    featured_snippet = get_featured_snippet_parser(
        question, document)
    if featured_snippet:
        return featured_snippet.response
    if depth:
        related_questions = get_related_questions(question)
        if not related_questions:
            return ""
        return get_simple_answer(related_questions[0], domain=domain)
    return ""


if __name__ == "__main__":
    from pprint import pprint as print
    print(get_answer(sys.argv[1]))
--------------------------------------------------------------------------------
/people_also_ask/parser.py:
--------------------------------------------------------------------------------
1 | #! 
/usr/bin/env python3 2 | from bs4.element import Tag 3 | from bs4 import BeautifulSoup 4 | from operator import attrgetter 5 | from typing import List, Optional 6 | from people_also_ask.tools import itemize, tabulate, remove_redundant 7 | 8 | 9 | FEATURED_SNIPPET_ATTRIBUTES = [ 10 | "response", "heading", "title", "link", "displayed_link", 11 | "snippet_str", "snippet_data", "date", "snippet_data", 12 | "snippet_type", "snippet_str_body", "raw_text" 13 | ] 14 | 15 | 16 | def extract_related_questions(document: BeautifulSoup) -> List[str]: 17 | related_questions = document.find_all("div", class_="related-question-pair") 18 | if not related_questions: 19 | return [] 20 | extract_question = lambda a: a.text.split('Search for:')[0] 21 | return list(map(extract_question, related_questions)) 22 | 23 | 24 | def is_ol_but_not_a_menu(tag): 25 | return ( 26 | tag.name == "ol" 27 | and ( 28 | not tag.has_attr("role") 29 | or (tag.has_attr("role") and tag["role"] != "menu") 30 | ) 31 | ) 32 | 33 | 34 | def get_tag_heading(tag): 35 | return ( 36 | tag.find("div", {"role": "heading", "aria-level": "3"}) 37 | or tag.find("div", {"role": "heading"}) 38 | ) 39 | 40 | 41 | def has_youtube_link(tag): 42 | youtube_links = tag.findAll( 43 | lambda x: x.name == "a" and "youtube" in x.get("href", "") 44 | ) 45 | return bool(youtube_links) 46 | 47 | 48 | def get_raw_text(tag): 49 | return "\n".join(remove_redundant(tag.strings)) 50 | 51 | 52 | def get_span_text(tag): 53 | return "\n".join( 54 | remove_redundant( 55 | [e.text for e in tag.findAll("span") if e.text] 56 | ) 57 | ) 58 | 59 | 60 | class FeaturedSnippetParser(object): 61 | 62 | def __init__(self, text: str, tag: Tag): 63 | self.text = text 64 | self.tag = tag 65 | 66 | def __getattr__(self, attr): 67 | if attr in FEATURED_SNIPPET_ATTRIBUTES: 68 | return None 69 | raise AttributeError(f'{self.__class__.__name__}.{attr} is invalid.') 70 | 71 | @property 72 | def raw_text(self): 73 | return get_raw_text(self.tag) 74 | 75 | def to_dict(self): 76 | return { 77 | attr: getattr(self, attr) for attr in FEATURED_SNIPPET_ATTRIBUTES 78 | } 79 | 80 | 81 | class SimpleFeaturedSnippetParser(FeaturedSnippetParser): 82 | 83 | @classmethod 84 | def get_instance(self, text, tag): 85 | if tag.table is not None: 86 | return TableFeaturedSnippetParser(text, tag) 87 | if tag.findAll(is_ol_but_not_a_menu): 88 | return OrderedFeaturedSnippetParser(text, tag) 89 | if tag.ul is not None: 90 | return UnorderedFeaturedSnippetParser(text, tag) 91 | if get_tag_heading(tag): 92 | return DefinitionFeaturedSnippetParser(text, tag) 93 | if has_youtube_link(tag): 94 | return YoutubeFeaturedSnippetParser(text, tag) 95 | 96 | @property 97 | def tag_link(self): 98 | if hasattr(self, "_tag_link"): 99 | return self._tag_link 100 | self._tag_link = self.tag.find( 101 | lambda tag: ( 102 | tag.name == "a" 103 | and tag.has_attr("href") 104 | and tag["href"].startswith("http") 105 | and (tag.h3 or tag.h2) is not None 106 | ) 107 | ) 108 | return self._tag_link 109 | 110 | @property 111 | def link(self): 112 | return self.tag_link["href"] if self.tag_link else None 113 | 114 | @property 115 | def displayed_link(self): 116 | return self.tag.cite.text if self.tag.cite else None 117 | 118 | @property 119 | def title(self): 120 | if self.tag_link is None: 121 | return None 122 | tag_title = self.tag_link.h3 or self.tag_link.h2 123 | return tag_title.text 124 | 125 | @property 126 | def heading(self): 127 | tag_heading = get_tag_heading(self.tag) 128 | return tag_heading.text 129 | 130 | @property 
131 | def snippet_str(self): 132 | lines = [] 133 | for field in ( 134 | "heading", "snippet_str_body", 135 | "displayed_link", "link", "title" 136 | ): 137 | if getattr(self, field): 138 | lines.append(getattr(self, field)) 139 | return "\n".join(lines) 140 | 141 | @property 142 | def date(self): 143 | return None 144 | 145 | @property 146 | def snippet_data(self): 147 | return None 148 | 149 | @property 150 | def snippet_type(self): 151 | return "Unknown Featured Snippet" 152 | 153 | @property 154 | def snippet_str_body(self): 155 | return "" 156 | 157 | 158 | class TableFeaturedSnippetParser(SimpleFeaturedSnippetParser): 159 | """Example: world university rankings 2019""" 160 | 161 | @property 162 | def snippet_type(self): 163 | return "Table Featured Snippet" 164 | 165 | @property 166 | def snippet_str_body(self): 167 | header = self.snippet_data["columns"] 168 | table = self.snippet_data["values"] 169 | return tabulate(header=header, table=table) 170 | 171 | @property 172 | def response(self): 173 | return self.snippet_str_body 174 | 175 | @property 176 | def snippet_data(self): 177 | table_tag = self.tag.find("table") 178 | tr_tags = table_tag.findAll("tr") 179 | if tr_tags[0].find("th"): 180 | columns = [ 181 | th_tag.text for th_tag in tr_tags[0].findAll("th") 182 | ] 183 | body_table_tags = tr_tags[1:] 184 | else: 185 | columns = None 186 | body_table_tags = tr_tags 187 | values = [ 188 | [td_tag.text for td_tag in tr_tag.findAll("td")] 189 | for tr_tag in body_table_tags 190 | ] 191 | if columns is None: 192 | columns = list(range(len(values[0]))) 193 | return { 194 | "columns": columns, 195 | "values": values 196 | } 197 | 198 | 199 | class OrderedFeaturedSnippetParser(SimpleFeaturedSnippetParser): 200 | """Example: top grossing movies""" 201 | 202 | @property 203 | def snippet_type(self): 204 | return "Ordered Featured Snippet" 205 | 206 | @property 207 | def response(self): 208 | return self.snippet_str_body 209 | 210 | @property 211 | def snippet_str_body(self): 212 | return "\n".join(itemize(self.snippet_data)) 213 | 214 | @property 215 | def snippet_data(self): 216 | ol_tags = self.tag.find("ol") 217 | li_tags = ol_tags.findAll("li") 218 | return [tag.text for tag in li_tags] 219 | 220 | 221 | class UnorderedFeaturedSnippetParser(SimpleFeaturedSnippetParser): 222 | """ What are 3 basic programming languages? 
""" 223 | 224 | @property 225 | def snippet_type(self): 226 | return "Unordered Featured Snippet" 227 | 228 | @property 229 | def snippet_str_body(self): 230 | return "\n".join(itemize(self.snippet_data)) 231 | 232 | @property 233 | def response(self): 234 | return self.snippet_str_body 235 | 236 | @property 237 | def snippet_data(self): 238 | ul_tag = self.tag.find("ul") 239 | li_tags = ul_tag.findAll("li") 240 | return [tag.text for tag in li_tags] 241 | 242 | 243 | class DefinitionFeaturedSnippetParser(SimpleFeaturedSnippetParser): 244 | """Why was ho chi minh a hero""" 245 | 246 | @property 247 | def snippet_type(self): 248 | return "Definition Featured Snippet" 249 | 250 | @property 251 | def response(self): 252 | return self.heading 253 | 254 | 255 | class YoutubeFeaturedSnippetParser(SimpleFeaturedSnippetParser): 256 | """Ex: cheetah vs lion""" 257 | 258 | @property 259 | def snippet_type(self): 260 | return "Youtube Featured Snippet" 261 | 262 | @property 263 | def heading(self): 264 | return "" 265 | 266 | @property 267 | def response(self): 268 | return self.link 269 | 270 | 271 | class MultipleCardsFeaturedSnippetTag(FeaturedSnippetParser): 272 | """How to make a cold brew coffee""" 273 | 274 | @property 275 | def heading(self): 276 | tag_heading = ( 277 | self.tag.find("h3", {"role": "heading"}) 278 | or self.tag.find("h2", {"role": "heading"}) 279 | ) 280 | return tag_heading.text 281 | 282 | @property 283 | def snippet_type(self): 284 | return "Multiple Cards Featured Snippet Tag" 285 | 286 | def parse_card(self, tag_card): 287 | return { 288 | "heading": tag_card.find("div", {"role": "heading"}).text, 289 | "title": tag_card.cite.text, 290 | "link": tag_card.find('a', attrs={'data-jsarwt': True})['href'], 291 | "raw_text": get_raw_text(tag_card), 292 | } 293 | 294 | def str_card(self, card_data): 295 | lines = [card_data["raw_text"]] 296 | lines.append(f"Link: {card_data['link']}") 297 | return "\n".join(lines) 298 | 299 | @property 300 | def snippet_str(self): 301 | if not self.snippet_data: 302 | return "" 303 | return "\n-------------\n".join(map(self.str_card, self.snippet_data)) 304 | 305 | @property 306 | def snippet_data(self): 307 | return list(map(self.parse_card, self.tag.findAll("g-inner-card"))) 308 | 309 | @property 310 | def response(self): 311 | return self.snippet_str 312 | 313 | 314 | class SingleCardFeaturedSnippetParser(FeaturedSnippetParser): 315 | """What time is it""" 316 | 317 | @property 318 | def snippet_type(self): 319 | return "Single Card FeaturedSnippet" 320 | 321 | @property 322 | def heading(self): 323 | tag_heading = get_tag_heading(self.tag) 324 | return get_raw_text(tag_heading) 325 | 326 | @property 327 | def response(self): 328 | heading = self.heading 329 | if heading: 330 | return heading 331 | return self.raw_text 332 | 333 | @property 334 | def raw_text(self): 335 | return get_span_text(self.tag) 336 | 337 | 338 | class WholePageTabContainer(FeaturedSnippetParser): 339 | """Gangnam Style""" 340 | 341 | @property 342 | def snippet_type(self): 343 | return "Whole Page Tab Container" 344 | 345 | @property 346 | def tag_link(self): 347 | if hasattr(self, "_tag_link"): 348 | return self._tag_link 349 | self._tag_link = self.tag.find( 350 | lambda tag: ( 351 | tag.name == "a" 352 | and tag.has_attr("href") 353 | and tag["href"].startswith("http") 354 | and (tag.h3 or tag.h2) is not None 355 | ) 356 | ) 357 | return self._tag_link 358 | 359 | @property 360 | def link(self): 361 | return self.tag_link["href"] if self.tag_link else None 362 | 
363 | @property 364 | def displayed_link(self): 365 | return self.tag.cite.text if self.tag.cite else None 366 | 367 | @property 368 | def title(self): 369 | if self.tag_link is None: 370 | return None 371 | tag_title = self.tag_link.h3 or self.tag_link.h2 372 | return tag_title.text 373 | 374 | @property 375 | def response(self): 376 | return self.raw_text 377 | 378 | @property 379 | def raw_text(self): 380 | return get_span_text(self.tag) 381 | 382 | 383 | def is_simple_featured_snippet_tag(tag): 384 | class_tuple = tuple(tag.get("class", "")) 385 | is_xpdopen = (tag.name == "div" and class_tuple == ("xpdopen",)) 386 | if not is_xpdopen: 387 | return False 388 | is_xpdopen_of_related_questions = ( 389 | tag.h2 is not None and tag.h2.text == "People also ask" 390 | ) 391 | return not is_xpdopen_of_related_questions 392 | 393 | 394 | def is_single_card_featured_snippet_tag(tag): 395 | is_card_section = ( 396 | tag.name == "div" and "card-section" in tag.get("class", []) 397 | ) 398 | if not is_card_section: 399 | return False 400 | is_card_section_of_tip = tag.text.startswith("Tip:") 401 | return not is_card_section_of_tip 402 | 403 | 404 | def is_multiple_card_snippet_tag(tag): 405 | return (tag.name == "g-section-with-header") 406 | 407 | 408 | def is_whole_page_tabs_container(tag): 409 | return (tag.get("id") == "wp-tabs-container") 410 | 411 | 412 | def is_web_results(tag): 413 | return (tag.name == "h2" and tag.text == "Web results") 414 | 415 | 416 | def get_featured_snippet_tag(document): 417 | 418 | def lookup_featured_snippet_tag(tag): 419 | return ( 420 | is_simple_featured_snippet_tag(tag) 421 | or is_single_card_featured_snippet_tag(tag) 422 | or is_multiple_card_snippet_tag(tag) 423 | or is_web_results(tag) 424 | ) 425 | whole_page_tag = document.find(is_whole_page_tabs_container) 426 | tag = document.find(lookup_featured_snippet_tag) 427 | if tag and is_simple_featured_snippet_tag(tag): 428 | return tag 429 | if whole_page_tag: 430 | return whole_page_tag 431 | if not tag or tag.name == "h2": 432 | return None 433 | return tag 434 | 435 | 436 | def get_featured_snippet_parser(question, document: BeautifulSoup): 437 | tag = get_featured_snippet_tag(document) 438 | if tag is None: 439 | return 440 | if is_simple_featured_snippet_tag(tag): 441 | return SimpleFeaturedSnippetParser.get_instance(question, tag) 442 | if is_multiple_card_snippet_tag(tag): 443 | return MultipleCardsFeaturedSnippetTag(question, tag) 444 | if is_single_card_featured_snippet_tag(tag): 445 | return SingleCardFeaturedSnippetParser(question, tag) 446 | if is_whole_page_tabs_container(tag): 447 | return WholePageTabContainer(question, tag) 448 | -------------------------------------------------------------------------------- /people_also_ask/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lagranges/people_also_ask/4e14287125ff3593b0814db718af7b97afe73d89/people_also_ask/plugins/__init__.py -------------------------------------------------------------------------------- /people_also_ask/plugins/article_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .article_generators import generate_article 2 | 3 | 4 | __all__ = [ 5 | "generate_article" 6 | ] -------------------------------------------------------------------------------- /people_also_ask/plugins/article_generator/article_generators.py: 
--------------------------------------------------------------------------------
import people_also_ask as paa
from pathlib import Path
from jinja2 import Environment, FileSystemLoader


NB_QUESTION = 10


def generate_article(title: str):
    questions = paa.get_related_questions(title, max_nb_questions=NB_QUESTION)

    introduction = paa.get_simple_answer(title)

    contents = {}
    for question in questions:
        contents[question] = paa.get_simple_answer(question)

    # resolve the template relative to this file, not the working directory
    cur_dir = Path(__file__).parent
    file_loader = FileSystemLoader(cur_dir / "templates")
    env = Environment(loader=file_loader)

    template_path = cur_dir / "templates" / "base.html"
    template = env.from_string(template_path.read_text())

    output = template.render(
        title=title,
        introduction=introduction,
        contents=contents,
        get_question_id=lambda x: x.replace(" ", "_")
    )
    with open("article.html", "w") as fd:
        fd.write(output)
--------------------------------------------------------------------------------
/people_also_ask/plugins/article_generator/templates/base.html:
--------------------------------------------------------------------------------
<html>
<head>
<title>{{ title }}</title>
</head>
<body>
<h1>{{ title }}</h1>
{{ introduction }}
<!-- table-of-contents markup lost in extraction -->
{% for question, answer in contents.items() -%}
<div>
<h2 id="{{ get_question_id(question) }}">{{ question }}</h2>
<div>
{{ answer }}
</div>
</div>
{% endfor %}
</body>
</html>
--------------------------------------------------------------------------------
/people_also_ask/request/__init__.py:
--------------------------------------------------------------------------------
from .session import get, user_agent


__all__ = ["get", "user_agent"]
--------------------------------------------------------------------------------
/people_also_ask/request/session.py:
--------------------------------------------------------------------------------
import os
import logging
import requests
import traceback
from fake_useragent import UserAgent

from people_also_ask.tools import retryable
from itertools import cycle
from typing import Optional
from people_also_ask.tools import CallingSemaphore
from people_also_ask.exceptions import RequestError

from requests import Session as _Session


SESSION = _Session()
# environment overrides arrive as strings, so cast them before use
NB_TIMES_RETRY = int(os.environ.get(
    "RELATED_QUESTION_NB_TIMES_RETRY", 3
))
NB_REQUESTS_LIMIT = int(os.environ.get(
    "RELATED_QUESTION_NB_REQUESTS_LIMIT", 25
))
NB_REQUESTS_DURATION_LIMIT = int(os.environ.get(
    "RELATED_QUESTION_NB_REQUESTS_DURATION_LIMIT", 60
))  # seconds
logging.basicConfig()
semaphore = CallingSemaphore(
    NB_REQUESTS_LIMIT, NB_REQUESTS_DURATION_LIMIT
)

ua = UserAgent()
user_agent = ua.getRandom

HEADERS = {
    'User-Agent': user_agent['useragent']
}

logger = logging.getLogger(__name__)


class ProxyGenerator:

    def __init__(self, proxies: Optional[tuple]):
        self.proxies = proxies

    @property
    def iter_proxy(self):
        if not self.proxies:
            raise ValueError("No proxy found")
        if getattr(self, "_iter_proxy", None) is None:
            self._iter_proxy = cycle(self.proxies)
        return self._iter_proxy

    def get(self) -> dict:
        if not self.proxies:
            return {}
        proxy = next(self.iter_proxy)
        if not proxy.startswith("https"):
            proxy = f"http://{proxy}"
        return {
            "https": proxy
        }


def _load_proxies() -> Optional[tuple]:
    filepath = os.getenv("PAA_PROXY_FILE")
    if filepath:
        with open(filepath, "r") as fd:
            proxies = [e.strip() for e in fd.read().splitlines() if e.strip()]
    else:
        proxies = None
    return proxies


def set_proxies(proxies: Optional[tuple]) -> None:
    global PROXY_GENERATORS
    PROXY_GENERATORS = ProxyGenerator(proxies=proxies)


set_proxies(proxies=_load_proxies())


@retryable(NB_TIMES_RETRY)
def get(url: str, params: dict) -> requests.Response:
    proxies = PROXY_GENERATORS.get()
    try:
        with semaphore:
            response = SESSION.get(
                url,
                params=params,
                headers=HEADERS,
                proxies=proxies,
            )
    except Exception:
        raise RequestError(
            url, params, proxies, traceback.format_exc()
        )
    if response.status_code != 200:
        raise RequestError(
            url, params, proxies, response.text
        )
    return response
--------------------------------------------------------------------------------
/people_also_ask/tests/test_google.py:
--------------------------------------------------------------------------------
import pytest
from people_also_ask import google


config = dict(
    test_get_answer=dict(
        text="Who is Ho Chi Minh?"
    ),
    test_get_related_questions=dict(
        text="where is france"
    )
)


def test_get_answer():
    answer = google.get_answer(config["test_get_answer"]["text"])
    assert "response" in answer


def test_get_related_questions():
    related_questions = google.get_related_questions(
        config["test_get_related_questions"]["text"]
    )
    assert len(related_questions) > 0
--------------------------------------------------------------------------------
/people_also_ask/tests/test_parser.py:
--------------------------------------------------------------------------------
import os
import unittest
from bs4 import BeautifulSoup
from people_also_ask.parser import (
    get_featured_snippet_parser,
    WholePageTabContainer,
    TableFeaturedSnippetParser,
    YoutubeFeaturedSnippetParser,
    OrderedFeaturedSnippetParser,
    UnorderedFeaturedSnippetParser,
    DefinitionFeaturedSnippetParser,
    MultipleCardsFeaturedSnippetTag,
    SingleCardFeaturedSnippetParser,
)


HTMLS_PARSER = {
    "cheetah_vs_lion.html": YoutubeFeaturedSnippetParser,
    "gangnam_style.html": WholePageTabContainer,
    "how_to_make_a_cold_brew_coffee.html": MultipleCardsFeaturedSnippetTag,
    "the_10_highest-grossing_movies_of_all_time.html": (
        OrderedFeaturedSnippetParser
    ),
    "what_are_3_basic_programming_languages.html": (
        UnorderedFeaturedSnippetParser
    ),
    "what_time_is_it.html": SingleCardFeaturedSnippetParser,
    "why_was_ho_chi_minh_a_hero.html": DefinitionFeaturedSnippetParser,
    "world_university_rankings_2019.html": TableFeaturedSnippetParser
}
FIXTURES_DIR = os.path.join(
    os.path.dirname(__file__),
    "fixtures"
)


class TestParser(unittest.TestCase):

    def test_parsers(self):
        for html_filename, Parser in HTMLS_PARSER.items():
            html_file = os.path.join(FIXTURES_DIR, html_filename)
            with open(html_file, "r") as fd:
                document = BeautifulSoup(fd.read(), "html.parser")
            question, _ = html_filename.split(".")
            question = question.replace("_", " ")
            parser = get_featured_snippet_parser(question, document)
            self.assertIsInstance(parser, Parser)
            self.assertIsNotNone(parser.response)


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/people_also_ask/tools.py:
--------------------------------------------------------------------------------
1 | #! 
/usr/bin/env python3 2 | import time 3 | import random 4 | import traceback 5 | from contextlib import ContextDecorator 6 | from typing import Callable, List 7 | from people_also_ask.exceptions import FeaturedSnippetParserError 8 | 9 | 10 | def raise_featuredsnippetparsererror_if_failed(func): 11 | def wrapper(self: "SimpleFeaturedSnippetParser", *args, **kwargs): 12 | try: 13 | return func(self, *args, **kwargs) 14 | except Exception: 15 | traceback.print_exc() 16 | raise FeaturedSnippetParserError(self.text) 17 | return wrapper 18 | 19 | 20 | def retryable(nb_times_retry): 21 | 22 | def decorator(func: Callable): 23 | 24 | def wrapper(*args, **kwargs): 25 | for _ in range(nb_times_retry-1): 26 | try: 27 | return func(*args, **kwargs) 28 | except Exception: 29 | pass 30 | return func(*args, **kwargs) 31 | 32 | return wrapper 33 | return decorator 34 | 35 | 36 | def itemize(lines: List[str]) -> List[str]: 37 | return ["\t- " + line for line in lines] 38 | 39 | 40 | def tabulate(header, table): 41 | length_columns = [] 42 | if header: 43 | table = [header] + table 44 | length_columns = [len(str(e)) for e in header] 45 | for row in table: 46 | current_lengh = [len(str(e)) for e in row] 47 | length_columns = [ 48 | max(i, j) for i, j in zip(length_columns, current_lengh) 49 | ] 50 | tabulated_rows = [] 51 | for row in table: 52 | tabulated_rows.append("\t".join([ 53 | str(e).rjust(length, " ") for e, length in zip(row, length_columns) 54 | ])) 55 | if header: 56 | tabulated_rows.insert( 57 | 1, 58 | "\t".join(["-"*length for length in length_columns]) 59 | ) 60 | return "\n".join(tabulated_rows) 61 | 62 | 63 | def remove_redundant(elements): return list(dict.fromkeys(elements)) 64 | 65 | 66 | class CallingSemaphore(ContextDecorator): 67 | 68 | def __init__(self, nb_call_times_limit, expired_time): 69 | self.nb_call_times_limit = nb_call_times_limit 70 | self.expired_time = expired_time 71 | self.called_timestamps = list() 72 | 73 | def __enter__(self): 74 | while len(self.called_timestamps) > self.nb_call_times_limit: 75 | now = time.time() 76 | self.called_timestamps = list(filter( 77 | lambda x: now - x < self.expired_time, 78 | self.called_timestamps 79 | )) 80 | time.sleep(random.random() * 2) 81 | self.called_timestamps.append(time.time()) 82 | 83 | def __exit__(self, *exc): 84 | pass 85 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import os 4 | import setuptools 5 | 6 | 7 | def local_file(file): 8 | return codecs.open( 9 | os.path.join(os.path.dirname(__file__), file), 'r', 'utf-8' 10 | ) 11 | 12 | setuptools.setup( 13 | name="people_also_ask", 14 | version="1.1.0", 15 | author="LE Van Tuan", 16 | author_email="leavantuan2312@gmail.com", 17 | packages=setuptools.find_packages(), 18 | long_description=local_file('README.md').read(), 19 | long_description_content_type="text/markdown", 20 | url="https://github.com/lagranges/people_also_ask", 21 | classifiers=[ 22 | "Topic :: Software Development :: Libraries :: Python Modules", 23 | "Topic :: Utilities", 24 | "Development Status :: 5 - Production/Stable", 25 | "Operating System :: MacOS", 26 | "Operating System :: Microsoft", 27 | "Programming Language :: Python :: 3.6", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | "Typing :: Typed", 31 | ], 32 | install_requires=[ 33 | "beautifulsoup4", 34 | "requests", 35 | 
"jinja2", 36 | "fake-useragent" 37 | ], 38 | python_requires=">=3.6" 39 | ) 40 | --------------------------------------------------------------------------------