├── .DS_Store ├── golden ├── __pycache__ │ ├── golden.cpython-36.pyc │ └── tools.cpython-36.pyc └── golden.py ├── .gitignore ├── LICENSE ├── setup.py └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrosdesigns/golden/HEAD/.DS_Store -------------------------------------------------------------------------------- /golden/__pycache__/golden.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrosdesigns/golden/HEAD/golden/__pycache__/golden.cpython-36.pyc -------------------------------------------------------------------------------- /golden/__pycache__/tools.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/terrosdesigns/golden/HEAD/golden/__pycache__/tools.cpython-36.pyc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Packages 2 | *.egg 3 | *.egg-info 4 | dist 5 | build 6 | eggs 7 | parts 8 | bin 9 | var 10 | sdist 11 | develop-eggs 12 | .installed.cfg 13 | lib 14 | lib64 15 | __pycache__ 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 terrosdesigns 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | 
The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3.4 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Terros 2019 -- https://terrosdesigns.com 5 | """ 6 | 7 | import sys 8 | import os 9 | 10 | 11 | try: 12 | from setuptools import setup 13 | except ImportError: 14 | from distutils.core import setup 15 | 16 | 17 | packages = [ 18 | 'golden', 19 | ] 20 | 21 | 22 | # if sys.argv[-1] == 'publish': 23 | # # PYPI now uses twine for package management. 24 | # # For this to work you must first `$ pip3 install twine` 25 | # os.system('python3 setup.py sdist bdist_wheel') 26 | # os.system('twine upload dist/*') 27 | # sys.exit() 28 | 29 | 30 | # if sys.version_info[0] == 2 and sys.argv[-1] not in ['publish', 'upload']: 31 | # sys.exit('WARNING! You are attempting to install golden\'s ' 32 | # 'python3 repository on python2. 
PLEASE RUN ' 33 | # '`$ pip3 install golden_data` for python3 or ' 34 | # '`$ pip install golden_data` for python2') 35 | 36 | 37 | with open("README.md", "r") as fh: 38 | long_description = fh.read() 39 | 40 | 41 | setup( 42 | name='golden', 43 | version='0.1.1', 44 | description='Python library to extract data from tech companies and topics.', 45 | long_description=long_description, 46 | author='Terros', 47 | author_email='terrosdesigns@gmail.com', 48 | url='https://github.com/terrosdesigns/golden/', 49 | packages=packages, 50 | include_package_data=True, 51 | license='MIT', 52 | zip_safe=False, 53 | classifiers=[ 54 | 'Programming Language :: Python :: 3', 55 | 'Natural Language :: English', 56 | 'Intended Audience :: Developers', 57 | ], 58 | ) 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # golden 2 | 3 | Python library built upon [requests](https://github.com/kennethreitz/requests) to access and parse data from [Golden](https://golden.com)'s collection. 4 | 5 | [Golden](https://golden.com) is a Wikipedia alternative that focuses on emerging tech, startups and ideas. 6 | 7 | Extract company summaries, timeline of recent events, key people and more ! 8 | 9 | ## Usage 10 | 11 | ```python 12 | >>> from golden import golden 13 | >>> search = golden.download('Apple') 14 | 15 | >>> golden.title(search) 16 | # 'Apple (company)' 17 | 18 | >>> golden.summary(search) 19 | # Apple Inc. is a public company designing and selling personal computers, smartphones, consumer electronics, and software. Its headquarters is located in Cupertino, California and it was founded in 1976.Apple Inc. is a California-based electronics company with a focus producing on consumer devices. 20 | 21 | >>> golden.content(search, sentences=3) 22 | #'Apple Inc. is a California-based electronics company with a focus producing on consumer devices. 
ProductsProducts and devices produced by Apple Inc. include iPad, iPhone, AirPods,Apple Watch, HomePod, and MacBook. Each product can give users access to one or more forms of media or technology including television, music, data storage, and computer applications.The products run on the Mac operating system, which has special features thare not available on non-Mac systems. Furthermore, the devices use continuity, which allows for all the devices owned by a user to beconnected.The company also produces software as a service and media options.' 23 | 24 | >>> events = golden.timeline(search, events=1) 25 | >>> for event in events: 26 | >>> print(event["date"], " : ", event["subtitle"], "\n", event["content"]) 27 | # March 25, 2019 : Apple Card 28 | # On March 25, 2019 during their keynote event Apple, in partnership with Goldman Sachs and Mastercard, announced Apple Card. A credit card by Apple with no fees—no annual, cash-advance, over-the-limit, international, or late fees— thats gives Apple users the ability to sign up for Apple Card using the Apple Wallet application. 
29 | 30 | >>> other_search = golden.download("jetpack aviation") 31 | >>> people = golden.people(other_search) 32 | >>> for p in people: 33 | >>> print(p["name"], p["role"]) 34 | # Boris Jarry Employee 35 | # Daniel Schwarzbaum Employee 36 | # David Mayman Founder, CEO, Test Pilot, Project Manager 37 | # Nelson Tyler Founder, Principle Designer 38 | # Sergey Samchik Employee 39 | 40 | >>> ceo = golden.people(other_search, "CEO") 41 | >>> print(ceo) 42 | # David Mayman : Founder, CEO, Test Pilot, Project Manager 43 | ``` 44 | 45 | ## Installation 46 | To install golden, run the following command: 47 | ``` 48 | $ pip install golden 49 | ``` 50 | 51 | ## TO DO 52 | 53 | * Add new queries: 54 | * Company URL 55 | * Products 56 | * Country 57 | * Improve suggestions while querying 58 | * Hide HTML output when downloading a new page 59 | * Improve content query output: 60 | * The `sentences` query parameter doesn't always work 61 | 62 | ## LICENCE 63 | Authored and maintained by [Terros Designs](https://terrosdesigns.com). 64 | 65 | Contact me at terrosdesigns@gmail.com with any questions, suggestions, or comments! 66 | 67 | MIT licensed. See the [LICENSE](https://github.com/terrosdesigns/golden/blob/master/LICENSE) file for full details. 
"""Scrape topic pages from https://golden.com.

``download`` fetches the first search hit for a query and returns it as a
``BeautifulSoup`` tree.  The other public functions (``title``, ``summary``,
``content``, ``timeline`` and ``people``) read individual fields out of
that tree.
"""

import sys
from urllib.parse import quote

import nltk
import requests
from bs4 import BeautifulSoup
from nltk import sent_tokenize

_BASE_URL = 'https://golden.com'
_SEARCH_URL = 'https://golden.com/search/'
_WIKI_URL = 'https://golden.com/wiki/'

# Position of the first search hit among the <a> tags of a search page.
# NOTE(review): presumably the earlier anchors are site navigation --
# confirm against the live markup.
_FIRST_RESULT_LINK_INDEX = 3

_NO_CONTENT = "No content to display"
_NO_PEOPLE = "No people to display for this query."


def extractPageContent(url):
    """Fetch *url* and return its HTML parsed with ``html.parser``.

    Raises:
        requests.RequestException: if the request fails.  The error is
            reported on stdout and then re-raised, because there is no page
            to parse.
    """
    try:
        page = requests.get(url)
    except requests.RequestException:
        print("Error while trying to get page")
        raise
    return BeautifulSoup(page.text, 'html.parser')


def getFirstSearchResult(user_query):
    """Return the absolute URL of the first search result for *user_query*.

    Args:
        user_query: free-text search string.  It is percent-encoded before
            being appended to the search URL, so spaces, ``?``, ``#`` and
            ``/`` cannot corrupt the request path.

    Raises:
        IndexError: if the search page has no usable link at the expected
            position.
        requests.RequestException: if the search page cannot be fetched.
    """
    search_url = _SEARCH_URL + quote(user_query, safe='')
    soup = extractPageContent(search_url)
    hrefs = [link.get('href') for link in soup.find_all('a')]
    if len(hrefs) <= _FIRST_RESULT_LINK_INDEX or not hrefs[_FIRST_RESULT_LINK_INDEX]:
        raise IndexError(
            "No search result found for query: {!r}".format(user_query))
    return _BASE_URL + hrefs[_FIRST_RESULT_LINK_INDEX]


def getQueryUrl(query=None):
    """Return the wiki URL for the topic slug *query*.

    Args:
        query: path component appended to ``https://golden.com/wiki/``.
            It defaults to ``None`` only so that the historical zero-argument
            call remains a valid call expression.

    Raises:
        ValueError: if *query* is empty or omitted.
    """
    if not query:
        raise ValueError("getQueryUrl() requires a topic slug")
    return _WIKI_URL + query


def download(user_query):
    """Download the first search result for *user_query*.

    Returns:
        The parsed page as a ``BeautifulSoup`` tree, or the string
        ``"Error while trying to get page"`` when fetching the result page
        fails.

    Raises:
        IndexError, requests.RequestException: propagated from
            ``getFirstSearchResult`` when the search itself fails.
    """
    search_url = getFirstSearchResult(user_query)
    try:
        page = requests.get(search_url)
    except requests.RequestException:
        return "Error while trying to get page"
    return BeautifulSoup(page.text, 'html.parser')


def title(soup):
    """Return the text of the topic headline.

    Raises:
        AttributeError: if the page has no headline element.
    """
    headline = soup.find(
        "h1", {"class": "TopicDetail__header__headline--inner"})
    return headline.get_text()


def summary(soup):
    """Return the text of the topic abstract.

    Raises:
        AttributeError: if the page has no abstract element.
    """
    abstract = soup.find("div", {"class": "TopicDetail__abstract"})
    return abstract.get_text()


def content(soup, sentences=1):
    """Return the first *sentences* sentences of the topic overview.

    The text of every ``Editor__text`` paragraph in the overview block is
    split into sentences and the first *sentences* of them are joined with
    single spaces.

    Returns:
        The extracted text; ``"No content to display"`` when the overview
        has no paragraphs; ``None`` (after printing a notice) when the
        overview block is missing; ``None`` when the selection is empty.
    """
    body = soup.find("div", {"class": "TopicDetail__body"})
    overview = None
    if body is not None:
        overview = body.find("div", {"class": "TopicDetail__overview__block"})
    editor = None
    if overview is not None:
        editor = overview.find("div", {"class": "Editor--article"})
    if editor is None:
        print("No content to display.")
        return None

    paragraphs = [
        p.get_text() for p in editor.find_all("p", {"class": "Editor__text"})
    ]
    if not paragraphs:
        return _NO_CONTENT

    try:
        # Tokenize per paragraph so that a paragraph without final
        # punctuation is not glued onto the following one.
        units = [s for text in paragraphs for s in sent_tokenize(text)]
    except LookupError:
        # The NLTK sentence model is not installed; fall back to treating
        # each paragraph as one unit instead of failing.
        units = paragraphs

    description = " ".join(units[0:sentences])
    if description:
        return description
    return None


def timeline(soup, events=0):
    """Return the leading events of the topic timeline.

    Args:
        events: zero-based index of the last event wanted, so up to
            ``events + 1`` events are returned (the default ``0`` yields the
            first event).

    Returns:
        A list of dicts with the keys ``"date"``, ``"subtitle"`` and
        ``"content"``.  When fewer events exist than requested, the number
        actually loaded is printed and the shorter list is returned.
    """
    blocks = soup.find_all("div", {"class": "EntityTimeline"})
    if blocks:
        first_block = blocks[0]
        dates = first_block.find_all("div", {"class": "TimelineEvent__date"})
        subtitles = first_block.find_all("h3")
        bodies = first_block.find_all("p")
    else:
        dates = subtitles = bodies = []

    # The three lists are read in parallel by index, so the shortest one
    # bounds how many complete events can be built.
    available = min(len(dates), len(subtitles), len(bodies))
    wanted = events + 1
    count = max(0, min(available, wanted))

    events_list = [
        {
            "date": dates[i].get_text(),
            "subtitle": subtitles[i].get_text(),
            "content": bodies[i].get_text(),
        }
        for i in range(count)
    ]
    if available < wanted:
        print(available, "events loaded.")
    return events_list


def people(soup, position=''):
    """Return the people listed on the topic page.

    Args:
        position: optional substring; when given, only people whose role
            contains it are returned.

    Returns:
        A list of dicts with the keys ``"name"``, ``"role"`` and
        ``"golden_related"``, or the string
        ``"No people to display for this query."`` when nothing matches.
    """
    table = soup.find('div', {"class": "table"})
    if table is None:
        return _NO_PEOPLE

    rows = table.find_all('div', {"class": "table-row__wrapper"})
    rows_list = []
    # The first row is the table header, not a person.
    for row in rows[1:]:
        cells = row.find_all("div", {"class": "table-cell"})
        if len(cells) < 3:
            # Skip rows that do not have name / role / related cells.
            continue
        rows_list.append({
            "name": cells[0].get_text(),
            "role": cells[1].get_text(),
            "golden_related": cells[2].get_text(),
        })

    if position:
        # An unmatched filter must not fall back to the unfiltered list.
        rows_list = [row for row in rows_list if position in row["role"]]
    return rows_list or _NO_PEOPLE