├── .gitignore
├── LICENSE.md
├── README.md
├── mock_data
│   ├── found.csv
│   ├── list_of_company_names_raw.csv
│   └── not_found.csv
├── requirements.txt
└── src
    ├── clipboard_fetcher.py
    └── crunchbase_scraper.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | .vscode/
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Andrei Stoica
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Crunchbase Scraper
2 |
3 |
4 |
5 | [GitHub Issues](https://github.com/stoicaandrei/crunchbase-scraper/issues)
6 | [GitHub Pull Requests](https://github.com/stoicaandrei/crunchbase-scraper/pulls)
7 | [License](/LICENSE)
8 |
9 |
10 |
11 |
12 |
13 | ## 📝 Table of Contents
14 |
15 | - [About](#about)
16 | - [Getting Started](#getting_started)
17 | - [Usage](#usage)
18 | - [Built Using](#built_using)
21 | - [Authors](#authors)
22 | - [Acknowledgments](#acknowledgement)
23 |
24 | ## 🧐 About
25 |
26 | This project lets you save Crunchbase data without access to their API. All you need is a Crunchbase free trial account.
27 |
28 | It gathers company data such as the website, the company's Twitter handle, and the Twitter handles of its founders (CEO and CTO). It can easily be modified to gather other types of data.
29 |
30 | ## 🏁 Getting Started
31 |
32 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
33 |
34 |
35 |
36 | ### Installing
37 |
38 | Install the Python dependencies (the scripts use f-strings, so Python 3.6+ is required):
39 |
40 | ```
41 | pip install -r requirements.txt
42 | ```
43 |
44 |
45 | ## 🎈 Usage
46 |
47 | The project is composed of two scripts: `clipboard_fetcher.py` and `crunchbase_scraper.py`. Both read and write their CSV files under `../data/` relative to `src`, so run them from the `src` directory and create a `data` directory at the repository root first.
48 |
49 | To build a list of companies, saved in `list_of_company_names_raw.csv`, run `python clipboard_fetcher.py`. Then log in to [Crunchbase](https://crunchbase.com), open an advanced search and press `cmd+a, cmd+c`. The script detects the copied content automatically and appends the company names to the list CSV. Note that it reads the clipboard with `pbpaste`, so it only works on macOS.
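
A minimal sketch of that workflow, assuming you run from the `src` directory (the `data` directory name comes from the paths hard-coded in both scripts):

```
mkdir -p ../data               # the scripts read and write their CSVs in ../data/
python clipboard_fetcher.py    # keep it running while you copy search result pages
```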
50 |
51 | To scrape data for the collected company names, run `python crunchbase_scraper.py`. It writes the results to three files:
52 |
53 | * `found.csv` - the companies that were found. Format `Company Name, Company Website, Company Twitter, CEO Twitter, CTO Twitter`
54 | * `not_found.csv` - the companies that were not found based on the company name. Format `Company Name`
55 | * `error.csv` - the companies that returned an error while scraping. Format `Company Name`
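
For reference, a row of `found.csv` looks like this (taken from `mock_data/found.csv`; `None` marks a handle the scraper could not find):

```
"Weave","http://www.getweave.com","getweave","brandonrodman","clinton_berry"
```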
56 |
57 | ## ⛏️ Built Using
58 |
59 | - [PyQt5](https://pypi.org/project/PyQt5/) - Renders the JavaScript-heavy Crunchbase pages (QtWebEngine)
60 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - HTML parsing / scraping library
61 |
62 | ## ✍️ Authors
63 |
64 | - [@stoicaandrei](https://github.com/stoicaandrei) - Idea & Initial work
65 |
66 | See also the list of [contributors](https://github.com/stoicaandrei/crunchbase-scraper/contributors) who participated in this project.
67 |
68 | ## 🎉 Acknowledgements
69 |
70 | - Hat tip to anyone whose code was used
71 | - Inspiration
72 | - References
73 |
--------------------------------------------------------------------------------
/mock_data/found.csv:
--------------------------------------------------------------------------------
1 |
2 | "SiteMinder","https://www.siteminder.com/"","SiteMinder_News"","None","None"
3 | "First Opinion","http://FirstOpinionApp.com","FirstOpinionApp"","mckaythomas","Jaymon"
4 | "Groupalia","http://www.groupalia.com","groupaliaES"","joaquinengel","None"
5 | "Now Account Network","https://nowcorp.com/"","nowaccount?lang=en"","None","None"
6 | "Semasio","http://www.semasio.com","SemasioGlobal"","None","None"
7 | "Yidinghuo","https://www.dinghuo123.com/"","None","None","None"
8 | "Weave","http://www.getweave.com","getweave"","brandonrodman","clinton_berry"
9 | "Yunmanman","https://www.ymm56.com","None","None","None"
10 | "Target2Sell","http://www.target2sell.com/en/"","target2sell"","Ziserman","None"
11 | "PRIVIT","https://privit.com/"","PrivitProfile"","None","None"
12 | "Bluebox","http://www.bluebox.com","BlueboxSec"","PamKostka","None"
13 | "Element","http://www.element8angels.com","None","None","None"
14 | "Facelift","https://www.facelift-bbt.com","FACELIFTbbt"","None","None"
15 | "Sage Intacct","https://www.sageintacct.com/"","SageIntacct"","None","None"
16 | "CitiusTech","http://citiustech.com","CitiusTech"","rizwankoita","None"
17 | "WebInterpret","https://webinterpret.com/"","WebInterpret_En"","None","None"
18 | "FittingBox","http://www.fittingbox.com","FittingBox"","None","BeN_Fittingbox"
19 | "Mobio Technologies","https://www.google.com/finance?q=CVE:MBO"","None","None","None"
20 | "Method:CRM","https://www.method.me","MethodCRM"","None","None"
21 | "Sensorberg","http://www.sensorberg.com","sensorberg"","None","None"
22 | "Booktrack","http://www.booktrack.com","booktrack"","None","None"
23 | "Slack","https://www.google.com/finance?q=NYSE:WORK"","None","None","None"
24 | "HELIX","http://www.helix.com/"","my_helix"","None","scottmburke"
25 | "Serious Labs","http://seriouslabs.com/"","SeriousLabs"","None","None"
26 | "Bellabeat","http://www.bellabeat.com","GetBellaBeat"","mursandro","None"
27 | "Rigetti Computing","http://www.rigetti.com/"","rigetti"","None","None"
28 | "Activ Technologies","http://activtech.com","ActivTech"","None","None"
29 | "Vlocity","https://vlocity.com/"","vlocity"","davidschmaier?lang=en","None"
30 | "PropTech Holdings","https://proptechholdings.com","None","None","None"
31 | "Riskmethods","http://www.riskmethods.net/en"","riskmethods1"","None","None"
32 | "Crol","https://www.crol.mx","CrolMX"","None","None"
33 | "Payapps","http://www.payapps.com","payappssoftware"","None","None"
34 | "Rentlytics","http://www.rentlytics.com","rentlytics"","None","None"
35 | "GreenPocket","https://www.greenpocket.com/"","GreenPocketGmbH"","None","None"
36 | "Grapeshot","http://www.grapeshot.com","Grapeshot_"","None","wizeline"
37 | "Influere.io","http://www.influere.io","None","None","None"
38 | "Pushfor","http://www.pushfor.com","Pushfor"","None","None"
39 | "Abiquo Group","http://www.abiquo.com","abiquo"","None","None"
40 | "iQVCloud","http://www.iqvcloud.net","None","None","None"
41 | "Flexport","https://www.flexport.com/"","flexport"","typesfast","None"
42 | "Agritek Holdings Inc","https://www.google.com/finance?q=OTCQB:AGTK"","None","None","None"
43 | "Boundary","http://www.boundary.com","boundary"","None","None"
44 | "GetOne Rewards","http://getonerewards.com","GetOneRewards"","None","Justin_Michela"
45 | "Liaison Technologies","http://www.liaison.com","Liaisontech"","None","None"
46 | "Confide","http://getconfide.com","GetConfide"","None","hongrich"
47 | "Weka.IO","http://www.weka.io","wekaio"","None","7MPS"
48 | "4C Insights","http://www.4cinsights.com","4cinsights"","None","None"
49 | "6sense","http://www.6sense.com","6SenseInc"","None","viralbajaria?lang=en"
50 | "PeopleDoc","http://www.people-doc.com","peopledoc_inc"","johnbenhamou","None"
51 | "IDV Solutions","http://www.idvsolutions.com","idvsolutions"","None","None"
52 | "Clio","https://www.clio.com","goclio"","jack_newton","None"
53 | "Silent Herdsman","http://silentherdsman.com","SilentHerdsman"","None","None"
54 | "FamiHero","http://www.famihero.com","famihero"","srobbes","zenanny"
55 | "Addapp","https://addapp.io/"","addappio"","None","None"
56 | "Blackford Analysis","http://www.blackfordanalysis.com","blackford"","None","None"
57 | "Tamoco","http://www.tamoco.com","tamocotech"","dsva","None"
58 | "MemSQL","http://www.memsql.com","memsql"","None","None"
59 | "Babel Street","http://babelstreet.com","babelstreet"","None","None"
60 | "MyActivityPal","http://www.myactivitypal.com","activitypal"","IkeSingh","None"
61 | "Freightos","https://www.freightos.com","freightos"","None","None"
62 | "Foradian","http://www.foradian.com","foradian"","None","None"
63 | "Mirada Medical","http://mirada-medical.com","MiradaMedical"","None","None"
64 | "KYON","http://www.kyontracker.com","kyontracker"","None","None"
65 | "Mekitec","http://mekitec.com","Mekitec"","None","None"
66 | "Wunwun","http://wunwun.com","wunwun"","calvinwl","None"
67 | "LoginRadius","http://www.loginradius.com","LoginRadius"","None","dip_ak"
68 | "Dodles","http://dodl.es/"","dodles_"","cragi","None"
69 | "Speek","http://www.speek.com","SpeekApp"","johnbracken","SpeekMatt"
70 | "Campaign Monitor","http://www.campaignmonitor.com","campaignmonitor"","None","None"
71 | "Three Day Rule","http://threedayrule.com","threedayrule"","None","None"
72 | "Act-On Software","http://www.act-on.com","ActOnSoftware"","None","None"
73 | "Fuel3D","http://www.fuel-3d.com","Fuel_3D"","None","None"
74 | "ServiceMax","http://www.servicemax.com","ServiceMax"","None","None"
75 | "Ants Technology","http://ants-technology.com","ants_technology"","None","None"
--------------------------------------------------------------------------------
/mock_data/not_found.csv:
--------------------------------------------------------------------------------
1 |
2 | Hiringboss Holdings Pte. Ltd.
3 | ExamSoft
4 | Revolution Analytics
5 | Keap
6 | Kazoo
7 | Bonfire (Formerly RVSpotfinder.com)
8 | Upskill
9 | Sorted Group
10 | Cupris Health
11 | CircleCI
12 | Crate.io
13 | Smartsheet
14 | Culer
15 | TakeLessons
16 | Aver
17 | Sand 9
18 | Skyfence Networks Ltd.
19 | BitAnimate
20 | Sphero
21 | SkyWire
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyQt5==5.10.1
2 | PyQt5-sip==12.7.0
3 | beautifulsoup4==4.8.0
4 | lxml  # required by crunchbase_scraper.py, which uses BeautifulSoup's 'lxml' parser
--------------------------------------------------------------------------------
/src/clipboard_fetcher.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import threading
3 | import re
4 |
5 |
6 | def getClipboardData():
7 |     p = subprocess.Popen(['pbpaste'], stdout=subprocess.PIPE)  # pbpaste reads the clipboard (macOS only)
8 | retcode = p.wait()
9 | data = p.stdout.read()
10 | return data.decode('utf-8')
11 |
12 |
13 | clip = getClipboardData()
14 |
15 |
16 | def check_for_clipboard_change():
17 | global clip
18 |
19 | threading.Timer(0.5, check_for_clipboard_change).start()
20 |
21 | clip2 = getClipboardData()
22 |
23 | if clip != clip2:
24 | clip = clip2
25 | print('clipboard changed')
26 |
27 |         match = re.findall(r'\d+\.\n(?:.*\n){3}(.*)', clip)  # company name = 4th line after each "N." entry
28 |         out = '\n'.join(match)
29 |
30 |         with open('../data/list_of_company_names_raw.csv', 'a') as file:
31 |             file.write('\n' + out)
32 |
33 |
34 | check_for_clipboard_change()
35 |
--------------------------------------------------------------------------------
/src/crunchbase_scraper.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 | from bs4 import BeautifulSoup
4 |
5 | from PyQt5.QtWidgets import QApplication
6 | from PyQt5.QtCore import QUrl
7 | from PyQt5.QtWebEngineWidgets import QWebEnginePage
8 |
9 | BASE_URL = 'https://www.crunchbase.com'
10 |
11 | companies = []
12 | pages = []
13 |
14 |
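# Crunchbase pages are rendered client-side with JavaScript, so fetching the raw
# HTML is not enough. Page loads a URL in an off-screen QtWebEngine page, waits
# for loadFinished and stores the rendered HTML; toHtml() is asynchronous, so the
# result arrives through the Callable callback, which quits the event loop so the
# constructor can return.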
15 | class Page(QWebEnginePage):
16 | def __init__(self, url):
17 |         self.app = QApplication.instance() or QApplication(sys.argv)  # only one QApplication may exist per process
18 | QWebEnginePage.__init__(self)
19 | self.html = ''
20 | self.loadFinished.connect(self._on_load_finished)
21 | self.load(QUrl(url))
22 | self.app.exec_()
23 |
24 | def _on_load_finished(self):
25 | self.html = self.toHtml(self.Callable)
26 |
27 | def Callable(self, html_str):
28 | self.html = html_str
29 | self.app.quit()
30 |
31 |
32 | def get_page(route):
33 | if not route:
34 | return None
35 |
36 | try:
37 | url = f'{BASE_URL}{route}'
38 | pages.append(Page(url))
39 | soup = BeautifulSoup(pages[-1].html, 'lxml')
40 | pages[-1].deleteLater()
41 |     except Exception:  # the page failed to load or parse
42 | return None
43 | else:
44 | return soup
45 |
46 |
47 | def format_name(name):
48 | return name.lower().replace('\n', '').strip().replace('.', '-').replace(' ', '-').replace(':', '-')
49 |
50 |
51 | def extract_link(element):
52 |     return re.search(r'(https?:\/\/)(www\.)?([a-zA-Z0-9]+(-?[a-zA-Z0-9])*\.)+([a-z]{2,})(\/[^"\s]*)?', element).group(0)  # [^"\s] keeps the href's closing quote out of the URL
53 |
54 |
55 | def print_green(s):
56 | print(f'\033[92m{s}\033[0m')
57 |
58 |
59 | def scrape_data(company_name):
60 | name = format_name(company_name)
61 |
62 | print_green(f'Checking {company_name} alias {name}')
63 |
64 | # load the page in "soup" variable
65 | soup = get_page(f'/organization/{name}')
66 | if not soup:
67 | print_green(
68 | f'{company_name}, alias {name} gave an error while loading')
69 | with open('../data/error.csv', 'a') as file:
70 | file.write('\n' + company_name)
71 | return
72 |
73 | # extract website and social media links
74 | html_links = soup.find_all(
75 | 'a', class_="cb-link component--field-formatter field-type-link layout-row layout-align-start-end ng-star-inserted")
76 | links = []
77 | for html in html_links:
78 | link = extract_link(str(html))
79 | links.append(link)
80 |
81 | # the name wasn't correct if there are no social links on the page
82 | if len(links) == 0:
83 | print_green(f'{company_name}, alias {name} could not be found')
84 | with open('../data/not_found.csv', 'a') as file:
85 | file.write('\n' + company_name)
86 | return
87 |
88 | website = links[0]
89 | company_twitter = None
90 | if 'twitter' in links[-1]:
91 | company_twitter = links[-1].split('/')[-1]
92 |
93 |     # extract the people in the team section
94 | html_persons = soup.find_all(
95 | 'div', class_='flex cb-padding-medium-left cb-break-word cb-hyphen')
96 |
97 | ceo = None
98 | cto = None
99 | founders = []
100 |
101 | for html_person in html_persons:
102 | name_link = html_person.find('a')['href']
103 | position = html_person.find('span')['title']
104 |
105 | if re.search(r'(\s|^)((ceo)|(Chief Executive Officer))(\s|$)', position, re.I):
106 | ceo = name_link
107 |
108 | if re.search(r'(\s|^)((cto)|(Chief Technical Officer)|(Chief technology officer))(\s|$)', position, re.I):
109 | cto = name_link
110 |
111 | if re.search(r'founder', position, re.I):
112 | founders.append(name_link)
113 |
114 |     # fall back to the founders if no explicit CEO/CTO title was found
115 | if not ceo and not cto:
116 | if len(founders) >= 2:
117 |             (ceo, cto) = founders[:2]  # unpacking the full list would fail with 3+ founders
118 | elif len(founders) == 1:
119 | ceo = founders[0]
120 |
121 | ceo_twitter = None
122 | cto_twitter = None
123 |
124 | for person in (ceo, cto):
125 | if not person:
126 | continue
127 |
128 | soup = get_page(person)
129 | if not soup:
130 | print(f'Could not find {person}')
131 | continue
132 |
133 | card = soup.find(
134 | 'mat-card', class_='component--section-layout mat-card')
135 |
136 | person_twitter = re.search(r'twitter.com/([^"]*)"', str(card))
137 | if not person_twitter:
138 | print_green(f"{person} doesn't have a twitter account")
139 | continue
140 |
141 | if person is ceo:
142 | ceo_twitter = person_twitter.group(1)
143 | else:
144 | cto_twitter = person_twitter.group(1)
145 |
146 | with open('../data/found.csv', 'a') as file:
147 | file.write(
148 | f'\n"{company_name}","{website}","{company_twitter}","{ceo_twitter}","{cto_twitter}"')
149 |
150 |
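# Read the raw company list collected by clipboard_fetcher.py, drop duplicate
# names while preserving their order, then scrape each company.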
151 | with open('../data/list_of_company_names_raw.csv', 'r') as fp:
152 | line = fp.readline()
153 |
154 | while line:
155 | companies.append(line.replace('\n', ''))
156 |
157 | line = fp.readline()
158 |
159 | companies = list(dict.fromkeys(companies))
160 |
161 | for company in companies:
162 | scrape_data(company)
163 |
--------------------------------------------------------------------------------