├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
└── gsmarena_scraping.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.3
bs4==0.0.1
certifi==2022.12.7
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.1
urllib3==1.26.5

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Mobile-Phone-Dataset-GSMArena

This is a Python script that scrapes mobile phone specifications from the GSMArena website and saves them as CSV files, one file per brand.

### Prerequisites

* Python 3.x
* Pip

### Installing

* Install the requirements file using pip3

```
pip3 install -r requirements.txt
```

### Running

Run this command in your terminal:

```
python3 gsmarena_scraping.py
```
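
### Working with the data

The script writes one CSV per brand into the `GSMArenaDataset` folder. `Brand`, `Model Name` and `Model Image` are always the first three columns; the remaining columns vary with whatever specification fields GSMArena lists. Below is a minimal sketch of loading one of these files; note that pandas is an assumption here rather than a project dependency, and `Samsung.csv` is just an example of a file the scraper may produce:

```
import pandas as pd

# Load one brand's CSV; each scraped specification becomes a column.
df = pd.read_csv("GSMArenaDataset/Samsung.csv")

# The first three columns are always present; the rest vary by device.
print(df[["Brand", "Model Name"]].head())
```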
## Built With

* [Beautifulsoup4](https://pypi.org/project/beautifulsoup4/) - Beautiful Soup 4, a Python library for web scraping.

## Authors

* **Deepak Chawla** - [Github](https://github.com/Deepakchawla), [Linkedin](https://www.linkedin.com/in/deepakchawla1307/) and [Website](http://deepakchawla.me/).

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Deepak Chawla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/gsmarena_scraping.py:
--------------------------------------------------------------------------------
import csv
import os
import sys
import time

import requests
from bs4 import BeautifulSoup


# The Gsmarena class scrapes phone brands and their devices from the GSMArena
# website and saves each brand's specifications to its own CSV file.
class Gsmarena():

    # Constructor to initialise variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website URL.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the CSV files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches the requested URL and returns the parsed HTML.
    def crawl_html_page(self, sub_url):
        url = self.url + sub_url  # URL whose HTML content will be parsed.
        header = {"User-Agent": "Mozilla/5.0"}  # Replace with the user agent of your system.
        time.sleep(30)  # Wait between requests so that the website does not block your IP.
        # Handle connection errors for the URL.
        try:
            page = requests.get(url, timeout=5, headers=header)
            soup = BeautifulSoup(page.text, 'html.parser')  # Parse the HTML data of the requested URL.
            return soup

        except requests.exceptions.ConnectionError:
            print("Please check your network connection and re-run the script.")
            sys.exit()

        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit()

    # This function crawls the phone brands page and returns a list of
    # [brand slug, device count, brand page link] entries.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands
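
    # Illustrative shape of the list returned by crawl_phone_brands(); the
    # actual entries depend on the live site at crawl time:
    #
    #   [['acer', '100', 'acer-phones-59.php'],
    #    ['alcatel', '397', 'alcatel-phones-5.php'],
    #    ...]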
    # This function crawls a brand's model pages and returns the list of links
    # to the individual device pages.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())  # Move the brand's first page to the front of the list.
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls a single device page and returns its specifications
    # as a dictionary for the given brand.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        soup = self.crawl_html_page(link)
        model_name = soup.find(class_='specs-phone-name-title').text
        model_img_html = soup.find(class_='specs-photo-main')
        model_img = model_img_html.find('img')['src']
        phone_data.update({"Brand": phone_brand})
        phone_data.update({"Model Name": model_name})
        phone_data.update({"Model Image": model_img})
        for table in soup.findAll('table'):
            for line in table.findAll('tr'):
                temp = []
                for l in line.findAll('td'):
                    text = l.getText().strip().replace("\n", "")
                    temp.append(text)
                if len(temp) < 2:  # Skip rows that do not hold a key/value pair.
                    continue
                if temp[0] in phone_data.keys():
                    temp[0] = temp[0] + '_1'  # Rename duplicate keys so earlier values are not overwritten.
                if temp[0] not in self.features:
                    self.features.append(temp[0])
                phone_data.update({temp[0]: temp[1]})
        return phone_data

    # This function creates the 'GSMArenaDataset' folder.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function lists the files already present in the 'GSMArenaDataset' directory.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)
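
    # Illustrative shape of a phone_data dictionary returned by
    # crawl_phones_models_specification(); keys beyond the first three vary
    # with whatever specification rows GSMArena lists for the device:
    #
    #   {'Brand': 'acer', 'Model Name': 'Acer Liquid Z6',
    #    'Model Image': 'https://...', 'Technology': 'GSM / HSPA',
    #    'Announced': '2016, September', ...}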
    # This function saves the device specifications of every brand to a CSV file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(os.path.join(self.absolute_path, brand[0].title() + ".csv"), "w", newline='', encoding='utf-8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for dicti in phones_data:
                        dict_writer.writerow(dicti)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file already exists in your directory.')


# This is the main block, which creates an object of the Gsmarena class and
# calls its save_specification_to_file function.
if __name__ == "__main__":
    obj = Gsmarena()
    try:
        obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Script stopped due to keyboard interruption.")
--------------------------------------------------------------------------------