├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
└── gsmarena_scraping.py

/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.3
bs4==0.0.1
certifi==2022.12.7
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.1
urllib3==1.26.5

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Mobile-Phone-Dataset-GSMArena

This is a Python script that scrapes mobile phone specifications from the GSMArena website and saves them as CSV files, one file per brand.

### Prerequisites

* Python 3.x
* Pip

### Installing

* Install the requirements file using pip3

```
pip3 install -r requirements.txt
```

### Running

Run this command in your terminal:

```
python3 gsmarena_scraping.py
```
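
### Working with the data

The script writes one CSV per brand into the `GSMArenaDataset` folder. `Brand`, `Model Name` and `Model Image` are always the first three columns; the remaining columns vary with whatever specification fields GSMArena lists. Below is a minimal sketch of loading one of these files; note that pandas is an assumption here rather than a project dependency, and `Samsung.csv` is just an example of a file the scraper may produce:

```
import pandas as pd

# Load one brand's CSV; each scraped specification becomes a column.
df = pd.read_csv("GSMArenaDataset/Samsung.csv")

# The first three columns are always present; the rest vary by device.
print(df[["Brand", "Model Name"]].head())
```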
## Built With

* [Beautifulsoup4](https://pypi.org/project/beautifulsoup4/) - Beautiful Soup 4, a Python library for web scraping.

## Authors

* **Deepak Chawla** - [Github](https://github.com/Deepakchawla), [Linkedin](https://www.linkedin.com/in/deepakchawla1307/) and [Website](http://deepakchawla.me/).

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Deepak Chawla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/gsmarena_scraping.py:
--------------------------------------------------------------------------------
import csv
import os
import sys
import time

import requests
from bs4 import BeautifulSoup


# The Gsmarena class scrapes phone brands and their devices from the GSMArena
# website and saves each brand's specifications to its own CSV file.
class Gsmarena():

    # Constructor to initialise variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website URL.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the CSV files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches the requested URL and returns the parsed HTML.
    def crawl_html_page(self, sub_url):
        url = self.url + sub_url  # URL whose HTML content will be parsed.
        header = {"User-Agent": "Mozilla/5.0"}  # Replace with the user agent of your system.
        time.sleep(30)  # Wait between requests so that the website does not block your IP.
        # Handle connection errors for the URL.
        try:
            page = requests.get(url, timeout=5, headers=header)
            soup = BeautifulSoup(page.text, 'html.parser')  # Parse the HTML data of the requested URL.
            return soup

        except requests.exceptions.ConnectionError:
            print("Please check your network connection and re-run the script.")
            sys.exit()

        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit()

    # This function crawls the phone brands page and returns a list of
    # [brand slug, device count, brand page link] entries.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands
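
    # Illustrative shape of the list returned by crawl_phone_brands(); the
    # actual entries depend on the live site at crawl time:
    #
    #   [['acer', '100', 'acer-phones-59.php'],
    #    ['alcatel', '397', 'alcatel-phones-5.php'],
    #    ...]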
    # This function crawls a brand's model pages and returns the list of links
    # to the individual device pages.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())  # Move the brand's first page to the front of the list.
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls a single device page and returns its specifications
    # as a dictionary for the given brand.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        soup = self.crawl_html_page(link)
        model_name = soup.find(class_='specs-phone-name-title').text
        model_img_html = soup.find(class_='specs-photo-main')
        model_img = model_img_html.find('img')['src']
        phone_data.update({"Brand": phone_brand})
        phone_data.update({"Model Name": model_name})
        phone_data.update({"Model Image": model_img})
        for table in soup.findAll('table'):
            for line in table.findAll('tr'):
                temp = []
                for l in line.findAll('td'):
                    text = l.getText().strip().replace("\n", "")
                    temp.append(text)
                if len(temp) < 2:  # Skip rows that do not hold a key/value pair.
                    continue
                if temp[0] in phone_data.keys():
                    temp[0] = temp[0] + '_1'  # Rename duplicate keys so earlier values are not overwritten.
                if temp[0] not in self.features:
                    self.features.append(temp[0])
                phone_data.update({temp[0]: temp[1]})
        return phone_data

    # This function creates the 'GSMArenaDataset' folder.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function lists the files already present in the 'GSMArenaDataset' directory.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)
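
    # Illustrative shape of a phone_data dictionary returned by
    # crawl_phones_models_specification(); keys beyond the first three vary
    # with whatever specification rows GSMArena lists for the device:
    #
    #   {'Brand': 'acer', 'Model Name': 'Acer Liquid Z6',
    #    'Model Image': 'https://...', 'Technology': 'GSM / HSPA',
    #    'Announced': '2016, September', ...}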
    # This function saves the device specifications of every brand to a CSV file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(os.path.join(self.absolute_path, brand[0].title() + ".csv"), "w", newline='', encoding='utf-8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for dicti in phones_data:
                        dict_writer.writerow(dicti)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file already exists in your directory.')


# This is the main block, which creates an object of the Gsmarena class and
# calls its save_specification_to_file function.
if __name__ == "__main__":
    obj = Gsmarena()
    try:
        obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Script stopped due to keyboard interruption.")
--------------------------------------------------------------------------------