├── .gitignore ├── LICENSE ├── README.md ├── constants.py ├── extensions.json ├── extensions_by_type.json ├── requirements.txt └── scrape_extensions.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Code editors and IDEs 104 | .vscode 105 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Faheel Ahmad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # File extensions 2 | 3 | JSON collection of scraped file extensions, along with their description and type, from [FileInfo.com][fileinfo]. 4 | 5 | The scraped data is available as JSON in two formats: 6 | 7 | 1. ### [Extensions by type](extensions_by_type.json) 8 | 9 | This JSON dictionary has **file types** as keys that map to an **array of extensions** which are of that type. 
10 | 11 | ```json 12 | { 13 | "3D image": [ 14 | ".OBJ", 15 | ".3DS", 16 | ".3DM", 17 | ".MAX", 18 | ... 19 | ], 20 | "Audio": [ 21 | ".AIF", 22 | ".M4A", 23 | ".MID", 24 | ".MP3", 25 | ... 26 | ], 27 | ... 28 | } 29 | ``` 30 | 31 | 2. ### [Extensions with details](extensions.json) 32 | 33 | This JSON dictionary has **file extensions** as keys that map to a dictionary containing the **description** of that extension and its **type**. 34 | 35 | ```json 36 | { 37 | ".!BT": { 38 | "description": "BitTorrent Incomplete Download File", 39 | "type": "Misc" 40 | }, 41 | ".!QB": { 42 | "description": "qBittorrent Partial Download File", 43 | "type": "Misc" 44 | }, 45 | ".!SYNC": { 46 | "description": "BitTorrent Partially Synced File", 47 | "type": "Misc" 48 | }, 49 | ... 50 | } 51 | ``` 52 | 53 | ## License 54 | 55 | This project is licensed under the terms of the [MIT license](LICENSE). 56 | 57 | 58 | [fileinfo]: https://fileinfo.com 59 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Contains package-wide constants. 
# ---- constants.py: package-wide constants ----------------------------------

# Root of FileInfo.com's per-category listing pages; a FILE_TYPES 'url' slug
# is appended to it to address one category page.
BASE_URL = 'https://fileinfo.com/filetypes/'

# Every file type the scraper visits, keyed by its display name.
#   'url'  - page slug appended to BASE_URL
#   'path' - folder-style label for the type; not read by the scraper below
#            (presumably consumed by another tool -- confirm before changing)
FILE_TYPES = {
    'Text': {
        'url': 'text',
        'path': 'Documents/Text',
    },
    'Data': {
        'url': 'datafiles-all',
        'path': 'Data',
    },
    'Audio': {
        'url': 'audio-all',
        'path': 'Music',
    },
    'Video': {
        'url': 'video',
        'path': 'Videos',
    },
    'eBook': {
        'url': 'ebook',
        'path': 'eBooks',
    },
    '3D image': {
        'url': '3d_image',
        'path': 'Images/3D',
    },
    'Raster image': {
        'url': 'raster_image',
        'path': 'Images/Raster',
    },
    'Vector image': {
        'url': 'vector_image',
        'path': 'Images/Vector',
    },
    'Raw camera image': {
        'url': 'camera_raw',
        'path': 'Images/Raw Camera',
    },
    'Page layout': {
        'url': 'page_layout',
        'path': 'Documents/Layouts',
    },
    'Spreadsheet': {
        'url': 'spreadsheet',
        'path': 'Documents/Spreadsheets',
    },
    'Database': {
        'url': 'database',
        'path': 'Databases',
    },
    'Executable': {
        'url': 'executable',
        'path': 'Executables',
    },
    'Game file': {
        'url': 'game-all',
        'path': 'Game Files',
    },
    'CAD': {
        'url': 'cad',
        'path': 'CAD',
    },
    'GIS': {
        'url': 'gis',
        'path': 'GIS',
    },
    'Web': {
        'url': 'web',
        'path': 'Web',
    },
    'Plugin': {
        'url': 'plugin',
        'path': 'Plugins',
    },
    'Font': {
        'url': 'font',
        'path': 'Fonts',
    },
    'System': {
        'url': 'system',
        'path': 'System Files',
    },
    'Settings': {
        'url': 'settings-all',
        'path': 'Settings',
    },
    'Encoded': {
        'url': 'encoded',
        'path': 'Misc/Encoded',
    },
    'Compressed': {
        'url': 'compressed',
        'path': 'Archives',
    },
    'Disk image': {
        'url': 'disk_image',
        'path': 'Disk Images',
    },
    'Code': {
        'url': 'developer-all',
        'path': 'Code',
    },
    'Backup': {
        'url': 'backup',
        'path': 'Backups',
    },
    'Misc': {
        'url': 'misc-all',
        'path': 'Misc',
    },
}


# ---- requirements.txt -------------------------------------------------------
# beautifulsoup4==4.9.3
# lxml==4.6.5
# soupsieve==2.2.1


# ---- scrape_extensions.py (#! /usr/bin/env python3) --------------------------
# Scrapes file extensions for various file types from FileInfo.com.
# The file types are the keys of FILE_TYPES, defined in the constants
# section above.

import io
import json
from time import sleep
from urllib.request import urlopen

# Scrape results, filled in by get_extensions_for():
#   EXTENSIONS_DICT    - extension -> {'type': ..., 'description': ...}
#   EXTENSIONS_BY_TYPE - file type -> list of extensions, in page order
EXTENSIONS_DICT = {}
EXTENSIONS_BY_TYPE = {}


def make_soup(url, timeout=30):
    '''
    Download `url` and return its HTML parsed with the lxml parser.

    The HTTP response is closed as soon as the body has been read, and the
    request gives up after `timeout` seconds instead of blocking forever on
    an unresponsive server.
    '''
    # Imported here rather than at module level so the helpers that do not
    # parse HTML stay usable when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    with urlopen(url, timeout=timeout) as response:
        html = response.read()
    return BeautifulSoup(html, 'lxml')


def get_extensions_for(type):
    '''
    Scrape the listing page of one file type and record its extensions.

    `type` must be a key of FILE_TYPES (the parameter name shadows the
    builtin; it is kept so existing keyword callers keep working).

    Every table row with at least two cells contributes one extension: the
    text of the first cell is the extension and the text of the second is
    its description. The extension is appended to EXTENSIONS_BY_TYPE[type]
    and stored in EXTENSIONS_DICT; an extension listed under several types
    keeps the entry of the type scraped last.

    Raises KeyError for an unknown type and ValueError when the page
    contains no <tbody> element to read rows from.
    '''
    url = BASE_URL + FILE_TYPES[type]['url']
    extension_table = make_soup(url).find('tbody')
    if extension_table is None:
        # find() returns None when nothing matches; fail with the URL in the
        # message instead of an AttributeError on None further down.
        raise ValueError('no extension table (<tbody>) found at ' + url)

    extensions = EXTENSIONS_BY_TYPE[type] = []
    for row in extension_table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 2:
            # Not an "extension | description" row; nothing to record.
            continue

        extension = cols[0].get_text()
        extensions.append(extension)
        EXTENSIONS_DICT[extension] = {
            'type': type,
            'description': cols[1].get_text(),
        }


def get_all_extensions(delay=1):
    '''
    Scrape every file type in FILE_TYPES, pausing `delay` seconds after
    each page so the requests are spread out.
    '''
    for file_type in FILE_TYPES:
        get_extensions_for(file_type)
        sleep(delay)


def write_dict_to_json_file(dictionary, filename):
    '''
    Serialize `dictionary` to `filename` as UTF-8 JSON.

    Keys are sorted and the output is indented by four spaces; non-ASCII
    characters are written as-is rather than as \\u escapes. No trailing
    newline is written.
    '''
    json_str = json.dumps(dictionary,
                          ensure_ascii=False,
                          indent=4,
                          sort_keys=True,
                          separators=(',', ': '))
    with io.open(filename, 'w', encoding='utf8') as file:
        file.write(json_str)


if __name__ == '__main__':
    get_all_extensions()
    write_dict_to_json_file(EXTENSIONS_DICT, 'extensions.json')
    write_dict_to_json_file(EXTENSIONS_BY_TYPE, 'extensions_by_type.json')