├── MANIFEST.in ├── .gitignore ├── setup.py ├── LICENSE ├── CHANGES.txt ├── test.py ├── README.md └── xenocanto.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include LICENSE 3 | include test.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__ 2 | /build 3 | /dataset 4 | /dist 5 | /.vscode 6 | /venv 7 | **.pyc 8 | *.egg-info 9 | .DS_Store 10 | requirements.txt 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='xeno-canto', 5 | version='3.0', 6 | python_requires='>3.4.0', 7 | description='xeno-canto.org API Wrapper', 8 | long_description=open('README.md').read(), 9 | long_description_content_type='text/markdown', 10 | url='https://github.com/ntivirikin/xeno-canto-py', 11 | author='Nazariy Tivirikin', 12 | author_email='n.tivirikin@gmail.com', 13 | license="MIT", 14 | py_modules=['xenocanto'], 15 | entry_points={"console_scripts": ["xeno-canto = xenocanto:main"]}, 16 | install_requires=[ 17 | 'aiofiles>=0.8.0', 18 | 'aiohttp>=3.8.1', 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2022 Nazariy Tivirikin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 2 | v1.0 March 12, 2019 -- Initial release. 3 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 4 | v1.0.1 March 13, 2019 -- Refactor to PEP8 standards, add comments. 5 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 6 | v1.0.2 March 18, 2019 -- Update package name in README. 
7 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 8 | v1.1 March 30, 2019 -- Added metadata generation. 9 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 10 | v2.0 October 21, 2019 -- Added purge, delete, and execution via command line arguments. 11 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 12 | v2.0.1 March 15, 2020 -- Installed command-line tool from pull request. 13 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 14 | v2.0.2 Oct 26, 2021 -- Added console messages. 15 | -- Added confirmation message when running delete. 16 | -- Fixed recordings not being completely downloaded if the download was canceled and executed with the same query. 17 | -- Fixed SSL certificate error. 18 | -- Fixed using double quote operator in arguments. 19 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 20 | v2.0.3 Jul 11, 2022 -- Decreased number of console messages when downloading. 21 | -- Removed SSL certificate mitigation, may need workaround on some PCs. 22 | -- Modified test cases to include more assertions. 23 | -- Fixed download error due to switch to HTTPS. 24 | -- Fixed print error with concatenation of str and int in purge function. 25 | -- Fixed expected naming scheme in assertion of test cases. 26 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 27 | v2.0.4 Jul 18, 2022 -- Added method for generating links from metadata in preparation for async implementation. 28 | -- Fixed issue where 1st page of recordings was continuosly redownloaded if more than one page was present. 29 | -- Fixed typo in metadata console messages. 30 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 31 | v3.0.0 Aug 9, 2022 -- Added asynchronous downloading of recordings. 32 | -- Separated download function into two for easier async implementation. 33 | -- Removed "in_progress.txt" functionality to work with async. Be careful with stopping a download as this will corrupt files. 34 | -- Modified tests to include async functionality. 35 | -- Updated README. 36 | -- Added further comments and console messages explaining code functionality. 37 | -- Added checking for existence of directory to prevent errors. 
38 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import shutil 4 | import unittest 5 | from urllib import request 6 | from xenocanto import metadata, download, gen_meta, purge, delete 7 | 8 | 9 | # TODO: 10 | # [ ] Test resuming a download after interrupt 11 | class TestCases(unittest.TestCase): 12 | 13 | # Check if connection to the API can be established 14 | def test_conn(self): 15 | url = 'https://www.xeno-canto.org/api/2/recordings?query=cnt:any' 16 | status = request.urlopen(url).getcode() 17 | self.assertEqual(status, 200) 18 | 19 | # Checks if metadata is successfully downloaded into the expected 20 | # folder structure 21 | def test_metadata(self): 22 | metadata(['Bearded Bellbird', 'q:A']) 23 | self.assertTrue(os.path.exists 24 | ('dataset/metadata/BeardedBellbirdq_A/page1.json')) 25 | 26 | # Checks if audio files are downloaded into the correct directory 27 | def test_download(self): 28 | asyncio.run(download(['gen:Otis'])) 29 | self.assertTrue(os.path.exists('dataset/metadata/gen_Otis/page1.json')) 30 | self.assertTrue(os.path.exists('dataset/audio/GreatBustard/' 31 | '459281.mp3')) 32 | 33 | # Check if purge is deleting folders based on file count 34 | def test_purge(self): 35 | asyncio.run(download(['gen:Otis'])) 36 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 37 | purge(7) 38 | self.assertFalse(os.path.exists('dataset/audio/GreatBustard/')) 39 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/')) 40 | 41 | # Check if metadata is being correctly generated for one 42 | # recording with metadata already saved 43 | def test_gen_meta_with_extra_metadata(self): 44 | metadata(['gen:Otis']) 45 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 46 | gen_meta() 47 | self.assertTrue(os.path.exists('dataset/metadata/library.json')) 48 | 49 | # Check if deleting files using multiple filters 50 | def test_delete(self): 51 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 52 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 53 | '493159.mp3')) 54 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 55 | '427845.mp3')) 56 | delete(['id:493159', 'id:427845']) 57 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 58 | '493159.mp3')) 59 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 60 | '427845.mp3')) 61 | 62 | # Check if deleting files from multiple folders 63 | def test_delete_multiple_species(self): 64 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 65 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 66 | '493159.mp3')) 67 | asyncio.run(download(['gen:Otis'])) 68 | self.assertTrue(os.path.exists('dataset/audio/GreatBustard/')) 69 | delete(['id:493159', 'gen:Otis']) 70 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 71 | '493159.mp3')) 72 | self.assertFalse(os.path.exists('dataset/audio/GreatBustard/')) 73 | 74 | # Check if metadata is being correctly generated when some metadata 75 | # is saved and some must be retrieved from an API call 76 | def test_gen_meta_with_extra_tracks(self): 77 | path = metadata(['gen:Otis']) 78 | asyncio.run(download(['gen:Otis'])) 79 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 80 | shutil.rmtree(path) 81 | gen_meta() 82 | self.assertTrue(os.path.exists('dataset/metadata/library.json')) 83 | 
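    # Example added for illustration (not part of the original suite): checks
    # the purge() threshold. purge(num) removes only folders holding strictly
    # fewer than num recordings, so a folder containing exactly num files
    # should survive the purge.
    def test_purge_keeps_folder_at_threshold(self):
        asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil']))
        count = len([f for f in os.listdir('dataset/audio/BeardedBellbird/')
                     if not f.startswith('.')])
        purge(count)
        self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/'))
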
84 |     # Removes files used in testing
85 |     def tearDown(self):
86 |         try:
87 |             shutil.rmtree('dataset/')
88 |         except OSError:
89 |             pass
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xeno-canto API Wrapper
2 | xeno-canto-py is an API wrapper designed to help users efficiently download xeno-canto.org recordings and associated information. Download requests are processed concurrently using the `asyncio`, `aiohttp`, and `aiofiles` libraries to optimize retrieval time. The wrapper also offers delete and metadata generation functions for recording library management.
3 | 
4 | Created to aid in data collection and filtering for the training of machine learning models.
5 | ## Installation
6 | xeno-canto-py is available on [PyPI](https://pypi.org/project/xeno-canto/) and can be installed with the package manager [pip](https://pip.pypa.io/en/stable/):
7 | ```bash
8 | pip install xeno-canto
9 | ```
10 | The package can then be used straight from the command line:
11 | ```bash
12 | xeno-canto -dl Bearded Bellbird
13 | ```
14 | Or imported into an existing Python project:
15 | ```python
16 | import xenocanto
17 | ```
18 | If you want more control over the wrapper, navigate to your desired file location in a terminal window and clone the repository with the following command:
19 | ```bash
20 | git clone https://github.com/ntivirikin/xeno-canto-py
21 | ```
22 | The only file required for operation is `xenocanto.py`, so feel free to remove the others or move `xenocanto.py` to another working directory.
23 | 
24 | **WARNING:** Please exercise caution when using `test.py`, as executing the tests via `unittest` or another test harness will delete any `dataset` folder in the working directory once the tests complete.
25 | ## Usage
26 | The xeno-canto-py wrapper supports the retrieval of metadata and audio from the xeno-canto database, as well as library management functions such as deletion of recordings matching input tags, removal of folders with an insufficient number of audio recordings, and generation of a single JSON metadata file for a given path containing xeno-canto audio recordings. Examples of command usage are given below.
27 | 
28 | ---
29 | **Metadata Download**
30 | `xeno-canto -m [parameters]`
31 | 
32 | Downloads metadata as a series of JSON files and returns the path to the metadata folder.
33 | 
34 | _Example: Metadata retrieval for Bearded Bellbird recordings of quality A_
35 | 
36 | `xeno-canto -m Bearded Bellbird q:A`
37 | 
38 | ---
39 | **Audio Recording Download**
40 | `xeno-canto -dl [parameters]`
41 | 
42 | Retrieves the metadata for the request and uses it to download audio recordings as MP3s from the database.
43 | 
44 | _Example: Download Bearded Bellbird recordings from the country of Brazil_
45 | 
46 | `xeno-canto -dl Bearded Bellbird cnt:Brazil`
47 | 
48 | ---
49 | **Delete Recordings**
50 | `xeno-canto -del [parameters]`
51 | 
52 | Deletes recordings matching **ANY** of the parameters given as input.
53 | 
54 | _Example: Delete **ALL** quality D recordings and **ALL** recordings from Brazil_
55 | 
56 | `xeno-canto -del q:D cnt:Brazil`
57 | 
58 | ---
59 | **Purge Folders**
60 | 
61 | Removes any folders within the `dataset/audio/` directory that have fewer recordings than the input value `num`.
62 | 
63 | `xeno-canto -p [num]`
64 | 
65 | _Example: Remove recording folders with fewer than 10 recordings (folders with exactly 10 are kept)_
66 | 
67 | `xeno-canto -p 10`
68 | 
69 | ---
70 | **Generate Metadata**
71 | 
72 | Generates metadata for the xeno-canto database recordings at the input path, defaulting to `dataset/audio/` within the working directory if none is given.
73 | 
74 | `xeno-canto -g [path]`
75 | 
76 | _Example: Generate metadata for the recordings located in `bird_rec/audio/` within the working directory_
77 | 
78 | `xeno-canto -g bird_rec/audio/`
79 | 
80 | ---
81 | `parameters` are given in tag:value form in accordance with the API search guidelines. For help in building search terms, consult the [xeno-canto API guide](https://xeno-canto.org/explore/api) and this [article](https://xeno-canto.org/article/153). The only exception is when providing English bird names as an argument to the delete function: these must be preceded by `en:` and have all spaces replaced with underscores, e.g. `xeno-canto -del en:Bearded_Bellbird`.
82 | ### Directory Structure
83 | Files are saved in the working directory under the folder `dataset/`. Metadata and audio recordings are separated into `metadata/` and `audio/` folders by request information and bird species respectively (species folder names have their spaces removed). For example:
84 | ```
85 | dataset/
86 |   - audio/
87 |     - IndigoBunting/
88 |       - 14325.mp3
89 |     - NorthernCardinal/
90 |       - 8273.mp3
91 |   - metadata/
92 |     - library.json
93 |     - IndigoBuntingcnt_Canada/
94 |       - page1.json
95 |     - NorthernCardinalq_A/
96 |       - page1.json
97 | ```
98 | Metadata is retrieved as a series of JSON files containing information on each of the audio recordings matching the request parameters provided as input. The metadata also contains the download links used to retrieve the audio recordings. The `library.json` file is generated by running the metadata generation command `-g`.
99 | ### Error 503
100 | If an Error 503 is returned when attempting a recording download, try passing a value lower than 4 as `num_chunks` in `download(filt, num_chunks)`. This can be done either by changing the default value in the function definition, or by passing a value into `download(params)` in the body of `main()` as shown below.
101 | ```python
102 | # Running with default 4 locks on semaphore
103 | asyncio.run(download(params))
104 | 
105 | # Running with 3 locks rather than default
106 | asyncio.run(download(params, 3))
107 | ```
108 | Alternatively, you can experiment with higher values for `num_chunks` to see whether download performance improves.
109 | ## Contributing
110 | All pull requests are welcome! If any issues are found, please do not hesitate to bring them to my attention.
111 | ## Acknowledgements
112 | Thank you to the team at xeno-canto.org and all its contributors for putting together such an amazing database.
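## Python Usage
The command-line operations described in the Usage section map directly onto functions in `xenocanto.py` (`metadata`, `download`, `purge`, `delete`, and `gen_meta`), so the wrapper can also be driven from an existing Python project. Below is a minimal sketch of the equivalent calls using the same example queries as above; note that `download` is a coroutine and must be run with `asyncio`.
```python
import asyncio
from xenocanto import metadata, download, purge, gen_meta, delete

# Retrieve metadata only; returns the path to the folder of JSON pages
meta_path = metadata(['Bearded Bellbird', 'q:A'])

# Download recordings (the metadata step runs internally); download() is
# asynchronous, so it is executed through asyncio.run()
asyncio.run(download(['Bearded Bellbird', 'cnt:Brazil']))

# Lower the semaphore limit from the default of 4 if the server returns 503s
asyncio.run(download(['gen:Otis'], 3))

# Remove species folders holding fewer than 10 recordings, then rebuild
# the library.json metadata file for the recordings that remain
purge(10)
gen_meta()

# Delete recordings matching ANY of the given tags
delete(['q:D', 'cnt:Brazil'])
```
All of these calls read and write under `dataset/` in the current working directory, exactly as the command-line tool does.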
113 | ## License 114 | [MIT](https://choosealicense.com/licenses/mit/) 115 | -------------------------------------------------------------------------------- /xenocanto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import asyncio 4 | import json 5 | import os 6 | import shutil 7 | import sys 8 | import time 9 | from urllib import request, error 10 | 11 | import aiofiles 12 | import aiohttp 13 | 14 | 15 | # TODO: 16 | # [X] Log messages to console 17 | # [ ] Add sono image download capabilities 18 | # [ ] Add ability to process multiple species in one command 19 | # [ ] Create function to verify all recordings downloaded correctly 20 | # [ ] Purge recordings that did not complete download 21 | # [ ] Add text file processing for batch requests 22 | # [ ] Display tables of tags collected 23 | # 24 | # FIXME: 25 | # [ ] Modify delete method to remove recordings containing all input tags 26 | # rather than any one of the tags 27 | # [ ] Allow the delete method to accept species names with spaces 28 | 29 | 30 | # Retrieves metadata for requested recordings in the form of a JSON file 31 | def metadata(filt): 32 | page = 1 33 | page_num = 1 34 | filt_path = list() 35 | filt_url = list() 36 | print("Retrieving metadata...") 37 | 38 | # Scrubbing input for file name and url 39 | for f in filt: 40 | filt_url.append(f.replace(' ', '%20')) 41 | filt_path.append((f.replace(' ', '')).replace(':', '_') 42 | .replace('\"', '')) 43 | 44 | path = 'dataset/metadata/' + ''.join(filt_path) 45 | 46 | # Create a metadata folder if it does not exist already 47 | if not os.path.exists(path): 48 | os.makedirs(path) 49 | 50 | # Input parameters are separated by %20 for use in URL 51 | query = ('%20'.join(filt_url)) 52 | 53 | # Save all pages of the JSON response 54 | while page < (page_num + 1): 55 | url = ('https://www.xeno-canto.org/api/2/recordings?' 
56 | 'query={0}&page={1}'.format(query, page)) 57 | try: 58 | r = request.urlopen(url) 59 | except error.HTTPError as e: 60 | print("An error has occurred: " + str(e)) 61 | exit() 62 | print("Downloading metadata page " + str(page) + "...") 63 | data = json.loads(r.read().decode('UTF-8')) 64 | filename = path + '/page' + str(page) + '.json' 65 | with open(filename, 'w') as saved: 66 | json.dump(data, saved) 67 | page_num = data['numPages'] 68 | page += 1 69 | 70 | # Rate limit of one request per second 71 | time.sleep(1) 72 | 73 | # Return the path to the folder containing downloaded metadata 74 | print("Metadata retrieval complete.") 75 | return path 76 | 77 | 78 | # Uses JSON metadata files to generate a list of recording URLs 79 | def list_urls(path): 80 | url_list = [] 81 | page = 1 82 | 83 | # Initial opening of JSON to retrieve amount of pages and recordings 84 | with open(path + '/page' + str(page) + '.json', 'r') as jsonfile: 85 | data = jsonfile.read() 86 | jsonfile.close() 87 | data = json.loads(data) 88 | page_num = data['numPages'] 89 | recordings_num = int(data['numRecordings']) 90 | 91 | # Clear may not be required if setting to None, included for redundancy 92 | data.clear() 93 | data = None 94 | 95 | # Set the first element to the number of recordings 96 | url_list.append(recordings_num) 97 | 98 | # Second element will be a list of tuples with (name, track_id, file url) 99 | url_list.append(list()) 100 | 101 | # Read each metadata file and extract information into list as a tuple 102 | while page < page_num + 1: 103 | with open(path + '/page' + str(page) + '.json', 'r') as jsonfile: 104 | data = jsonfile.read() 105 | jsonfile.close() 106 | data = json.loads(data) 107 | 108 | # Extract the number of recordings in the opened metadata file 109 | rec_length = len(data['recordings']) 110 | 111 | # Parse through the opened data and add it to the URL list 112 | for i in range(0, rec_length): 113 | name = (data['recordings'][i]['en']).replace(' ', '') 114 | track_id = data['recordings'][i]['id'] 115 | track_url = data['recordings'][i]['file'] 116 | track_format = os.path.splitext(data['recordings'][i]['file-name'])[-1] 117 | track_info = (name, track_id, track_url, track_format) 118 | url_list[1].append(track_info) 119 | page += 1 120 | return url_list 121 | 122 | 123 | # Client that processes the list of track information concurrently 124 | def chunked_http_client(num_chunks): 125 | 126 | # Semaphore used to limit the number of requests with num_chunks 127 | semaphore = asyncio.Semaphore(num_chunks) 128 | 129 | # Processes a tuple from the url_list using the aiohttp client_session 130 | async def http_get(track_tuple, client_session): 131 | 132 | # Work with semaphore located outside the function 133 | nonlocal semaphore 134 | async with semaphore: 135 | 136 | # Pull relevant info from tuple 137 | name = str(track_tuple[0]) 138 | track_id = str(track_tuple[1]) 139 | url = track_tuple[2] 140 | track_format = track_tuple[3] 141 | 142 | # Set up the paths required for saving the audio file 143 | folder_path = 'dataset/audio/' + name + '/' 144 | file_path = folder_path + track_id + track_format 145 | 146 | # Create an audio folder for the species if it does not exist 147 | if not os.path.exists(folder_path): 148 | print("Creating recording folder at " + str(folder_path)) 149 | os.makedirs(folder_path) 150 | 151 | # If the file exists in the directory, we will skip it 152 | if os.path.exists(file_path): 153 | print(track_id + track_format + " is already present. 
Skipping...") 154 | return 155 | 156 | # Use the aiohttp client to retrieve the audio file asynchronously 157 | async with client_session.get(url) as response: 158 | if response.status == 200: 159 | f = await aiofiles.open((file_path), mode='wb') 160 | await f.write(await response.content.read()) 161 | await f.close() 162 | elif response.status == 503: 163 | print("Error 503 occurred when downloading " + track_id 164 | + track_format + ". Please try using a lower value for " 165 | "num_chunks. Consult the README for more " 166 | "information.") 167 | else: 168 | print("Error " + str(response.status) + " occurred " 169 | "when downloading " + track_id + track_format + ".") 170 | 171 | return http_get 172 | 173 | 174 | # Retrieves metadata and recordings for a given set of input param 175 | async def download(filt, num_chunks=4): 176 | 177 | # Retrieve metadata and generate list of track information 178 | meta_path = metadata(filt) 179 | url_list = list_urls(meta_path) 180 | 181 | # Retrieve the number of recordings to be downloaded 182 | recordings_num = url_list[0] 183 | 184 | # Exit the program if no recordings are found 185 | if (recordings_num == 0): 186 | print("No recordings found for the provided request.") 187 | quit() 188 | 189 | print(str(recordings_num) + " recordings found, downloading...") 190 | 191 | # Setup the aiohttp client with the desired semaphore limit 192 | http_client = chunked_http_client(num_chunks) 193 | async with aiohttp.ClientSession() as client_session: 194 | 195 | # Collect tasks and await futures to ensure concurrent processing 196 | tasks = [http_client(track_tuple, client_session) for track_tuple in 197 | url_list[1]] 198 | for future in asyncio.as_completed(tasks): 199 | data = await future 200 | print("Download complete.") 201 | 202 | 203 | # Retrieve all files while ignoring those that are hidden 204 | def listdir_nohidden(path): 205 | for f in os.listdir(path): 206 | if not f.startswith('.'): 207 | yield f 208 | 209 | 210 | # Removes audio folders containing num or less than num files 211 | def purge(num): 212 | print("Removing all audio folders with fewer than " + str(num) + 213 | " recordings.") 214 | path = 'dataset/audio/' 215 | dirs = listdir_nohidden(path) 216 | remove_count = 0 217 | 218 | # Count the number of tracks in each folder 219 | for fold in dirs: 220 | fold_path = path + fold 221 | count = sum(1 for _ in listdir_nohidden(fold_path)) 222 | 223 | # Remove the folder if the track amount is less than input 224 | if count < num: 225 | print("Deleting " + fold_path + " since <" + str(num) + " tracks.") 226 | shutil.rmtree(fold_path) 227 | remove_count = remove_count + 1 228 | print(str(remove_count) + " folders removed.") 229 | 230 | 231 | # Deletes audio tracks based on provided parameters 232 | def delete(filt): 233 | 234 | # Generating list of current tracks with metadata 235 | gen_meta() 236 | 237 | # Separating desired tags from values for parsing 238 | tags = list() 239 | vals = list() 240 | for f in filt: 241 | tag = f.split(':')[0] 242 | tags.append(tag) 243 | 244 | val = f.split(':')[1] 245 | if tag == 'en': 246 | val = val.replace('_', ' ') 247 | vals.append(val) 248 | 249 | with open('dataset/metadata/library.json', 'r') as jsonfile: 250 | data = jsonfile.read() 251 | data = json.loads(data) 252 | 253 | # Creating a set of track id's to delete 254 | track_del = set() 255 | for i in range(int(data['recordingNumber'])): 256 | for j in range(len(tags)): 257 | if data['tracks'][i][str(tags[j])] == str(vals[j]): 258 | 
track_del.add(int(data['tracks'][i]['id'])) 259 | 260 | # Proposed change for deletion of tracks matching all inputs 261 | # rather than any 262 | # 263 | # if data['tracks'][i][str(tags[j])] != str(vals[j]): 264 | # exit this for loop 265 | # track_del.add(int(data['tracks'][i]['id'])) 266 | 267 | print(str(len(track_del)) + " tracks have been identified to be deleted.") 268 | 269 | # Checking audio folders for tracks to delete 270 | path = 'dataset/audio/' 271 | dirs = listdir_nohidden(path) 272 | removed = 0 273 | for fold in dirs: 274 | fold_path = path + fold 275 | tracks = listdir_nohidden(fold_path) 276 | for tr in tracks: 277 | if int(tr.split('.')[0]) in track_del: 278 | os.remove(fold_path + '/' + str(tr)) 279 | removed = removed + 1 280 | 281 | print(str(removed) + " tracks deleted!") 282 | 283 | # Removing any empty folders 284 | purge(1) 285 | 286 | 287 | # Generate a metadata file for given library path 288 | def gen_meta(path='dataset/audio/'): 289 | 290 | # Checking to see if the path exists 291 | if not os.path.exists(path): 292 | print("Path " + str(path) + " does not exist.") 293 | return 294 | print("Generating metadata file for current recording library...") 295 | 296 | # Removing old library file if exists 297 | if os.path.exists('dataset/metadata/library.json'): 298 | os.remove('dataset/metadata/library.json') 299 | 300 | # Create a list of track ID's contained in the current library 301 | id_list = set() 302 | 303 | for fold in listdir_nohidden(path): 304 | filenames = listdir_nohidden(path + fold) 305 | for f in filenames: 306 | track_id = (f.split('.')) 307 | id_list.add(track_id[0]) 308 | 309 | count = len(id_list) 310 | print(str(count) + " recordings have been found. Collecting metadata...") 311 | 312 | write_data = dict() 313 | write_data['recordingNumber'] = str(count) 314 | write_data['tracks'] = list() 315 | 316 | # Create a list of all metadata files 317 | meta_files = list() 318 | if os.path.exists('dataset/metadata/'): 319 | for filename in listdir_nohidden('dataset/metadata/'): 320 | if filename != 'library.json': 321 | meta_files.append(filename) 322 | 323 | # Check each metadata track for presence in library 324 | found_files = set() 325 | for f in meta_files: 326 | page_num = 1 327 | page = 1 328 | 329 | while page < page_num + 1: 330 | 331 | # Open the json 332 | with open('dataset/metadata/' + f + '/page' + str(page) + 333 | '.json', 'r') as jsonfile: 334 | data = jsonfile.read() 335 | data = json.loads(data) 336 | page_num = data['numPages'] 337 | 338 | # Parse through each track 339 | for i in range(len(data['recordings'])): 340 | track = data['recordings'][i]['id'] 341 | if track in id_list: 342 | track_info = data['recordings'][i] 343 | write_data['tracks'].append(track_info) 344 | page += 1 345 | 346 | # Retrieves information from API for tracks that cannot be found in the 347 | # currently saved metadata 348 | found_files = list() 349 | for i in range(len(write_data['tracks'])): 350 | found_files.append(write_data['tracks'][i]['id']) 351 | 352 | not_found = list(set(id_list) - set(found_files)) 353 | if not_found: 354 | print(str(len(not_found)) + " recordings must have their " 355 | " metadata downloaded.") 356 | 357 | # Retrieves metadata for each of the recordings individually 358 | for i in not_found: 359 | track_find = 'nr:' + i 360 | path = metadata([track_find]) 361 | with open(path + '/page1.json') as jsonfile: 362 | data = jsonfile.read() 363 | data = json.loads(data) 364 | write_data['tracks'].append(data['recordings'][0]) 365 | 366 
|     with open('data.txt', 'w') as outfile:
367 |         json.dump(write_data, outfile)
368 | 
369 |     os.rename('data.txt', 'dataset/metadata/library.json')
370 |     print("Metadata successfully generated at dataset/metadata/library.json")
371 | 
372 | 
373 | # Accepts command line input to determine function to execute
374 | def main():
375 |     if len(sys.argv) == 1:
376 |         print("No command given. Please consult the README for help.")
377 |         return
378 | 
379 |     if len(sys.argv) == 2 and sys.argv[1] != "-g":  # a bare "-g" is valid
380 |         print("Commands must be given in a '-command parameter' format. "
381 |               "Please consult the README for help.")
382 |         return
383 |     act = sys.argv[1]
384 |     params = sys.argv[2:]
385 | 
386 |     # Retrieve metadata
387 |     if act == "-m":
388 |         metadata(params)
389 | 
390 |     # Download recordings
391 |     elif act == "-dl":
392 |         start = time.time()
393 |         asyncio.run(download(params))
394 |         end = time.time()
395 |         print("Duration: " + str(int(end - start)) + "s")
396 | 
397 |     # Purge folders
398 |     elif act == "-p":
399 |         purge(int(params[0]))
400 | 
401 |     # Generate library metadata
402 |     elif act == "-g":
403 |         if len(params) == 1:
404 |             gen_meta(params[0])
405 |         else:
406 |             gen_meta()
407 | 
408 |     # Delete recordings matching ANY input parameter
409 |     elif act == '-del':
410 |         dec = input("Proceed with deleting? (Y or N)\n")
411 |         if dec == "Y":
412 |             delete(params)
413 | 
414 |     else:
415 |         print("Command not found, please consult the README.")
416 | 
417 | 
418 | # Handles command line execution
419 | if __name__ == '__main__':
420 |     main()
421 | 
--------------------------------------------------------------------------------