├── MANIFEST.in ├── .gitignore ├── setup.py ├── LICENSE ├── CHANGES.txt ├── test.py ├── README.md └── xenocanto.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include LICENSE 3 | include test.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /__pycache__ 2 | /build 3 | /dataset 4 | /dist 5 | /.vscode 6 | /venv 7 | **.pyc 8 | *.egg-info 9 | .DS_Store 10 | requirements.txt 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='xeno-canto', 5 | version='3.0', 6 | python_requires='>3.4.0', 7 | description='xeno-canto.org API Wrapper', 8 | long_description=open('README.md').read(), 9 | long_description_content_type='text/markdown', 10 | url='https://github.com/ntivirikin/xeno-canto-py', 11 | author='Nazariy Tivirikin', 12 | author_email='n.tivirikin@gmail.com', 13 | license="MIT", 14 | py_modules=['xenocanto'], 15 | entry_points={"console_scripts": ["xeno-canto = xenocanto:main"]}, 16 | install_requires=[ 17 | 'aiofiles>=0.8.0', 18 | 'aiohttp>=3.8.1', 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2022 Nazariy Tivirikin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 2 | v1.0 March 12, 2019 -- Initial release. 3 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 4 | v1.0.1 March 13, 2019 -- Refactor to PEP8 standards, add comments. 5 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 6 | v1.0.2 March 18, 2019 -- Update package name in README. 
7 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 8 | v1.1 March 30, 2019 -- Added metadata generation. 9 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 10 | v2.0 October 21, 2019 -- Added purge, delete, and execution via command line arguments. 11 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 12 | v2.0.1 March 15, 2020 -- Installed command-line tool from pull request. 13 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 14 | v2.0.2 Oct 26, 2021 -- Added console messages. 15 | -- Added confirmation message when running delete. 16 | -- Fixed recordings not being completely downloaded if the download was canceled and executed with the same query. 17 | -- Fixed SSL certificate error. 18 | -- Fixed using double quote operator in arguments. 19 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 20 | v2.0.3 Jul 11, 2022 -- Decreased number of console messages when downloading. 21 | -- Removed SSL certificate mitigation, may need workaround on some PCs. 22 | -- Modified test cases to include more assertions. 23 | -- Fixed download error due to switch to HTTPS. 24 | -- Fixed print error with concatenation of str and int in purge function. 25 | -- Fixed expected naming scheme in assertion of test cases. 26 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 27 | v2.0.4 Jul 18, 2022 -- Added method for generating links from metadata in preparation for async implementation. 28 | -- Fixed issue where 1st page of recordings was continuosly redownloaded if more than one page was present. 29 | -- Fixed typo in metadata console messages. 30 | ----------------------------------------------------------------------------------------------------------------------------------------------------- 31 | v3.0.0 Aug 9, 2022 -- Added asynchronous downloading of recordings. 32 | -- Separated download function into two for easier async implementation. 33 | -- Removed "in_progress.txt" functionality to work with async. Be careful with stopping a download as this will corrupt files. 34 | -- Modified tests to include async functionality. 35 | -- Updated README. 36 | -- Added further comments and console messages explaining code functionality. 37 | -- Added checking for existence of directory to prevent errors. 
38 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import shutil 4 | import unittest 5 | from urllib import request 6 | from xenocanto import metadata, download, gen_meta, purge, delete 7 | 8 | 9 | # TODO: 10 | # [ ] Test resuming a download after interrupt 11 | class TestCases(unittest.TestCase): 12 | 13 | # Check if connection to the API can be established 14 | def test_conn(self): 15 | url = 'https://www.xeno-canto.org/api/2/recordings?query=cnt:any' 16 | status = request.urlopen(url).getcode() 17 | self.assertEqual(status, 200) 18 | 19 | # Checks if metadata is successfully downloaded into the expected 20 | # folder structure 21 | def test_metadata(self): 22 | metadata(['Bearded Bellbird', 'q:A']) 23 | self.assertTrue(os.path.exists 24 | ('dataset/metadata/BeardedBellbirdq_A/page1.json')) 25 | 26 | # Checks if audio files are downloaded into the correct directory 27 | def test_download(self): 28 | asyncio.run(download(['gen:Otis'])) 29 | self.assertTrue(os.path.exists('dataset/metadata/gen_Otis/page1.json')) 30 | self.assertTrue(os.path.exists('dataset/audio/GreatBustard/' 31 | '459281.mp3')) 32 | 33 | # Check if purge is deleting folders based on file count 34 | def test_purge(self): 35 | asyncio.run(download(['gen:Otis'])) 36 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 37 | purge(7) 38 | self.assertFalse(os.path.exists('dataset/audio/GreatBustard/')) 39 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/')) 40 | 41 | # Check if metadata is being correctly generated for one 42 | # recording with metadata already saved 43 | def test_gen_meta_with_extra_metadata(self): 44 | metadata(['gen:Otis']) 45 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 46 | gen_meta() 47 | self.assertTrue(os.path.exists('dataset/metadata/library.json')) 48 | 49 | # Check if deleting files using multiple filters 50 | def test_delete(self): 51 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 52 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 53 | '493159.mp3')) 54 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 55 | '427845.mp3')) 56 | delete(['id:493159', 'id:427845']) 57 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 58 | '493159.mp3')) 59 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 60 | '427845.mp3')) 61 | 62 | # Check if deleting files from multiple folders 63 | def test_delete_multiple_species(self): 64 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 65 | self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/' 66 | '493159.mp3')) 67 | asyncio.run(download(['gen:Otis'])) 68 | self.assertTrue(os.path.exists('dataset/audio/GreatBustard/')) 69 | delete(['id:493159', 'gen:Otis']) 70 | self.assertFalse(os.path.exists('dataset/audio/BeardedBellbird/' 71 | '493159.mp3')) 72 | self.assertFalse(os.path.exists('dataset/audio/GreatBustard/')) 73 | 74 | # Check if metadata is being correctly generated when some metadata 75 | # is saved and some must be retrieved from an API call 76 | def test_gen_meta_with_extra_tracks(self): 77 | path = metadata(['gen:Otis']) 78 | asyncio.run(download(['gen:Otis'])) 79 | asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil'])) 80 | shutil.rmtree(path) 81 | gen_meta() 82 | self.assertTrue(os.path.exists('dataset/metadata/library.json')) 83 | 
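    # Example added for illustration (not part of the original suite): checks
    # the purge() threshold. purge(num) removes only folders holding strictly
    # fewer than num recordings, so a folder containing exactly num files
    # should survive the purge.
    def test_purge_keeps_folder_at_threshold(self):
        asyncio.run(download(['Bearded Bellbird', 'q:A', 'cnt:Brazil']))
        count = len([f for f in os.listdir('dataset/audio/BeardedBellbird/')
                     if not f.startswith('.')])
        purge(count)
        self.assertTrue(os.path.exists('dataset/audio/BeardedBellbird/'))
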
84 |     # Removes files used in testing
85 |     def tearDown(self):
86 |         try:
87 |             shutil.rmtree('dataset/')
88 |         except OSError:
89 |             pass
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # xeno-canto API Wrapper
2 | xeno-canto-py is an API wrapper designed to help users efficiently download xeno-canto.org recordings and associated information. Download requests are processed concurrently using the `asyncio`, `aiohttp`, and `aiofiles` libraries to optimize retrieval time. The wrapper also offers delete and metadata generation functions for recording library management.
3 | 
4 | Created to aid in data collection and filtering for the training of machine learning models.
5 | ## Installation
6 | xeno-canto-py is available on [PyPI](https://pypi.org/project/xeno-canto/) and can be installed with the package manager [pip](https://pip.pypa.io/en/stable/):
7 | ```bash
8 | pip install xeno-canto
9 | ```
10 | The package can then be used straight from the command line:
11 | ```bash
12 | xeno-canto -dl Bearded Bellbird
13 | ```
14 | Or imported into an existing Python project:
15 | ```python
16 | import xenocanto
17 | ```
18 | If you want more control over the wrapper, navigate to your desired file location in a terminal window and clone the repository with the following command:
19 | ```bash
20 | git clone https://github.com/ntivirikin/xeno-canto-py
21 | ```
22 | The only file required for operation is `xenocanto.py`, so feel free to remove the others or move `xenocanto.py` to another working directory.
23 | 
24 | **WARNING:** Please exercise caution when using `test.py`, as executing the tests via `unittest` or another test harness will delete any `dataset` folder in the working directory once the tests complete.
25 | ## Usage
26 | The xeno-canto-py wrapper supports the retrieval of metadata and audio from the xeno-canto database, as well as library management functions such as deletion of recordings matching input tags, removal of folders with an insufficient number of audio recordings, and generation of a single JSON metadata file for a given path containing xeno-canto audio recordings. Examples of command usage are given below.
27 | 
28 | ---
29 | **Metadata Download**
30 | `xeno-canto -m [parameters]`
31 | 
32 | Downloads metadata as a series of JSON files and returns the path to the metadata folder.
33 | 
34 | _Example: Metadata retrieval for Bearded Bellbird recordings of quality A_
35 | 
36 | `xeno-canto -m Bearded Bellbird q:A`
37 | 
38 | ---
39 | **Audio Recording Download**
40 | `xeno-canto -dl [parameters]`
41 | 
42 | Retrieves the metadata for the request and uses it to download audio recordings as MP3s from the database.
43 | 
44 | _Example: Download Bearded Bellbird recordings from the country of Brazil_
45 | 
46 | `xeno-canto -dl Bearded Bellbird cnt:Brazil`
47 | 
48 | ---
49 | **Delete Recordings**
50 | `xeno-canto -del [parameters]`
51 | 
52 | Deletes recordings matching **ANY** of the parameters given as input.
53 | 
54 | _Example: Delete **ALL** quality D recordings and **ALL** recordings from Brazil_
55 | 
56 | `xeno-canto -del q:D cnt:Brazil`
57 | 
58 | ---
59 | **Purge Folders**
60 | 
61 | Removes any folders within the `dataset/audio/` directory that have fewer recordings than the input value `num`.
62 | 
63 | `xeno-canto -p [num]`
64 | 
65 | _Example: Remove recording folders with fewer than 10 recordings (folders with exactly 10 are kept)_
66 | 
67 | `xeno-canto -p 10`
68 | 
69 | ---
70 | **Generate Metadata**
71 | 
72 | Generates metadata for the xeno-canto database recordings at the input path, defaulting to `dataset/audio/` within the working directory if none is given.
73 | 
74 | `xeno-canto -g [path]`
75 | 
76 | _Example: Generate metadata for the recordings located in `bird_rec/audio/` within the working directory_
77 | 
78 | `xeno-canto -g bird_rec/audio/`
79 | 
80 | ---
81 | `parameters` are given in tag:value form in accordance with the API search guidelines. For help in building search terms, consult the [xeno-canto API guide](https://xeno-canto.org/explore/api) and this [article](https://xeno-canto.org/article/153). The only exception is when providing English bird names as an argument to the delete function: these must be preceded by `en:` and have all spaces replaced with underscores, e.g. `xeno-canto -del en:Bearded_Bellbird`.
82 | ### Directory Structure
83 | Files are saved in the working directory under the folder `dataset/`. Metadata and audio recordings are separated into `metadata/` and `audio/` folders by request information and bird species respectively (species folder names have their spaces removed). For example:
84 | ```
85 | dataset/
86 |   - audio/
87 |     - IndigoBunting/
88 |       - 14325.mp3
89 |     - NorthernCardinal/
90 |       - 8273.mp3
91 |   - metadata/
92 |     - library.json
93 |     - IndigoBuntingcnt_Canada/
94 |       - page1.json
95 |     - NorthernCardinalq_A/
96 |       - page1.json
97 | ```
98 | Metadata is retrieved as a series of JSON files containing information on each of the audio recordings matching the request parameters provided as input. The metadata also contains the download links used to retrieve the audio recordings. The `library.json` file is generated by running the metadata generation command `-g`.
99 | ### Error 503
100 | If an Error 503 is returned when attempting a recording download, try passing a value lower than 4 as `num_chunks` in `download(filt, num_chunks)`. This can be done either by changing the default value in the function definition, or by passing a value into `download(params)` in the body of `main()` as shown below.
101 | ```python
102 | # Running with default 4 locks on semaphore
103 | asyncio.run(download(params))
104 | 
105 | # Running with 3 locks rather than default
106 | asyncio.run(download(params, 3))
107 | ```
108 | Alternatively, you can experiment with higher values for `num_chunks` to see whether download performance improves.
109 | ## Contributing
110 | All pull requests are welcome! If any issues are found, please do not hesitate to bring them to my attention.
111 | ## Acknowledgements
112 | Thank you to the team at xeno-canto.org and all its contributors for putting together such an amazing database.
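## Python Usage
The command-line operations described in the Usage section map directly onto functions in `xenocanto.py` (`metadata`, `download`, `purge`, `delete`, and `gen_meta`), so the wrapper can also be driven from an existing Python project. Below is a minimal sketch of the equivalent calls using the same example queries as above; note that `download` is a coroutine and must be run with `asyncio`.
```python
import asyncio
from xenocanto import metadata, download, purge, gen_meta, delete

# Retrieve metadata only; returns the path to the folder of JSON pages
meta_path = metadata(['Bearded Bellbird', 'q:A'])

# Download recordings (the metadata step runs internally); download() is
# asynchronous, so it is executed through asyncio.run()
asyncio.run(download(['Bearded Bellbird', 'cnt:Brazil']))

# Lower the semaphore limit from the default of 4 if the server returns 503s
asyncio.run(download(['gen:Otis'], 3))

# Remove species folders holding fewer than 10 recordings, then rebuild
# the library.json metadata file for the recordings that remain
purge(10)
gen_meta()

# Delete recordings matching ANY of the given tags
delete(['q:D', 'cnt:Brazil'])
```
All of these calls read and write under `dataset/` in the current working directory, exactly as the command-line tool does.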
113 | ## License 114 | [MIT](https://choosealicense.com/licenses/mit/) 115 | -------------------------------------------------------------------------------- /xenocanto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import asyncio 4 | import json 5 | import os 6 | import shutil 7 | import sys 8 | import time 9 | from urllib import request, error 10 | 11 | import aiofiles 12 | import aiohttp 13 | 14 | 15 | # TODO: 16 | # [X] Log messages to console 17 | # [ ] Add sono image download capabilities 18 | # [ ] Add ability to process multiple species in one command 19 | # [ ] Create function to verify all recordings downloaded correctly 20 | # [ ] Purge recordings that did not complete download 21 | # [ ] Add text file processing for batch requests 22 | # [ ] Display tables of tags collected 23 | # 24 | # FIXME: 25 | # [ ] Modify delete method to remove recordings containing all input tags 26 | # rather than any one of the tags 27 | # [ ] Allow the delete method to accept species names with spaces 28 | 29 | 30 | # Retrieves metadata for requested recordings in the form of a JSON file 31 | def metadata(filt): 32 | page = 1 33 | page_num = 1 34 | filt_path = list() 35 | filt_url = list() 36 | print("Retrieving metadata...") 37 | 38 | # Scrubbing input for file name and url 39 | for f in filt: 40 | filt_url.append(f.replace(' ', '%20')) 41 | filt_path.append((f.replace(' ', '')).replace(':', '_') 42 | .replace('\"', '')) 43 | 44 | path = 'dataset/metadata/' + ''.join(filt_path) 45 | 46 | # Create a metadata folder if it does not exist already 47 | if not os.path.exists(path): 48 | os.makedirs(path) 49 | 50 | # Input parameters are separated by %20 for use in URL 51 | query = ('%20'.join(filt_url)) 52 | 53 | # Save all pages of the JSON response 54 | while page < (page_num + 1): 55 | url = ('https://www.xeno-canto.org/api/2/recordings?' 
56 | 'query={0}&page={1}'.format(query, page)) 57 | try: 58 | r = request.urlopen(url) 59 | except error.HTTPError as e: 60 | print("An error has occurred: " + str(e)) 61 | exit() 62 | print("Downloading metadata page " + str(page) + "...") 63 | data = json.loads(r.read().decode('UTF-8')) 64 | filename = path + '/page' + str(page) + '.json' 65 | with open(filename, 'w') as saved: 66 | json.dump(data, saved) 67 | page_num = data['numPages'] 68 | page += 1 69 | 70 | # Rate limit of one request per second 71 | time.sleep(1) 72 | 73 | # Return the path to the folder containing downloaded metadata 74 | print("Metadata retrieval complete.") 75 | return path 76 | 77 | 78 | # Uses JSON metadata files to generate a list of recording URLs 79 | def list_urls(path): 80 | url_list = [] 81 | page = 1 82 | 83 | # Initial opening of JSON to retrieve amount of pages and recordings 84 | with open(path + '/page' + str(page) + '.json', 'r') as jsonfile: 85 | data = jsonfile.read() 86 | jsonfile.close() 87 | data = json.loads(data) 88 | page_num = data['numPages'] 89 | recordings_num = int(data['numRecordings']) 90 | 91 | # Clear may not be required if setting to None, included for redundancy 92 | data.clear() 93 | data = None 94 | 95 | # Set the first element to the number of recordings 96 | url_list.append(recordings_num) 97 | 98 | # Second element will be a list of tuples with (name, track_id, file url) 99 | url_list.append(list()) 100 | 101 | # Read each metadata file and extract information into list as a tuple 102 | while page < page_num + 1: 103 | with open(path + '/page' + str(page) + '.json', 'r') as jsonfile: 104 | data = jsonfile.read() 105 | jsonfile.close() 106 | data = json.loads(data) 107 | 108 | # Extract the number of recordings in the opened metadata file 109 | rec_length = len(data['recordings']) 110 | 111 | # Parse through the opened data and add it to the URL list 112 | for i in range(0, rec_length): 113 | name = (data['recordings'][i]['en']).replace(' ', '') 114 | track_id = data['recordings'][i]['id'] 115 | track_url = data['recordings'][i]['file'] 116 | track_format = os.path.splitext(data['recordings'][i]['file-name'])[-1] 117 | track_info = (name, track_id, track_url, track_format) 118 | url_list[1].append(track_info) 119 | page += 1 120 | return url_list 121 | 122 | 123 | # Client that processes the list of track information concurrently 124 | def chunked_http_client(num_chunks): 125 | 126 | # Semaphore used to limit the number of requests with num_chunks 127 | semaphore = asyncio.Semaphore(num_chunks) 128 | 129 | # Processes a tuple from the url_list using the aiohttp client_session 130 | async def http_get(track_tuple, client_session): 131 | 132 | # Work with semaphore located outside the function 133 | nonlocal semaphore 134 | async with semaphore: 135 | 136 | # Pull relevant info from tuple 137 | name = str(track_tuple[0]) 138 | track_id = str(track_tuple[1]) 139 | url = track_tuple[2] 140 | track_format = track_tuple[3] 141 | 142 | # Set up the paths required for saving the audio file 143 | folder_path = 'dataset/audio/' + name + '/' 144 | file_path = folder_path + track_id + track_format 145 | 146 | # Create an audio folder for the species if it does not exist 147 | if not os.path.exists(folder_path): 148 | print("Creating recording folder at " + str(folder_path)) 149 | os.makedirs(folder_path) 150 | 151 | # If the file exists in the directory, we will skip it 152 | if os.path.exists(file_path): 153 | print(track_id + track_format + " is already present. 
Skipping...") 154 | return 155 | 156 | # Use the aiohttp client to retrieve the audio file asynchronously 157 | async with client_session.get(url) as response: 158 | if response.status == 200: 159 | f = await aiofiles.open((file_path), mode='wb') 160 | await f.write(await response.content.read()) 161 | await f.close() 162 | elif response.status == 503: 163 | print("Error 503 occurred when downloading " + track_id 164 | + track_format + ". Please try using a lower value for " 165 | "num_chunks. Consult the README for more " 166 | "information.") 167 | else: 168 | print("Error " + str(response.status) + " occurred " 169 | "when downloading " + track_id + track_format + ".") 170 | 171 | return http_get 172 | 173 | 174 | # Retrieves metadata and recordings for a given set of input param 175 | async def download(filt, num_chunks=4): 176 | 177 | # Retrieve metadata and generate list of track information 178 | meta_path = metadata(filt) 179 | url_list = list_urls(meta_path) 180 | 181 | # Retrieve the number of recordings to be downloaded 182 | recordings_num = url_list[0] 183 | 184 | # Exit the program if no recordings are found 185 | if (recordings_num == 0): 186 | print("No recordings found for the provided request.") 187 | quit() 188 | 189 | print(str(recordings_num) + " recordings found, downloading...") 190 | 191 | # Setup the aiohttp client with the desired semaphore limit 192 | http_client = chunked_http_client(num_chunks) 193 | async with aiohttp.ClientSession() as client_session: 194 | 195 | # Collect tasks and await futures to ensure concurrent processing 196 | tasks = [http_client(track_tuple, client_session) for track_tuple in 197 | url_list[1]] 198 | for future in asyncio.as_completed(tasks): 199 | data = await future 200 | print("Download complete.") 201 | 202 | 203 | # Retrieve all files while ignoring those that are hidden 204 | def listdir_nohidden(path): 205 | for f in os.listdir(path): 206 | if not f.startswith('.'): 207 | yield f 208 | 209 | 210 | # Removes audio folders containing num or less than num files 211 | def purge(num): 212 | print("Removing all audio folders with fewer than " + str(num) + 213 | " recordings.") 214 | path = 'dataset/audio/' 215 | dirs = listdir_nohidden(path) 216 | remove_count = 0 217 | 218 | # Count the number of tracks in each folder 219 | for fold in dirs: 220 | fold_path = path + fold 221 | count = sum(1 for _ in listdir_nohidden(fold_path)) 222 | 223 | # Remove the folder if the track amount is less than input 224 | if count < num: 225 | print("Deleting " + fold_path + " since <" + str(num) + " tracks.") 226 | shutil.rmtree(fold_path) 227 | remove_count = remove_count + 1 228 | print(str(remove_count) + " folders removed.") 229 | 230 | 231 | # Deletes audio tracks based on provided parameters 232 | def delete(filt): 233 | 234 | # Generating list of current tracks with metadata 235 | gen_meta() 236 | 237 | # Separating desired tags from values for parsing 238 | tags = list() 239 | vals = list() 240 | for f in filt: 241 | tag = f.split(':')[0] 242 | tags.append(tag) 243 | 244 | val = f.split(':')[1] 245 | if tag == 'en': 246 | val = val.replace('_', ' ') 247 | vals.append(val) 248 | 249 | with open('dataset/metadata/library.json', 'r') as jsonfile: 250 | data = jsonfile.read() 251 | data = json.loads(data) 252 | 253 | # Creating a set of track id's to delete 254 | track_del = set() 255 | for i in range(int(data['recordingNumber'])): 256 | for j in range(len(tags)): 257 | if data['tracks'][i][str(tags[j])] == str(vals[j]): 258 | 
track_del.add(int(data['tracks'][i]['id'])) 259 | 260 | # Proposed change for deletion of tracks matching all inputs 261 | # rather than any 262 | # 263 | # if data['tracks'][i][str(tags[j])] != str(vals[j]): 264 | # exit this for loop 265 | # track_del.add(int(data['tracks'][i]['id'])) 266 | 267 | print(str(len(track_del)) + " tracks have been identified to be deleted.") 268 | 269 | # Checking audio folders for tracks to delete 270 | path = 'dataset/audio/' 271 | dirs = listdir_nohidden(path) 272 | removed = 0 273 | for fold in dirs: 274 | fold_path = path + fold 275 | tracks = listdir_nohidden(fold_path) 276 | for tr in tracks: 277 | if int(tr.split('.')[0]) in track_del: 278 | os.remove(fold_path + '/' + str(tr)) 279 | removed = removed + 1 280 | 281 | print(str(removed) + " tracks deleted!") 282 | 283 | # Removing any empty folders 284 | purge(1) 285 | 286 | 287 | # Generate a metadata file for given library path 288 | def gen_meta(path='dataset/audio/'): 289 | 290 | # Checking to see if the path exists 291 | if not os.path.exists(path): 292 | print("Path " + str(path) + " does not exist.") 293 | return 294 | print("Generating metadata file for current recording library...") 295 | 296 | # Removing old library file if exists 297 | if os.path.exists('dataset/metadata/library.json'): 298 | os.remove('dataset/metadata/library.json') 299 | 300 | # Create a list of track ID's contained in the current library 301 | id_list = set() 302 | 303 | for fold in listdir_nohidden(path): 304 | filenames = listdir_nohidden(path + fold) 305 | for f in filenames: 306 | track_id = (f.split('.')) 307 | id_list.add(track_id[0]) 308 | 309 | count = len(id_list) 310 | print(str(count) + " recordings have been found. Collecting metadata...") 311 | 312 | write_data = dict() 313 | write_data['recordingNumber'] = str(count) 314 | write_data['tracks'] = list() 315 | 316 | # Create a list of all metadata files 317 | meta_files = list() 318 | if os.path.exists('dataset/metadata/'): 319 | for filename in listdir_nohidden('dataset/metadata/'): 320 | if filename != 'library.json': 321 | meta_files.append(filename) 322 | 323 | # Check each metadata track for presence in library 324 | found_files = set() 325 | for f in meta_files: 326 | page_num = 1 327 | page = 1 328 | 329 | while page < page_num + 1: 330 | 331 | # Open the json 332 | with open('dataset/metadata/' + f + '/page' + str(page) + 333 | '.json', 'r') as jsonfile: 334 | data = jsonfile.read() 335 | data = json.loads(data) 336 | page_num = data['numPages'] 337 | 338 | # Parse through each track 339 | for i in range(len(data['recordings'])): 340 | track = data['recordings'][i]['id'] 341 | if track in id_list: 342 | track_info = data['recordings'][i] 343 | write_data['tracks'].append(track_info) 344 | page += 1 345 | 346 | # Retrieves information from API for tracks that cannot be found in the 347 | # currently saved metadata 348 | found_files = list() 349 | for i in range(len(write_data['tracks'])): 350 | found_files.append(write_data['tracks'][i]['id']) 351 | 352 | not_found = list(set(id_list) - set(found_files)) 353 | if not_found: 354 | print(str(len(not_found)) + " recordings must have their " 355 | " metadata downloaded.") 356 | 357 | # Retrieves metadata for each of the recordings individually 358 | for i in not_found: 359 | track_find = 'nr:' + i 360 | path = metadata([track_find]) 361 | with open(path + '/page1.json') as jsonfile: 362 | data = jsonfile.read() 363 | data = json.loads(data) 364 | write_data['tracks'].append(data['recordings'][0]) 365 | 366 
|     with open('data.txt', 'w') as outfile:
367 |         json.dump(write_data, outfile)
368 | 
369 |     os.rename('data.txt', 'dataset/metadata/library.json')
370 |     print("Metadata successfully generated at dataset/metadata/library.json")
371 | 
372 | 
373 | # Accepts command line input to determine function to execute
374 | def main():
375 |     if len(sys.argv) == 1:
376 |         print("No command given. Please consult the README for help.")
377 |         return
378 | 
379 |     if len(sys.argv) == 2 and sys.argv[1] != "-g":  # a bare "-g" is valid
380 |         print("Commands must be given in a '-command parameter' format. "
381 |               "Please consult the README for help.")
382 |         return
383 |     act = sys.argv[1]
384 |     params = sys.argv[2:]
385 | 
386 |     # Retrieve metadata
387 |     if act == "-m":
388 |         metadata(params)
389 | 
390 |     # Download recordings
391 |     elif act == "-dl":
392 |         start = time.time()
393 |         asyncio.run(download(params))
394 |         end = time.time()
395 |         print("Duration: " + str(int(end - start)) + "s")
396 | 
397 |     # Purge folders
398 |     elif act == "-p":
399 |         purge(int(params[0]))
400 | 
401 |     # Generate library metadata
402 |     elif act == "-g":
403 |         if len(params) == 1:
404 |             gen_meta(params[0])
405 |         else:
406 |             gen_meta()
407 | 
408 |     # Delete recordings matching ANY input parameter
409 |     elif act == '-del':
410 |         dec = input("Proceed with deleting? (Y or N)\n")
411 |         if dec == "Y":
412 |             delete(params)
413 | 
414 |     else:
415 |         print("Command not found, please consult the README.")
416 | 
417 | 
418 | # Handles command line execution
419 | if __name__ == '__main__':
420 |     main()
421 | 
--------------------------------------------------------------------------------