├── .gitignore
├── .pylintrc
├── .travis.yml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── bin
├── .travis_doc_url_fix
├── docker_push
├── generate_docs
├── lint
└── test
├── cleansio
├── audio
│ ├── __init__.py
│ ├── accuracy.py
│ ├── audio_file.py
│ ├── chunk_wrapper.py
│ └── convert.py
├── censor
│ ├── __init__.py
│ ├── censor.py
│ ├── censor_file.py
│ ├── censor_realtime.py
│ └── censor_realtime_mac.py
├── cleansio.py
├── data
│ ├── encoding-types
│ └── explicits-list
├── explicits
│ ├── __init__.py
│ ├── explicits.py
│ └── user_explicits.py
├── speech
│ ├── __init__.py
│ ├── timestamp.py
│ └── transcribe.py
└── utils
│ ├── __init__.py
│ ├── cleanup.py
│ ├── cli.py
│ ├── constants.py
│ ├── env.py
│ ├── files.py
│ ├── mac.py
│ └── numbers.py
├── docs
├── Makefile
├── _static
│ └── .keep
├── _templates
│ └── .keep
├── conf.py
├── index.rst
└── make.bat
├── media
├── logo.png
└── poster.png
├── requirements-dev.txt
├── requirements-mac.txt
├── requirements.txt
└── tests
├── accuracy
└── test_loudness_maximization.py
├── censor
└── test_censor.py
├── data
└── testing.wav
├── explicits
└── test_user_explicits.py
└── utils
├── numbers
├── test_gcs_time_to_ms.py
├── test_is_number.py
└── test_leading_zero.py
├── test_cleanup.py
└── test_files.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | .pytest_cache
3 | __pycache__/
4 | *.bak
5 | .cache/
6 |
7 | # Miscellaneous
8 | *.swp
9 |
10 | # Documentation
11 | docs/_build
12 | docs/*.rst
13 | !docs/index.rst
14 | docs/_static/classes.png
15 | docs/_static/packages.png
16 |
17 | # Compiled Python files
18 | *.pyc
19 |
20 | # VIM settings
21 | .vimrc
22 |
23 | # Audio files
24 | *.ac3
25 | *.adx
26 | *.aiff
27 | *.alaw
28 | *.asf
29 | *.au
30 | *.avi
31 | *.eac3
32 | *.f32be
33 | *.f32le
34 | *.f64be
35 | *.f64le
36 | *.ffmetadata
37 | *.flac
38 | *.flv
39 | *.g722
40 | *.mmf
41 | *.mp3
42 | *.mpeg
43 | *.mpegts
44 | *.mulaw
45 | *.nut
46 | *.ogg
47 | *.rm
48 | *.rso
49 | *.s16be
50 | *.s16le
51 | *.s24be
52 | *.s24le
53 | *.s32be
54 | *.s32le
55 | *.s8
56 | *.smjpeg
57 | *.sox
58 | *.spdif
59 | *.swf
60 | *.u16be
61 | *.u16le
62 | *.u24be
63 | *.u24le
64 | *.u32be
65 | *.u32le
66 | *.u8
67 | *.voc
68 | *.wav
69 |
70 | # Test audio file
71 | !tests/data/testing.wav
72 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=fixme,import-error,no-else-return,no-member,too-few-public-methods,too-many-arguments,useless-super-delegation,too-many-locals,too-many-instance-attributes,broad-except,no-self-use
3 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | sudo: required
3 | services:
4 | - docker
5 | python:
6 | - "3.5"
7 | - "3.6"
8 | install:
9 | # Needed for the UML diagrams
10 | - sudo apt-get install graphviz libportaudio2
11 | - pip install -r requirements.txt -r requirements-dev.txt -r requirements-mac.txt
12 | script:
13 | - ./bin/lint
14 | - ./bin/test
15 | - ./bin/generate_docs
16 | - ./bin/.travis_doc_url_fix
17 | deploy:
18 | - provider: script
19 | script: bash bin/docker_push
20 | on:
21 | branch: master
22 | - provider: pages
23 | github-token: $GITHUB_TOKEN
24 | keep-history: false
25 | local-dir: docs/_build/html
26 | on:
27 | branch: master
28 | skip-cleanup: true
29 | target-branch: gh-pages
30 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via issue,
4 | email, or any other method with the owners of this repository before making a change.
5 |
6 | ## Pull Request Process
7 |
8 | 1. Create an issue for what you want to contribute and create a pull request that links to that issue.
9 | 2. Update the README.md with details of changes to the interface, this includes new libraries, new environment variables, exposed ports, useful file locations and container parameters.
10 | 3. For new functionality and new libraries, create new Wiki pages that explain these.
11 | 4. Merge the Pull Request once you have the approval of another developer (preferably using **Squash and merge**). If you do not have permission to do that, you may request the reviewer to merge it for you.
12 |
13 | ## Code of Conduct
14 |
15 | ### Our Pledge
16 |
17 | In the interest of fostering an open and welcoming environment, we as
18 | contributors and maintainers pledge to making participation in our project and
19 | our community a harassment-free experience for everyone, regardless of age, body
20 | size, disability, ethnicity, gender identity and expression, level of experience,
21 | nationality, personal appearance, race, religion, or sexual identity and
22 | orientation.
23 |
24 | ### Our Standards
25 |
26 | Examples of behavior that contributes to creating a positive environment
27 | include:
28 |
29 | * Using welcoming and inclusive language
30 | * Being respectful of differing viewpoints and experiences
31 | * Gracefully accepting constructive criticism
32 | * Focusing on what is best for the community
33 | * Showing empathy towards other community members
34 |
35 | Examples of unacceptable behavior by participants include:
36 |
37 | * The use of sexualized language or imagery and unwelcome sexual attention or
38 | advances
39 | * Trolling, insulting/derogatory comments, and personal or political attacks
40 | * Public or private harassment
41 | * Publishing others' private information, such as a physical or electronic
42 | address, without explicit permission
43 | * Other conduct which could reasonably be considered inappropriate in a
44 | professional setting
45 |
46 | ### Our Responsibilities
47 |
48 | Project maintainers are responsible for clarifying the standards of acceptable
49 | behavior and are expected to take appropriate and fair corrective action in
50 | response to any instances of unacceptable behavior.
51 |
52 | Project maintainers have the right and responsibility to remove, edit, or
53 | reject comments, commits, code, wiki edits, issues, and other contributions
54 | that are not aligned to this Code of Conduct, or to ban temporarily or
55 | permanently any contributor for other behaviors that they deem inappropriate,
56 | threatening, offensive, or harmful.
57 |
58 | ### Scope
59 |
60 | This Code of Conduct applies both within project spaces and in public spaces
61 | when an individual is representing the project or its community. Examples of
62 | representing a project or community include using an official project e-mail
63 | address, posting via an official social media account, or acting as an appointed
64 | representative at an online or offline event. Representation of a project may be
65 | further defined and clarified by project maintainers.
66 |
67 | ### Enforcement
68 |
69 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
70 | reported by contacting the project team at cleanseaudio@gmail.com. All
71 | complaints will be reviewed and investigated and will result in a response that
72 | is deemed necessary and appropriate to the circumstances. The project team is
73 | obligated to maintain confidentiality with regard to the reporter of an incident.
74 | Further details of specific enforcement policies may be posted separately.
75 |
76 | Project maintainers who do not follow or enforce the Code of Conduct in good
77 | faith may face temporary or permanent repercussions as determined by other
78 | members of the project's leadership.
79 |
80 | ### Attribution
81 |
82 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
83 | available at [http://contributor-covenant.org/version/1/4][version]
84 |
85 | [homepage]: http://contributor-covenant.org
86 | [version]: http://contributor-covenant.org/version/1/4/
87 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.5
2 |
3 | USER root
4 | WORKDIR /root
5 |
6 | COPY . cleansio
7 |
8 | #===============================================================================
9 | # Install Libraries
10 | #===============================================================================
11 | RUN apt-get update && \
12 | apt-get -qqy install \
13 | # For pydub
14 | libav-tools \
15 | libavcodec-extra
16 |
17 | WORKDIR cleansio
18 |
19 | RUN pip install -r requirements.txt
20 |
21 | #===============================================================================
22 | # Set Google Speech API
23 | #===============================================================================
24 | ENV GOOGLE_APPLICATION_CREDENTIALS=/google-cloud-speech-api.json
25 |
26 | #===============================================================================
27 | # Execute cleansio
28 | #===============================================================================
29 | ENTRYPOINT ["python", "cleansio/cleansio.py"]
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Patrick Duncan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cleansio
2 |
3 | [](https://travis-ci.com/PatrickDuncan/cleansio) [](https://opensource.org/licenses/MIT) [](https://cleansio.readthedocs.io/en/latest/?badge=latest)
4 |
5 |
6 |
7 |
8 | ## Usage
9 |
10 | ```sh
11 | python cleansio/cleansio.py --help
12 | ```
13 |
14 | ### Requirements
15 |
16 | 1. Posix shell
17 | 2. Internet connection
18 |
19 | ## Setup
20 |
21 | 1. Check the Requirements
22 | 2. Clone this repo
23 | 3. Install Python 3.5+
24 | - [Anaconda is recommended](https://www.anaconda.com/download/)
25 | 4. Download your Google Cloud Credentials JSON file
26 | 5. Set the following environment variables:
27 | ```sh
28 | export GOOGLE_APPLICATION_CREDENTIALS=
29 | ```
30 | 6. Follow these additional steps:
31 | - [ffmpeg set-up](https://github.com/jiaaro/pydub#getting-ffmpeg-set-up)
32 | 7. Install Cleansio's dependencies:
33 | ```sh
34 | pip install -r requirements.txt
35 | ```
36 | 8. Follow the **[real-time setup instructions](https://github.com/PatrickDuncan/cleansio/wiki/Real-Time-Installation)**
37 | 9. _(OPTIONAL)_ If you're a developer run:
38 | ```sh
39 | pip install -r requirements-dev.txt
40 | ```
41 | 10. _(OPTIONAL)_ Install the [Google Cloud SDK](https://cloud.google.com/sdk/docs/)
42 | 11. You're all set!
43 |
44 | ## Documentation
45 |
46 | [Technical Documentation](https://patrickdduncan.com/cleansio)
47 |
48 | [Slideshow](https://patrickdduncan.com/clenasio-slideshow)
49 |
50 | **Build Locally.** Available at _docs/\_build/html/index.html_
51 | ```sh
52 | ./bin/generate_docs
53 | ```
54 |
55 | ### Linting
56 |
57 | **Run**
58 | ```sh
59 | ./bin/lint
60 | ```
61 |
62 | **Help**
63 | ```sh
64 | pylint --help-msg=
65 | ```
66 |
67 | ### Testing
68 |
69 | ```sh
70 | ./bin/test
71 | ```
72 |
73 | ## Docker
74 |
75 | **Run**
76 | ```sh
77 | docker run \
78 | --tty \
79 | --rm \
80 | --volume :/music \
81 | --volume :/google-cloud-speech-api.json \
82 | --name cleansio \
83 | patrickduncan/cleansio \
84 | /music/
85 | ```
86 |
87 | **Build**
88 | ```sh
89 | docker build --tag "cleansio:dev" .
90 | ```
91 |
92 | ## Contributors
93 |
94 |
95 | | [
Patrick D. Duncan](https://patrickduncan.co)
[💻](https://github.com/patrickduncan/cleansio/commits?author=patrickduncan) | [
Levin Noronha](https://github.com/levin-noro)
[💻](https://github.com/patrickduncan/cleansio/commits?author=levin-noro) | [
Corie Bain](https://github.com/c-bain)
[💻](https://github.com/patrickduncan/cleansio/commits?author=c-bain) | [
Victor Carri](https://github.com/VictorCarri)
[💻](https://github.com/patrickduncan/cleansio/commits?author=VictorCarri) |
96 | | :---: | :---: | :---: | :---: |
97 |
98 |
--------------------------------------------------------------------------------
/bin/.travis_doc_url_fix:
--------------------------------------------------------------------------------
#!/bin/sh

# Underscores break GitHub Pages URLs. Remove them from folder/file identifiers.
# The reverse sort lists the contents of a directory before the directory
# itself, so a path is never renamed out from under an entry still waiting to
# be processed. Reading line by line keeps paths with spaces in one piece.
find docs/_build/html -name "*_*" | sort -r | while IFS= read -r file; do
  dir=$(dirname "$file")
  mv "$file" "$dir/$(basename "$file" | tr -d _)"
done

# Remove underscores from the HTML files
# The greedy .* makes each expression strip a single underscore per line, so
# every expression is listed twice to strip up to two per attribute.
strip_href='s/\(href[ ]*=[ ]*".*\)_/\1/g'
strip_src='s/\(src[ ]*=[ ]*".*\)_/\1/g'
sed -i \
  -e "$strip_href" -e "$strip_href" \
  -e "$strip_src" -e "$strip_src" \
  docs/_build/html/*.html
15 |
--------------------------------------------------------------------------------
/bin/docker_push:
--------------------------------------------------------------------------------
#!/bin/sh

# Build the cleansio image, log in and push it.
# Reads DOCKER_USERNAME and DOCKER_PASSWORD from the environment.

# Stop at the first failing command so that a broken build or a rejected
# login is never followed by a push
set -e

docker build --tag "$DOCKER_USERNAME/cleansio:latest" .
echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
docker push "$DOCKER_USERNAME"/cleansio
6 |
--------------------------------------------------------------------------------
/bin/generate_docs:
--------------------------------------------------------------------------------
#!/bin/sh

printf '%s\n\n' 'WARNING: This will take a long time'

# Draw the class and package diagrams, then move them next to the docs
(
  cd cleansio && pyreverse -o png */*.py
)
mv cleansio/classes.png cleansio/packages.png docs/_static

# Throw away every generated .rst file (index.rst is kept) and rebuild
find docs -type f -name '*.rst' ! -name 'index.rst' -exec rm -f {} +
sphinx-apidoc -o docs cleansio
(
  cd docs && make html
)
13 |
--------------------------------------------------------------------------------
/bin/lint:
--------------------------------------------------------------------------------
#!/bin/sh

# Lint every Python file below the current directory, one pylint run per file.
# Exits non-zero when at least one file fails.
#
# find hands the paths to the inner shell as arguments, so names containing
# spaces or glob characters are never split or expanded. With "-exec ... +"
# find itself exits non-zero if any inner shell reported a failure.
find . -name "*.py" -exec sh -c '
  status=0
  for file do
    if ! pylint --load-plugins pylint_quotes "$file"; then
      status=1
    fi
  done
  exit "$status"
' sh {} +

exit $?
12 |
--------------------------------------------------------------------------------
/bin/test:
--------------------------------------------------------------------------------
#!/bin/sh

# Run the test suite with the cleansio directory on the module search path
PYTHONPATH=cleansio
export PYTHONPATH
pytest
4 |
--------------------------------------------------------------------------------
/cleansio/audio/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 |
3 | from .audio_file import AudioFile
4 | from .chunk_wrapper import ChunkWrapper
5 | from .convert import read_and_convert_audio, convert_audio_segment, convert_and_write_chunk
6 | from .accuracy import improve_accuracy
7 |
--------------------------------------------------------------------------------
/cleansio/audio/accuracy.py:
--------------------------------------------------------------------------------
1 | """ Functions which preprocess an audio chunk to improve the accuracy of speech
2 | recognition. """
3 |
def __maximize_volume(chunk):
    """ Raise the chunk's gain by the distance between its peak and 0 dBFS.

    chunk is expected to behave like a pydub AudioSegment: it exposes
    max_dBFS and supports subtracting a number of decibels.
    A chunk whose max_dBFS is negative infinity is returned unchanged,
    because subtracting -inf would ask for an infinite gain. """
    peak = chunk.max_dBFS
    if peak == float('-inf'): # Nothing to amplify
        return chunk
    # Subtracting a negative peak results in a boost
    return chunk - peak
6 |
def improve_accuracy(chunk):
    """ Run the chunk through the preprocessing steps meant to help speech
    recognition and return the processed chunk """
    louder_chunk = __maximize_volume(chunk)
    return louder_chunk
10 |
--------------------------------------------------------------------------------
/cleansio/audio/audio_file.py:
--------------------------------------------------------------------------------
1 | """ Classifies an audio file that will be broken up into chunks """
2 |
3 | import sys
4 | from textwrap import dedent
5 | from pydub import AudioSegment
6 | from colorama import Fore
7 | from utils import create_temp_dir, create_env_var, file_name_no_ext, \
8 | append_before_ext, CHUNK_LEN
9 | from .accuracy import improve_accuracy
10 | from .convert import convert
11 |
class AudioFile:
    """ Classifies an audio file and splits it into chunk files

    The constructor converts the file, records its channels and frame rate
    and fills normal_chunks and overlapping_chunks with chunk file paths.
    When the file cannot be found an error is printed and the program exits
    with status 1. """
    def __init__(self, file_path):
        try:
            self.file_path = convert(file_path)
        except FileNotFoundError:
            self.__handle_file_not_found(file_path)
        self.encoding = 'LINEAR16'
        audio_segment = AudioSegment.from_file(self.file_path)
        self.channels = audio_segment.channels
        self.frame_rate = audio_segment.frame_rate
        self.normal_chunks, self.overlapping_chunks = \
            self.__init_create_chunks(audio_segment)

    def __init_create_chunks(self, audio_segment):
        """ Breaks up the file into small chunks

        Returns the normal and the overlapping lists of chunk file paths """
        temp_dir = create_temp_dir()
        normal_chunks = []
        overlapping_chunks = []
        # Create normal and overlapping chunks
        self.__create_chunks(audio_segment, temp_dir, normal_chunks, False)
        self.__create_chunks(audio_segment, temp_dir, overlapping_chunks, True)
        # Add the list of chunk filepaths to an ENV variable for post cleanup
        create_env_var(
            'CLEANSIO_CHUNKS_LIST', str(normal_chunks + overlapping_chunks))
        return normal_chunks, overlapping_chunks

    def __create_chunks(self, audio_segment, temp_dir, chunks_arr, overlapping):
        """ Writes every CHUNK_LEN long slice to disk and appends the written
        paths to chunks_arr. Overlapping slices start 2500 into the audio. """
        start = 2500 if overlapping else 0
        chunks = audio_segment[start::CHUNK_LEN]
        for index, chunk in enumerate(chunks):
            chunk_path = self.__create_chunk(
                index, chunk, 'wav', temp_dir, overlapping)
            chunks_arr.append(chunk_path)
        # Fix for when the last chunk isn't long enough to support overlapping
        self.__last_overlapping_chunk(
            audio_segment, temp_dir, chunks_arr, overlapping)

    def __create_chunk(self, index, chunk, extension, temp_dir, overlapping):
        """ Writes the chunk to disk and returns the path callers should use

        An '-accuracy' copy, filtered through improve_accuracy, is always
        written. Overlapping chunks return that copy's path; normal chunks
        also get an unmodified file and return its path instead. """
        file_name = file_name_no_ext(self.file_path)
        file_path = temp_dir + file_name + '-' + str(index) + '.' + extension
        if overlapping:
            file_path = append_before_ext(file_path, '-overlapping')
        # Chunk that will be modified for accuracy's sake
        accuracy_path = append_before_ext(file_path, '-accuracy')
        with open(accuracy_path, 'wb') as chunk_file:
            improve_accuracy(chunk).export(chunk_file, format=extension)
        if overlapping: # The normal overlapping chunk is not needed
            return accuracy_path
        # Chunk that will be censored and preserve audio quality
        with open(file_path, 'wb') as chunk_file:
            chunk.export(chunk_file, format=extension)
        return file_path

    def __last_overlapping_chunk(
            self, audio_segment, temp_dir, chunks_arr, overlapping):
        """ Check if the chunk is long enough to support overlapping

        When it is not, a silent chunk is written for the last index and
        its path is appended to chunks_arr. """
        if overlapping and len(audio_segment) % CHUNK_LEN < 4000:
            chunk_path = self.__create_chunk(
                len(audio_segment) // CHUNK_LEN, # Last index
                AudioSegment.silent(frame_rate=44100), # Silent chunk
                'wav', temp_dir, overlapping)
            chunks_arr.append(chunk_path)

    @classmethod
    def __handle_file_not_found(cls, file_path):
        """ Prints which file is missing and stops the program """
        print(dedent('''\
            {0}Audio file '{1}{2}{0}' could not be found
            Make sure the audio file path is correct.\
        '''.format(Fore.RED, Fore.YELLOW, str(file_path))))
        # A non-zero status lets the calling shell see that censoring failed
        sys.exit(1)
84 |
--------------------------------------------------------------------------------
/cleansio/audio/chunk_wrapper.py:
--------------------------------------------------------------------------------
1 | """ Wrapper for pydub's AudioSegment, used to add new properties """
2 |
class ChunkWrapper():
    """ Pairs a pydub AudioSegment with extra censoring state """

    def __init__(self, audio_segment, mute_next_chunk=0):
        super().__init__()
        # Both values are stored under different names than the parameters
        self.mute_next_start = mute_next_chunk
        self.segment = audio_segment
10 |
--------------------------------------------------------------------------------
/cleansio/audio/convert.py:
--------------------------------------------------------------------------------
1 | """ Converts audio properties """
2 |
3 | import os
4 | from pydub import AudioSegment
5 | from utils import create_temp_dir, time_filename
6 |
def __sample_rate(audio_segment):
    """ GCS needs 16 kHz or more: lower frame rates are raised to 16000, all
    other frame rates are returned untouched. """
    minimum_rate = 16000
    current_rate = audio_segment.frame_rate
    if current_rate < minimum_rate:
        return minimum_rate
    return current_rate
11 |
def __create_converted_file(file_path, encoding):
    """ Export a mono, 16 bit (sample width 2) copy of the file, as LINEAR16
    requires, to the path stored in CLEANSIO_TEMP_FILE """
    original = AudioSegment.from_file(file_path)
    target_rate = __sample_rate(original)
    mono = original.set_channels(1)
    sixteen_bit = mono.set_sample_width(2)
    resampled = sixteen_bit.set_frame_rate(target_rate)
    resampled.export(os.environ['CLEANSIO_TEMP_FILE'], format=encoding)
20 |
def convert(file_path, encoding='wav'):
    """ Writes a converted copy of the audio file into the temp directory
    and returns the copy's path, which is also kept in CLEANSIO_TEMP_FILE """
    temp_dir = create_temp_dir()
    converted_path = '{}{}.{}'.format(temp_dir, time_filename(), encoding)
    os.environ['CLEANSIO_TEMP_FILE'] = converted_path
    __create_converted_file(file_path, encoding)
    return os.environ['CLEANSIO_TEMP_FILE']
28 |
def read_and_convert_audio(file_path):
    """ Create a GCS AudioSegment from the file_path

    Returns the audio as mono, 16 bits (sample width 2) with a frame rate of
    at least 16 kHz. """
    audio_segment = AudioSegment.from_file(file_path)
    # The set_* methods return a new AudioSegment instead of modifying the
    # one they are called on, so the result of the chain is what is returned
    return audio_segment \
        .set_channels(1) \
        .set_sample_width(2) \
        .set_frame_rate(__sample_rate(audio_segment))
37 |
def convert_audio_segment(audio_segment):
    """ Create a GCS AudioSegment

    Returns a mono, 16 bit (sample width 2) version of audio_segment with a
    frame rate of at least 16 kHz. The segment passed in is not modified. """
    # The set_* methods return a new AudioSegment instead of modifying the
    # one they are called on, so the result of the chain is what is returned
    return audio_segment \
        .set_channels(1) \
        .set_sample_width(2) \
        .set_frame_rate(__sample_rate(audio_segment))
45 |
def convert_and_write_chunk(chunk, file_path, encoding):
    """ Export a mono, 16 bit, 44100 Hz version of the chunk to file_path """
    mono = chunk.set_channels(1)
    sixteen_bit = mono.set_sample_width(2)
    resampled = sixteen_bit.set_frame_rate(44100)
    resampled.export(file_path, format=encoding)
53 |
--------------------------------------------------------------------------------
/cleansio/censor/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 |
3 | from .censor_file import CensorFile
4 | from .censor_realtime import CensorRealtime
5 | from .censor import Censor
6 |
--------------------------------------------------------------------------------
/cleansio/censor/censor.py:
--------------------------------------------------------------------------------
1 | """ Censors audio chunks by muting explicit sections """
2 |
3 | from multiprocessing import Lock
4 | from pathlib import Path
5 | from colorama import Fore
6 | from pydub import AudioSegment
7 | from utils import CHUNK_LEN
8 | from audio import ChunkWrapper
9 | from speech import Timestamp, Transcribe
10 |
class Censor():
    """ Superclass of CensorFile and CensorRealtime

    lock, explicit_count and muted_timestamps are class attributes, so they
    are shared by every instance. """
    lock = Lock()
    explicit_count = 0
    muted_timestamps = []

    def __init__(self, explicits, output_encoding, output_location):
        super().__init__()
        self.explicits = explicits
        # __location reads self.encoding, so the encoding has to be set first
        self.encoding = self.__encoding(output_encoding)
        self.location = self.__location(output_location)

    def censor_audio_chunk(self, file_path):
        """ Common process to censor an audio chunk

        Returns a ChunkWrapper around the chunk's audio. """
        audio_segment = AudioSegment.from_file(file_path)
        lyrics = self.__get_lyrics(file_path, audio_segment)
        timestamps = self.__get_timestamps(lyrics)
        wrapper = ChunkWrapper(audio_segment)
        if not timestamps: # No mute so just return the original file
            return wrapper
        return self.__mute_explicits(file_path, wrapper, timestamps)

    def create_clean_file(self, clean_file):
        """ Write cleaned up AudioSegment object to an audio file """
        self.print_explicits_count()
        clean_file.export(self.location, format=self.encoding)
        print(Fore.CYAN + 'Successfully created clean file, it\'s located at:')
        print(Fore.YELLOW + self.location)

    def print_explicits_count(self):
        """ Display to user the number of explicits Cleansio detected """
        print('Cleansio found {1}{0}{2} explicit(s)!'.format(
            Censor.explicit_count, Fore.GREEN, Fore.RESET))

    def __mute_explicits(self, file_path, wrapper, timestamps):
        """ Go through each word, if its an explicit, mute the duration """
        for stamp in timestamps:
            if stamp['word'] in self.explicits: # Explicit found, mute
                # The chunk index is the number between the last '-' and the
                # extension of the chunk's file name
                chunk_index = int(file_path.split('-')[-1].split('.')[0])
                wrapper = self.__mute_explicit(wrapper, stamp)
                self.__explicit_count(stamp, chunk_index * CHUNK_LEN)
        return wrapper

    def __location(self, location):
        """ First entry of location, or clean_file.<encoding> two directories
        above this file when no location was given """
        if location:
            return location[0]
        current_dir = str(Path(__file__).parents[2])
        return current_dir + '/clean_file.' + self.encoding

    def __encoding(self, encoding):
        """ First entry of encoding, 'wav' when no encoding was given """
        return encoding[0] if encoding else 'wav'

    @classmethod
    def __mute_explicit(cls, wrapper, timestamp):
        """ Replace the timestamp's range of the wrapped segment by silence """
        len_as = len(wrapper.segment)
        # Check if the timestamp is outside of this chunk (from overlapping)
        if timestamp['start'] > len_as:
            return wrapper
        beginning = wrapper.segment[:timestamp['start']]
        # The end of the timestamp cannot be longer than the file
        end_time = min(len_as, timestamp['end'])
        mute = AudioSegment.silent(duration=end_time - timestamp['start'])
        end = wrapper.segment[end_time:]
        wrapper.segment = (beginning + mute + end)
        wrapper.mute_next_start = \
            cls.__mute_next_chunk(wrapper, timestamp['end'])
        return wrapper

    @classmethod
    def __mute_next_chunk(cls, wrapper, end_time):
        """ Largest overrun past CHUNK_LEN seen so far for this wrapper """
        # Store how much the next chunk should mute from its beginning
        extra_time = end_time - CHUNK_LEN
        return max(extra_time, wrapper.mute_next_start)

    @classmethod
    def __get_lyrics(cls, file_path, audio_segment):
        return Transcribe(file_path, audio_segment.frame_rate).lyrics

    @classmethod
    def __get_timestamps(cls, lyrics):
        return Timestamp(lyrics).timestamps

    @classmethod
    def __explicit_count(cls, stamp, chunk_offset):
        """ Count the number of explicits safely

        The stamp's start and end are shifted by chunk_offset in place. The
        stamp is counted and remembered only when no remembered stamp is a
        duplicate of it. """
        stamp['start'] += chunk_offset
        stamp['end'] += chunk_offset
        # The with statement releases the lock even if a stamp is malformed
        # and raises, so other threads are never blocked forever
        with Censor.lock:
            already_counted = any(
                cls.__duplicate_stamp(muted, stamp)
                for muted in Censor.muted_timestamps)
            if not already_counted:
                Censor.explicit_count += 1
                Censor.muted_timestamps.append(stamp)

    @classmethod
    def __duplicate_stamp(cls, stamp1, stamp2):
        """ If 2 timestamps are the same word and start and end at relatively
        the same time, then assume they're the same timestamp """
        return stamp1['word'] == stamp2['word'] and \
            abs(stamp1['start'] - stamp2['start']) < 201 and \
            abs(stamp1['end'] - stamp2['end']) < 201
120 |
--------------------------------------------------------------------------------
/cleansio/censor/censor_file.py:
--------------------------------------------------------------------------------
1 | """ Creates a clean version of a file by removing explicits """
2 |
3 | from itertools import repeat
4 | from multiprocessing.dummy import Pool as ThreadPool
5 | from colorama import Fore, Style
6 | from tqdm import tqdm
7 | from pydub import AudioSegment
8 | from audio import AudioFile
9 | from .censor import Censor
10 |
class CensorFile(Censor):
    """ Removes explicits from a file """
    def __init__(self, args, explicits):
        super().__init__(explicits, args.output_encoding, args.output_location)
        self.file_path = args.file_path

    def censor(self):
        """ Creates a clean/new version of a file by removing explicits """
        audio_file = AudioFile(self.file_path)
        # Define the CLI progress bar
        p_bar, p_bar_step = self.__progress_bar(audio_file.normal_chunks)
        async_iter = zip(
            repeat(p_bar),
            repeat(p_bar_step),
            audio_file.normal_chunks)
        try:
            # Censor each audio chunk file asynchronously. Leaving the with
            # block shuts the worker threads down; map has returned every
            # result by then.
            with ThreadPool(6) as pool:
                censored_chunks = pool.map(self.__censor_chunk, async_iter)
            clean_file = self.__create_clean_segment(censored_chunks)
        finally:
            # Close the bar even when censoring a chunk raised
            p_bar.close()
        self.create_clean_file(clean_file)

    def __censor_chunk(self, async_iter):
        """ Censors a chunk and updates the progress bar """
        p_bar, p_bar_step, chunk_file_path = async_iter
        p_bar.update(p_bar_step)
        return self.censor_audio_chunk(chunk_file_path)

    def __create_clean_file(self, clean_file):
        """ Variant of create_clean_file with a singular/plural message.
        Not called by any method of this class. """
        exp = 'explicit' if Censor.explicit_count == 1 else 'explicits'
        print('Cleansio found {1}{2}{0}{3} {4}!'.format(
            Censor.explicit_count, Style.BRIGHT, Fore.GREEN, Fore.RESET, exp))
        clean_file.export(self.location, format=self.encoding)
        print(Fore.CYAN + 'Successfully created clean file, it\'s located at:')
        print(Fore.YELLOW + self.location)

    @classmethod
    def __create_clean_segment(cls, censored_chunks):
        """ Joins the wrapped segments into a single AudioSegment """
        clean_file = AudioSegment.empty()
        s_mute = 0
        for wrapper in censored_chunks: # Join the chunks together
            # Mute the start of a chunk based on the previous chunk
            clean_file += \
                AudioSegment.silent(duration=s_mute) + wrapper.segment[s_mute:]
            s_mute = wrapper.mute_next_start
        return clean_file

    @classmethod
    def __progress_bar(cls, normal_chunks):
        """ Returns the progress bar and how far one chunk advances it """
        progress_bar_total = 100
        progress_bar = tqdm(
            # Remove the detailed percentage stats
            bar_format=Style.BRIGHT + Fore.GREEN + '{l_bar}{bar}' + Fore.RESET,
            desc='Censoring file', # Description
            leave=False, # Remove bar after completion
            ncols=42, # Set width
            total=progress_bar_total)
        # An empty chunk list would otherwise divide by zero
        chunk_count = max(len(normal_chunks), 1)
        progress_bar_step = (1 / chunk_count) * progress_bar_total
        return progress_bar, progress_bar_step
69 |
--------------------------------------------------------------------------------
/cleansio/censor/censor_realtime.py:
--------------------------------------------------------------------------------
1 | """ Censors audio chunks in a continuous stream """
2 |
3 | import platform
4 | from colorama import Fore
5 | from utils import create_env_var
6 | from .censor_realtime_mac import CensorRealtimeMac
7 |
class CensorRealtime():
    """ Filters audio stream in real-time """
    def __init__(self, args, explicits):
        super().__init__()
        self.args = args
        self.explicits = explicits
        create_env_var('CLEANSIO_REALTIME', 'true')

    def censor(self):
        """ Hands the work to the censor written for the current OS, or
        reports that the OS is not supported """
        if platform.system() != 'Darwin':
            print(Fore.RED + 'Real-time does not support your OS' + Fore.RESET)
            return
        CensorRealtimeMac(self.args, self.explicits).censor()
23 |
--------------------------------------------------------------------------------
/cleansio/censor/censor_realtime_mac.py:
--------------------------------------------------------------------------------
1 | """ Censors audio chunks in a continuous stream """
2 |
3 | import os
4 | import threading
5 | import sounddevice as sd
6 | import soundfile as sf
7 | from pydub import AudioSegment
8 | from audio import improve_accuracy, convert_and_write_chunk, \
9 | read_and_convert_audio
10 | from utils import create_env_var, create_temp_dir, append_before_ext, \
11 | time_filename, MacUtil, CHUNK_LEN
12 | from .censor import Censor
13 |
class CensorRealtimeMac(Censor):
    """ Removes explicits from audio stream in real-time.

    The sounddevice stream callback queues captured blocks and plays back
    censored ones, while a background thread (run) turns queued blocks into
    censored chunks. """

    # Cleared to make the processing thread leave its loop
    running = True
    # How long (milliseconds) the processing thread sleeps when it has
    # nothing to do, instead of spinning on the queue
    idle_poll_ms = 50

    def __init__(self, args, explicits):
        print('Initialzed realtime censor object')
        super().__init__(explicits, args.output_encoding, args.output_location)
        self.__switch_audio_source()
        create_env_var('CLEANSIO_CHUNKS_LIST', '[]')
        self.args = args
        self.directory = create_temp_dir()
        self.chunk_prefix = self.directory + time_filename() + '-'
        self.temp_chunk_filepath = self.directory + 'temp_chunk.wav'
        self.__update_env_chunks_list(self.temp_chunk_filepath)
        self.clean_file = AudioSegment.empty()
        # Blocks captured by the stream, waiting to be censored
        self.processing_queue = []
        self.processing_lock = threading.Lock()
        # Censored blocks waiting to be played back
        self.playback_queue = []
        self.playback_lock = threading.Lock()
        self.samplerate = 44100 # Hertz
        self.duration = 5 # seconds

    def censor(self):
        """ Censors audio chunks in a continuous stream """

        # Start thread that will analyze and censor recorded chunks
        processing_thread = threading.Thread(target=self.run)
        processing_thread.daemon = True
        processing_thread.start()

        try:
            # Device indexes in sd.default.device should have already been set
            # to Soundflower (2ch) for input and Built-in Output for output.
            # Capture stream from Soundflower (2ch) & play to Built-in Output
            with sd.Stream(samplerate=self.samplerate,
                           blocksize=int(self.samplerate*self.duration),
                           channels=1, callback=self.callback,
                           finished_callback=self.finished_callback):
                print('#' * 80)
                print('Press Return to stop censoring')
                print('#' * 80)
                input()
        except KeyboardInterrupt:
            print('\nInterrupted by user')
        except Exception as exception:
            print(type(exception).__name__ + ': ' + str(exception))
        finally:
            # Stop the processing thread however the stream ended, including
            # the normal "Return pressed" path
            CensorRealtimeMac.running = False

    def callback(self, indata, outdata, _, __, status):
        """ Process audio data from Stream: queue the captured block and
            emit the next censored block (or silence when none is ready) """
        if status:
            print(status)

        # Add to processing_queue
        with self.processing_lock:
            self.processing_queue.append(indata.copy())

        # Consume playback_queue
        with self.playback_lock:
            if self.playback_queue:
                outdata[:] = self.playback_queue.pop(0)
            else:
                outdata.fill(0)

    def finished_callback(self):
        """ Once stream is inactive, output cleaned recordings to audio file """
        if self.args.store_recording:
            # The processing thread may still be appending; read under lock
            with self.playback_lock:
                unplayed_chunks = len(self.playback_queue)
            # Drop the audio that was censored but never played back
            trailing_audio_length = unplayed_chunks * CHUNK_LEN
            if trailing_audio_length > 0:
                self.clean_file = self.clean_file[:-trailing_audio_length]
            self.create_clean_file(self.clean_file)
        else:
            self.print_explicits_count()

    def run(self):
        """ Process 10 seconds of captured audio data at a time: the oldest
            queued block plus a peek at the one that follows it """
        index = 0
        leftover_mute = 0

        while CensorRealtimeMac.running:
            with self.processing_lock:
                has_pair = len(self.processing_queue) >= 2
                if has_pair:
                    frames_to_process = self.processing_queue.pop(0)
                    next_frames = self.processing_queue[0]

            if not has_pair:
                # Wait for more audio without burning a CPU core
                sd.sleep(self.idle_poll_ms)
                continue

            # Convert next two recordings into chunks
            recorded_chunk, file_path = \
                self.__convert_frames_to_chunk(frames_to_process, index)
            next_recorded_chunk, _ = \
                self.__convert_frames_to_chunk(next_frames, index+1)

            overlapping_chunk, overlapping_path = \
                self.__create_overlapping_chunk(recorded_chunk,
                                                next_recorded_chunk,
                                                file_path)

            # Create accuracy chunk for current chunk and overlapping chunk
            self.__create_accuracy_chunk(recorded_chunk, file_path)
            self.__create_accuracy_chunk(overlapping_chunk, overlapping_path)

            # Censor current chunk and also mute any spillover explicits
            # from previous chunk
            clean_chunk_wrapper = self.censor_audio_chunk(file_path)
            clean_chunk = AudioSegment.silent(duration=leftover_mute) \
                + clean_chunk_wrapper.segment[leftover_mute:]

            # Remember to mute any overlapping explicit in the next chunk
            leftover_mute = clean_chunk_wrapper.mute_next_start

            # Convert current chunk into frames and add it to the playback
            # queue
            clean_frames = self.__convert_clean_chunk_to_frames(clean_chunk)
            with self.playback_lock:
                self.playback_queue.append(clean_frames)

            if self.args.store_recording:
                self.clean_file += clean_chunk

            index += 1

    def __convert_frames_to_chunk(self, frames, index):
        """ Writes the frames to an indexed WAV file, registers it for
            cleanup and returns (converted chunk, file path) """
        file_path = self.chunk_prefix + str(index) +'.wav'
        sf.write(file_path, frames, self.samplerate)
        self.__update_env_chunks_list(file_path)
        recorded_chunk = read_and_convert_audio(file_path)
        return recorded_chunk, file_path

    def __convert_clean_chunk_to_frames(self, chunk):
        """ Round-trips the chunk through the temporary WAV file to get
            float32 frames of exactly one stream block in length """
        chunk.export(self.temp_chunk_filepath, format='wav')
        clean_frames, _ = sf.read(self.temp_chunk_filepath,
                                  dtype='float32',
                                  fill_value=0.0,
                                  frames=int(self.samplerate*self.duration),
                                  always_2d=True)
        return clean_frames

    def __create_overlapping_chunk(self, chunk1, chunk2, file_path):
        """ Builds the chunk made of the tail of chunk1 (from 2500) and the
            head of chunk2 (up to 2500), writes it next to file_path and
            returns (chunk, path) """
        overlapping_chunk = chunk1[2500:] + chunk2[:2500]
        overlapping_path = append_before_ext(file_path, '-overlapping')
        convert_and_write_chunk(overlapping_chunk, overlapping_path, 'wav')
        self.__update_env_chunks_list(overlapping_path)
        return overlapping_chunk, overlapping_path

    def __create_accuracy_chunk(self, chunk, file_path):
        """ Writes the improve_accuracy() version of the chunk to the
            '-accuracy' sibling of file_path """
        accuracy_chunk_file_path = append_before_ext(file_path, '-accuracy')
        accuracy_chunk = improve_accuracy(chunk)
        convert_and_write_chunk(accuracy_chunk, accuracy_chunk_file_path, 'wav')

    @classmethod
    def __switch_audio_source(cls):
        """ Remembers the current system devices in the environment, then
            routes system input and output through Soundflower (2ch) """
        create_env_var('CLEANSIO_OLD_SOUND_OUT', MacUtil.audio_source('output'))
        create_env_var('CLEANSIO_OLD_SOUND_IN', MacUtil.audio_source('input'))
        MacUtil.switch_audio_source('output', 'Soundflower (2ch)')
        MacUtil.switch_audio_source('input', 'Soundflower (2ch)')
        cls.__set_default_device('Soundflower (2ch)', 'Built-in Output')

    @classmethod
    def __set_default_device(cls, input_device_name, output_device_name):
        """ Points sounddevice's default (input, output) devices at the
            named devices, keeping the fallback indexes when not found """
        device_index = 0
        input_device_index = 2 # Soundflower (2ch) is usually no. 2
        output_device_index = 1 # Built-in Output is usually no. 1
        for device in sd.query_devices():
            if device['name'] == input_device_name:
                input_device_index = device_index
            if device['name'] == output_device_name:
                output_device_index = device_index
            device_index += 1
        sd.default.device = (input_device_index, output_device_index)

    @classmethod
    def __update_env_chunks_list(cls, file_path):
        """ Call after every write for later cleanup """
        env_list = os.environ['CLEANSIO_CHUNKS_LIST']
        # Drop the closing bracket, then append the quoted path to the list
        beginning = '[\'' if env_list[:-1] == '[' else env_list[:-1] + ', \''
        create_env_var(
            'CLEANSIO_CHUNKS_LIST', beginning + file_path + '\']')
198 |
--------------------------------------------------------------------------------
/cleansio/cleansio.py:
--------------------------------------------------------------------------------
1 | """ Displays the lyrics of an audio file """
2 |
3 | # Imports from our modules
4 | from censor import CensorFile, CensorRealtime
5 | from utils import setup_cleanup, setup_cli_args
6 | from explicits import Explicits
7 |
def is_file_mode():
    """ Returns the positional file path argument, which is truthy only
        when the user asked for file mode """
    requested_file = ARGS.file_path
    return requested_file
11 |
if __name__ == '__main__':
    # Install the cleanup hooks before anything creates temporary files
    setup_cleanup()
    ARGS = setup_cli_args()
    EXPLICITS = Explicits(ARGS).set
    # A file path selects file mode; without one the stream is censored
    CENSOR_CLASS = CensorFile if is_file_mode() else CensorRealtime
    CENSOR_CLASS(ARGS, EXPLICITS).censor()
20 |
--------------------------------------------------------------------------------
/cleansio/data/encoding-types:
--------------------------------------------------------------------------------
1 | ac3
2 | adx
3 | aiff
4 | alaw
5 | asf
6 | au
7 | avi
8 | eac3
9 | f32be
10 | f32le
11 | f64be
12 | f64le
13 | ffmetadata
14 | flac
15 | flv
16 | g722
17 | mmf
18 | mp3
19 | mpeg
20 | mpegts
21 | mulaw
22 | nut
23 | ogg
24 | rm
25 | rso
26 | s16be
27 | s16le
28 | s24be
29 | s24le
30 | s32be
31 | s32le
32 | s8
33 | smjpeg
34 | sox
35 | spdif
36 | swf
37 | u16be
38 | u16le
39 | u24be
40 | u24le
41 | u32be
42 | u32le
43 | u8
44 | voc
45 | wav
46 |
--------------------------------------------------------------------------------
/cleansio/data/explicits-list:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/cleansio/data/explicits-list
--------------------------------------------------------------------------------
/cleansio/explicits/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 |
3 | from .explicits import Explicits
4 | from .user_explicits import UserExplicits
5 |
--------------------------------------------------------------------------------
/cleansio/explicits/explicits.py:
--------------------------------------------------------------------------------
1 | """ Loads list of explicits from an encrypted file """
2 |
3 | from Crypto.Cipher import AES
4 | import yaml
5 | from utils import relative_path
6 | from .user_explicits import UserExplicits
7 |
class Explicits():
    """ Object representing set of explicits"""
    def __init__(self, args):
        """ Decrypt encrypted list of explicits and return a set """
        self.set = self.__set(args)

    def __set(self, args):
        """ Builds the lower-cased explicit set.

        The user list (args.user_list[0]) replaces the internal list unless
        args.combine_lists is set, in which case both are merged. """
        if args.user_list and args.combine_lists:
            internal_set = self.__internal_set()
            user_set = self.__user_set(args.user_list[0])
            explicit_set = internal_set.union(user_set)
        elif args.user_list:
            explicit_set = self.__user_set(args.user_list[0])
        else:
            explicit_set = self.__internal_set()
        return {explicit.lower() for explicit in explicit_set}

    def __internal_set(self):
        """ Returns the 'explicits' entries of the decrypted YAML file """
        with open(self.__get_explicits_path(), 'rb') as file:
            decrypted_content = self.__get_decrypted_content(file)

        # safe_load only builds plain Python objects, and unlike a bare
        # yaml.load() call it does not need an explicit Loader argument
        yaml_content = yaml.safe_load(decrypted_content)
        return set(yaml_content['explicits'])

    @classmethod
    def __user_set(cls, user_list):
        """ Returns the set of words loaded from the user's file """
        return UserExplicits(user_list).set

    @classmethod
    def __get_explicits_path(cls):
        """ Return path of encrypted explicits file """
        path_to_enc_file = relative_path('../data/explicits-list')
        return path_to_enc_file

    @classmethod
    def __get_decrypted_content(cls, encrypted_file):
        """ Decrypt the encrypted file and return content as string """
        # NOTE(review): key and IV are hard-coded, so this only obscures the
        # list. They are passed as bytes, the type the cipher operates on.
        decryptor = AES.new(
            b'cleansio_sym_key', AES.MODE_CBC, b'cleansioCensorIV')
        # Decrypt everything and decode once: decoding 16-byte blocks one at
        # a time breaks on multi-byte characters that straddle two blocks
        return decryptor.decrypt(encrypted_file.read()).decode('utf-8')
54 |
--------------------------------------------------------------------------------
/cleansio/explicits/user_explicits.py:
--------------------------------------------------------------------------------
1 | """
2 | Loads a list of words from a file.
3 | It is assumed that the words are separated by the given separator.
4 | """
5 |
class UserExplicits():
    """
    Loads a list of words from a file.
    It is assumed that the words are separated by the given separator.
    """
    def __init__(self, filename, sep='\n'):
        """ filename : path of the word list to read
            sep      : string that separates the words in the file """
        with open(filename, 'r') as uel:
            # Assume that the words are separated by sep
            word_list = uel.read().split(sep)
        # Trim every entry and drop the ones that end up empty, so blank or
        # whitespace-only lines and padded words never reach the set
        trimmed_words = (word.strip() for word in word_list)
        self.set = {word for word in trimmed_words if word}
18 |
--------------------------------------------------------------------------------
/cleansio/speech/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 |
3 | from .transcribe import Transcribe
4 | from .timestamp import Timestamp
5 |
--------------------------------------------------------------------------------
/cleansio/speech/timestamp.py:
--------------------------------------------------------------------------------
1 | """ Locates where words are located in an audio chunk """
2 |
3 | from utils import gcs_time_to_ms
4 |
class Timestamp():
    """ Locates each transcribed word inside its audio chunk.
        Timestamps in the form of {word:, start:, end:} """
    def __init__(self, lyrics):
        super().__init__()
        self.lyrics = lyrics
        self.timestamps = self.__compute_timestamps()

    def __compute_timestamps(self):
        """ Returns the parsed timestamps, or None when there are no lyrics """
        return self.__parse_timestamps() if self.lyrics else None

    def __parse_timestamps(self):
        """ Parses GCS's output and returns [{word:str, start:ms, end:ms},].
            Each word is widened by 50 ms on both sides, with the start
            clamped so it never goes below 0. O(n) """
        return [
            {
                'word': entry.word.lower(),
                'start': max(0, gcs_time_to_ms(entry.start_time) - 50),
                'end': gcs_time_to_ms(entry.end_time) + 50
            }
            for entry in self.lyrics
        ]
31 |
--------------------------------------------------------------------------------
/cleansio/speech/transcribe.py:
--------------------------------------------------------------------------------
1 | """ Convert audio to text using Google Cloud Speech """
2 |
3 | from itertools import repeat
4 | from multiprocessing.dummy import Pool as ThreadPool
5 | from google.cloud.speech import enums, SpeechClient, types
6 | from utils import append_before_ext
7 |
class Transcribe():
    """ Transcribes the lyrics from the vocals """
    def __init__(self, file_path, frame_rate, encoding='LINEAR16'):
        """ file_path  : path of the normal chunk; its '-overlapping'
                         sibling is transcribed as well
            frame_rate : sample rate sent to Google Cloud Speech
            encoding   : name of a RecognitionConfig.AudioEncoding member """
        super().__init__()
        self.lyrics = self.__transcribe_chunks(frame_rate, encoding, file_path)

    def __transcribe_chunks(self, frame_rate, encoding, file_path):
        """ Transcribes the normal and overlapping chunks concurrently and
            returns their combined words """
        file_paths = [file_path, append_before_ext(file_path, '-overlapping')]
        async_iter = zip(repeat(frame_rate), repeat(encoding), file_paths)
        # The context manager releases the worker threads once map() is done
        with ThreadPool(len(file_paths)) as pool:
            transcripts = pool.map(self.__transcribe_chunk, async_iter)
        return self.__combine_transcripts(transcripts)

    def __transcribe_chunk(self, async_iter):
        """ Accesses Google Cloud Speech and returns its response for the
            '-accuracy' version of one chunk.
            async_iter : (frame_rate, encoding, file_path) tuple """
        frame_rate, encoding, file_path = async_iter
        accuracy_chunk_path = append_before_ext(file_path, '-accuracy')
        with open(accuracy_chunk_path, 'rb') as audio_content:
            content = audio_content.read()
        config = self.__get_config(frame_rate, encoding)
        audio = types.RecognitionAudio(content=content)
        return SpeechClient().recognize(config, audio)

    @classmethod
    def __get_config(cls, frame_rate, encoding):
        """ Builds the recognition config: en-US, word time offsets on,
            profanity filter off """
        params = {
            'encoding': enums.RecognitionConfig.AudioEncoding[encoding],
            'sample_rate_hertz': frame_rate,
            'language_code': 'en-US',
            'enable_word_time_offsets': True,
            'profanity_filter': False
        }
        return types.RecognitionConfig(**params)

    @classmethod
    def __combine_transcripts(cls, transcripts):
        """ Combine the words from the normal and overlapping chunks.
            Only the first result's first alternative of each transcript is
            used. Returns None when no words were recognised. """
        words = []
        if transcripts[0].results: # Normal chunk
            words += transcripts[0].results[0].alternatives[0].words
        if transcripts[1].results: # Overlapping chunk
            overlapping = transcripts[1].results[0].alternatives[0].words
            shifted_time = list(map(cls.__shift_time, overlapping))
            words += shifted_time
        return None if words == [] else words

    @classmethod
    def __shift_time(cls, word):
        """ Increment the time relative to the normal chunk by 2.5 seconds,
            carrying whole seconds out of nanos so it stays below 1e9 """
        for stamp in (word.start_time, word.end_time):
            carry, nanos = divmod(stamp.nanos + 500000000, 1000000000)
            stamp.seconds += 2 + carry
            stamp.nanos = nanos
        return word
61 |
--------------------------------------------------------------------------------
/cleansio/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 |
3 | from .cleanup import cleanup, remove_chunks, remove_conversions, setup_cleanup
4 | from .cli import setup_cli_args
5 | from .env import create_env_var
6 | from .files import create_temp_dir, file_name_no_ext, current_dir, \
7 | relative_path, append_before_ext, time_filename
8 | from .numbers import gcs_time_to_ms, is_number, leading_zero
9 | from .constants import CHUNK_LEN
10 | from .mac import MacUtil
11 |
--------------------------------------------------------------------------------
/cleansio/utils/cleanup.py:
--------------------------------------------------------------------------------
1 | """ Cleans up temporary files after the program runs """
2 |
3 | from atexit import register
4 | from os import environ, remove
5 | import platform
6 | from signal import signal, SIGABRT, SIGILL, SIGINT, SIGSEGV, SIGTERM
7 | import sys
8 | from .files import append_before_ext
9 | from .mac import MacUtil
10 |
11 | # Cleans up files on normal or abnormal exit
12 | # The arguments are unused - they are only here to satisfy atexit.
def cleanup(_sig_num=None, _cur_stack_frame=None):
    """ Removes temporary files, restores the macOS audio setup when on
        Darwin, then exits with status 0.
        Both parameters are ignored; they exist so the function can be
        installed as a signal handler as well as an exit hook. """
    remove_conversions()
    remove_chunks()
    if platform.system() == 'Darwin':
        MacUtil.clean()
    sys.exit(0)
21 |
def setup_cleanup():
    """ Arranges for cleanup to run on any type of exit """
    # Abnormal exits: route every signal we want to catch to cleanup
    caught_signals = (SIGABRT, SIGILL, SIGINT, SIGSEGV, SIGTERM)
    for caught in caught_signals:
        signal(caught, cleanup)
    # Normal exit: run cleanup when the interpreter shuts down
    register(cleanup)
29 |
def remove_conversions():
    """ Deletes the file named by CLEANSIO_TEMP_FILE, if that variable is
        set. A file that is already gone is not an error. """
    temp_file = environ.get('CLEANSIO_TEMP_FILE')
    if temp_file is None:
        return
    try:
        remove(temp_file)
    except FileNotFoundError:
        pass
39 |
def remove_chunks():
    """ Removes each chunk of the converted WAV file.

    CLEANSIO_CHUNKS_LIST holds the chunk paths formatted like a list of
    single-quoted strings. Every listed file and its '-accuracy' sibling
    are deleted; files that are already gone are ignored. """

    if 'CLEANSIO_CHUNKS_LIST' not in environ:
        return
    slices_list_env_var = environ['CLEANSIO_CHUNKS_LIST']
    # Strip the leading "['" and trailing "']", then split on "', '"
    chunks_list = slices_list_env_var[2:-2].split('\', \'')
    for chunk_file in chunks_list:
        if not chunk_file:
            # An empty list ('[]') yields one empty entry; skip it so we
            # never try to delete a stray '-accuracy' file
            continue
        accuracy_file = append_before_ext(chunk_file, '-accuracy')
        for doomed_file in (accuracy_file, chunk_file):
            try:
                remove(doomed_file)
            except FileNotFoundError:
                pass
55 |
--------------------------------------------------------------------------------
/cleansio/utils/cli.py:
--------------------------------------------------------------------------------
1 | """ Utility functions for command line interfaces """
2 |
3 | import argparse
4 | import sys
5 | from colorama import Fore
6 | from .files import relative_path
7 |
def setup_cli_args():
    """ Builds the argument parser, parses the command line and returns
        the validated arguments """
    parser = argparse.ArgumentParser(description='Real-time music censoring.')
    # Registration order is the order the options are added to the parser
    argument_setters = (
        __set_file_path,
        __set_user_list,
        __set_combine_list,
        __set_output_path,
        __set_output_encoding,
        __set_output_encoding_list,
        __set_store_recording,
    )
    for add_argument_to in argument_setters:
        parser = add_argument_to(parser)
    args = parser.parse_args() # NOTE: Cannot add args after calling parse_args
    __exiting_args(args)
    __validate_args(args, parser)
    return args
22 |
def __set_file_path(parser):
    """ Adds the optional positional file path which enables file mode.
        Returns the same parser. """
    parser.add_argument(
        'file_path',
        nargs='?',
        # Adjacent literals are joined at compile time, so the help text
        # carries no source indentation (a backslash continuation would)
        help='enables file mode which creates a clean version of the file. '
             'Relative or full path')
    return parser
30 |
def __set_user_list(parser):
    """ Sets the arguments which control the user list of explicit words.
        Returns the same parser. """
    parser.add_argument(
        '-u',
        '--user-list',
        nargs=1,
        action='store',
        # Adjacent literals keep source indentation out of the help text
        help='takes a path which points to a custom list of words which you '
             'would like to mark as explicit.')
    return parser
41 |
def __set_combine_list(parser):
    """ Allows the user to combine their list with the internal list.
        Returns the same parser. """
    parser.add_argument(
        '-c',
        '--combine-lists',
        action='store_true',
        # Adjacent literals keep source indentation out of the help text
        help='the list which you provide with the \'-u\' option replaces the '
             'program\'s internal list by default. However, you can pass '
             'this option in addition to -u to have your list combined with '
             'the internal list.')
    return parser
53 |
def __set_store_recording(parser):
    """ Allows the user to keep the clean real-time audio as a file.
        Returns the same parser. """
    flag_options = {
        'action': 'store_true',
        'help': 'save the clean realtime audio as a file in the output location'
    }
    parser.add_argument('-s', '--store-recording', **flag_options)
    return parser
62 |
def __set_output_path(parser):
    """ Allows the user to determine where the clean file is created.
        Returns the same parser. """
    parser.add_argument(
        '-o',
        '--output-location',
        nargs=1,
        action='store',
        # Adjacent literals keep source indentation out of the help text
        help='takes a path which will overwrite the default location of where '
             'the clean file will be created. If the file already exists it '
             'will be overwritten.')
    return parser
74 |
def __set_output_encoding(parser):
    """ Allows the user to determine the audio encoding of the clean file.
        Returns the same parser. """
    parser.add_argument(
        '-e',
        '--output-encoding',
        nargs=1,
        action='store',
        # Adjacent literals keep source indentation out of the help text
        help='specify the audio encoding type of the output file. The file '
             'extension of --output-location is not sufficient. Default is '
             'wav.')
    return parser
85 |
def __set_output_encoding_list(parser):
    """ Adds the flag that lists the supported output encoding types.
        Returns the same parser. """
    flag_options = {
        'action': 'store_true',
        'help': 'list the possible audio encoding types for the output file.'
    }
    parser.add_argument('--output-encoding-list', **flag_options)
    return parser
93 |
def __exiting_args(args):
    """ Handles arguments that simply print and exit: prints the encoding
        types file when the list flag was given """
    if not args.output_encoding_list:
        return
    with open(__encoding_types_path()) as types:
        __exit(types.read())
99 |
def __validate_args(args, parser):
    """ Runs every validator against the parsed user input """
    validators = (__validate_combine_list, __validate_output_encoding)
    for validate in validators:
        validate(args, parser)
104 |
def __validate_combine_list(args, parser):
    """ Reports an error when -c is given without -u """
    options = vars(args)  # Namespace -> plain mapping
    combine_without_user_list = \
        options['combine_lists'] and not options['user_list']
    if combine_without_user_list:
        __error(parser, 'The -c option requires -u!')
110 |
def __validate_output_encoding(args, parser):
    """ Reports an error when the requested output encoding is not one of
        the lines of the encoding types file """
    if not args.output_encoding:
        return
    encoding_choice = args.output_encoding[0]
    with open(__encoding_types_path()) as types:
        # Stops reading at the first line that matches the choice
        supported = any(
            encoding_type.strip() == encoding_choice
            for encoding_type in types)
    if not supported:
        __error(parser, encoding_choice + ' is not supported!')
124 |
def __encoding_types_path():
    """ Returns the path of the file listing the supported encodings """
    encoding_types_file = '../data/encoding-types'
    return relative_path(encoding_types_file)
127 |
def __error(parser, message):
    """ Reports the message in red through the parser's error handler.
        The colour is reset at the end of the message so that the terminal
        is not left red after the error is reported. """
    parser.error(Fore.RED + message + Fore.RESET)
130 |
def __exit(message):
    """ Prints the message without surrounding whitespace, then exits
        with status 0 """
    trimmed_message = message.strip()
    print(trimmed_message)
    sys.exit(0)
134 |
--------------------------------------------------------------------------------
/cleansio/utils/constants.py:
--------------------------------------------------------------------------------
1 | """ Stores constants used across the project """
2 |
# Length of a single audio chunk. The real-time censor multiplies it by the
# number of queued chunks to work out how much trailing audio to drop.
CHUNK_LEN = 5000 # In milliseconds
4 |
--------------------------------------------------------------------------------
/cleansio/utils/env.py:
--------------------------------------------------------------------------------
1 | """ Utility functions for environment variables """
2 |
3 | import os
4 |
def create_env_var(name, value):
    """ Sets the environment variable `name` to `value` for this process,
        replacing any previous value """
    os.environ.update({name: value})
8 |
--------------------------------------------------------------------------------
/cleansio/utils/files.py:
--------------------------------------------------------------------------------
1 | """ Utility functions for File I/O """
2 |
3 | from errno import EEXIST
4 | import os
5 | from os.path import basename, expanduser
6 | import time
7 | from .env import create_env_var
8 |
def create_temp_dir():
    """ Creates ~/.cleansio-temp/ for temporary files, publishes its path
        as CLEANSIO_TEMP_DIR and returns that path """
    temp_dir = expanduser('~') + '/.cleansio-temp/'
    create_env_var('CLEANSIO_TEMP_DIR', temp_dir)
    try:
        os.makedirs(os.environ['CLEANSIO_TEMP_DIR'])
    except OSError as os_error:
        already_exists = os_error.errno == EEXIST
        if not already_exists:
            raise # Only an already existing directory is acceptable
    return os.environ['CLEANSIO_TEMP_DIR']
19 |
def file_name_no_ext(file_path):
    """ Get a file name with no extension from a file path.

    Only the last extension is removed: dots inside the name are kept
    ('my.song.mp3' -> 'my.song') and a name without an extension is
    returned unchanged instead of becoming an empty string. """
    return os.path.splitext(basename(file_path))[0]
23 |
def current_dir():
    """ Returns the directory that contains this module """
    module_file = __file__
    return os.path.dirname(module_file)
27 |
def relative_path(path):
    """ Joins `path` onto the directory returned by current_dir() """
    base_dir = current_dir()
    return os.path.join(base_dir, path)
31 |
def append_before_ext(path, addition):
    """ Inserts `addition` in front of the last '.' of `path`, or appends
        it at the end when the path contains no '.' """
    stem, dot, extension = path.rpartition('.')
    if not dot: # No '.' anywhere in the path
        return path + addition
    return stem + addition + dot + extension
38 |
def time_filename():
    """ Returns the current time in whole milliseconds as a string.
        Used for chunk file names """
    milliseconds = round(time.time() * 1000)
    return str(int(milliseconds))
42 |
--------------------------------------------------------------------------------
/cleansio/utils/mac.py:
--------------------------------------------------------------------------------
1 | """ Utility functions for macOS """
2 |
3 | from os import environ
4 | from subprocess import run
5 |
class MacUtil():
    """ Helpers that drive the SwitchAudioSource command on macOS """
    def __init__(self):
        super().__init__()

    @classmethod
    def switch_audio_source(cls, interface, device_name):
        """ Makes `device_name` the system's audio source.
            interface : [input|output] """
        command = ['SwitchAudioSource', '-t', interface, '-s', device_name]
        # Output is captured only to keep it off the terminal; check=True
        # raises if the command fails
        run(command, capture_output=True, check=True)

    @classmethod
    def audio_source(cls, interface):
        """ Returns the name of the system's current audio source.
            interface : [input|output] """
        command = ['SwitchAudioSource', '-c', '-t', interface]
        completed = run(command, capture_output=True, check=True)
        device_name = completed.stdout.decode('utf-8')
        return device_name.replace('\n', '')

    @classmethod
    def clean(cls):
        """ Restores the audio sources saved in the environment. Does
            nothing unless the real-time flag and both saved sources are
            all present. """
        required_variables = (
            'CLEANSIO_REALTIME',
            'CLEANSIO_OLD_SOUND_OUT',
            'CLEANSIO_OLD_SOUND_IN')
        if not all(variable in environ for variable in required_variables):
            return
        cls.switch_audio_source('output', environ['CLEANSIO_OLD_SOUND_OUT'])
        cls.switch_audio_source('input', environ['CLEANSIO_OLD_SOUND_IN'])
37 |
--------------------------------------------------------------------------------
/cleansio/utils/numbers.py:
--------------------------------------------------------------------------------
1 | """ Utility functions for dealing with numbers """
2 |
3 | import re
4 | from google.protobuf.duration_pb2 import Duration
5 |
def is_number(num):
    """ Checks whether str(num) is an optionally negative integer or
        decimal. Returns the regex match (truthy) or None. """
    number_pattern = r'^-?\d+(\.\d+|\d*)$'
    text = str(num)
    return re.match(number_pattern, text)
9 |
def leading_zero(num):
    """ Returns str(num), padded with a leading 0 when it is a single
        digit. Anything that is not all digits is returned as is. """
    text = str(num)
    if text.isdigit():
        return text.zfill(2) # Pads only when shorter than two characters
    return text
18 |
def gcs_time_to_ms(time):
    """ Converts a Duration's seconds and nanos to milliseconds.
        Returns 0 for anything that is not a Duration, or whose non-zero
        fields are not numbers. """
    if not isinstance(time, Duration):
        return 0
    if time.nanos and not is_number(time.nanos):
        return 0
    if time.seconds and not is_number(time.seconds):
        return 0
    milliseconds = time.seconds * 1000 if time.seconds else 0
    # Floor division by a float, so the total comes back as a float
    milliseconds += time.nanos // 1e6
    return milliseconds
28 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/_static/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/docs/_static/.keep
--------------------------------------------------------------------------------
/docs/_templates/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/docs/_templates/.keep
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Configuration file for the Sphinx documentation builder.
5 | #
6 | # This file does only contain a selection of the most common options. For a
7 | # full list see the documentation:
8 | # http://www.sphinx-doc.org/en/master/config
9 |
10 | # -- Path setup --------------------------------------------------------------
11 |
12 | # If extensions (or modules to document with autodoc) are in another directory,
13 | # add these directories to sys.path here. If the directory is relative to the
14 | # documentation root, use os.path.abspath to make it absolute, like shown here.
15 | #
16 | import os
17 | import sys
18 | sys.path.insert(0, os.path.abspath('../cleansio'))
19 |
20 |
21 | # -- Project information -----------------------------------------------------
22 |
23 | project = 'Cleansio'
24 | copyright = '2019, Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain'
25 | author = 'Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain'
26 |
27 | # The short X.Y version
28 | version = ''
29 | # The full version, including alpha/beta/rc tags
30 | release = ''
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 |
35 | # If your documentation needs a minimal Sphinx version, state it here.
36 | #
37 | # needs_sphinx = '1.0'
38 |
39 | # Add any Sphinx extension module names here, as strings. They can be
40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
41 | # ones.
42 | extensions = [
43 | 'sphinx.ext.autodoc',
44 | 'sphinx.ext.todo',
45 | 'sphinx.ext.coverage',
46 | 'sphinx.ext.viewcode',
47 | 'sphinxcontrib.github_ribbon'
48 | ]
49 |
50 | # Add any paths that contain templates here, relative to this directory.
51 | templates_path = ['_templates']
52 |
53 | # The suffix(es) of source filenames.
54 | # You can specify multiple suffixes as a list of strings:
55 | #
56 | # source_suffix = ['.rst', '.md']
57 | source_suffix = '.rst'
58 |
59 | # The master toctree document.
60 | master_doc = 'index'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | language = None
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # This pattern also affects html_static_path and html_extra_path.
72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = None
76 |
77 |
78 | # -- Options for HTML output -------------------------------------------------
79 |
80 | # The theme to use for HTML and HTML Help pages. See the documentation for
81 | # a list of builtin themes.
82 | #
83 | html_theme = 'alabaster'
84 |
85 | # Theme options are theme-specific and customize the look and feel of a theme
86 | # further. For a list of options available for each theme, see the
87 | # documentation.
88 | #
89 | # html_theme_options = {}
90 |
91 | # Add any paths that contain custom static files (such as style sheets) here,
92 | # relative to this directory. They are copied after the builtin static files,
93 | # so a file named "default.css" will overwrite the builtin "default.css".
94 | html_static_path = ['_static']
95 |
96 | # Custom sidebar templates, must be a dictionary that maps document names
97 | # to template names.
98 | #
99 | # The default sidebars (for documents that don't match any pattern) are
100 | # defined by theme itself. Builtin themes are using these templates by
101 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
102 | # 'searchbox.html']``.
103 | #
104 | # html_sidebars = {}
105 |
106 |
107 | # -- Options for HTMLHelp output ---------------------------------------------
108 |
109 | # Output file base name for HTML help builder.
110 | htmlhelp_basename = 'Cleansiodoc'
111 |
112 |
113 | # -- Options for LaTeX output ------------------------------------------------
114 |
115 | latex_elements = {
116 | # The paper size ('letterpaper' or 'a4paper').
117 | #
118 | # 'papersize': 'letterpaper',
119 |
120 | # The font size ('10pt', '11pt' or '12pt').
121 | #
122 | # 'pointsize': '10pt',
123 |
124 | # Additional stuff for the LaTeX preamble.
125 | #
126 | # 'preamble': '',
127 |
128 | # Latex figure (float) alignment
129 | #
130 | # 'figure_align': 'htbp',
131 | }
132 |
133 | # Grouping the document tree into LaTeX files. List of tuples
134 | # (source start file, target name, title,
135 | # author, documentclass [howto, manual, or own class]).
136 | latex_documents = [
137 | (master_doc, 'Cleansio.tex', 'Cleansio Documentation',
138 | 'Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain', 'manual'),
139 | ]
140 |
141 |
142 | # -- Options for manual page output ------------------------------------------
143 |
144 | # One entry per manual page. List of tuples
145 | # (source start file, name, description, authors, manual section).
146 | man_pages = [
147 | (master_doc, 'cleansio', 'Cleansio Documentation',
148 | [author], 1)
149 | ]
150 |
151 |
152 | # -- Options for Texinfo output ----------------------------------------------
153 |
154 | # Grouping the document tree into Texinfo files. List of tuples
155 | # (source start file, target name, title, author,
156 | # dir menu entry, description, category)
157 | texinfo_documents = [
158 | (master_doc, 'Cleansio', 'Cleansio Documentation',
159 | author, 'Cleansio', 'One line description of project.',
160 | 'Miscellaneous'),
161 | ]
162 |
163 |
164 | # -- Options for Epub output -------------------------------------------------
165 |
166 | # Bibliographic Dublin Core info.
167 | epub_title = project
168 |
169 | # The unique identifier of the text. This can be an ISBN number
170 | # or the project homepage.
171 | #
172 | # epub_identifier = ''
173 |
174 | # A unique identification for the text.
175 | #
176 | # epub_uid = ''
177 |
178 | # A list of files that should not be packed into the epub file.
179 | epub_exclude_files = ['search.html']
180 |
181 |
182 | # -- Extension configuration -------------------------------------------------
183 |
184 | github_ribbon_repo = 'PatrickDuncan/cleansio'
185 | github_ribbon_position = 'right'
186 | github_ribbon_color = 'darkblue'
187 |
188 | # -- Options for todo extension ----------------------------------------------
189 |
190 | # If true, `todo` and `todoList` produce output, else they produce nothing.
191 | todo_include_todos = True
192 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to Cleansio's Technical Documentation!
2 | =======================================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 | :caption: Contents:
7 |
8 | .. image:: ../media/logo.png
9 | :width: 150 px
10 | :alt: Logo
11 | :target: https://github.com/PatrickDuncan/cleansio
12 |
13 | Indices and tables
14 | ------------------
15 | * :ref:`genindex`
16 | * :ref:`modindex`
17 | * :ref:`search`
18 |
19 | .. image:: _static/classes.png
20 | :width: 400 px
21 | :alt: Classes Diagram
22 |
23 | .. image:: _static/packages.png
24 | :width: 400 px
25 | :alt: Packages Diagram
26 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/media/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/media/logo.png
--------------------------------------------------------------------------------
/media/poster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/media/poster.png
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pylint>=2.2.2
2 | pylint-quotes>=0.2.0
3 | pytest==4.0.2
4 | sphinx==1.8.3
5 | sphinxcontrib-github_ribbon==0.9.0
6 |
--------------------------------------------------------------------------------
/requirements-mac.txt:
--------------------------------------------------------------------------------
1 | sounddevice>=0.3.12
2 | soundfile>=0.10.2
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama>=0.4.1
2 | google-cloud-speech>=0.36.1
3 | pycrypto>=2.6.1
4 | pydub>=0.23.0
5 | pyyaml>=3.13
6 | tqdm>=4.29.0
7 |
--------------------------------------------------------------------------------
/tests/accuracy/test_loudness_maximization.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 |
3 | import os
4 | from pydub import AudioSegment
5 | from audio.accuracy import __maximize_volume
6 | from audio.accuracy import improve_accuracy
7 |
def __get_file(file_path):
    """Return `file_path` appended to the directory holding this test file."""
    this_file = os.path.realpath(__file__)
    base_dir = os.path.dirname(this_file)
    return base_dir + file_path
10 |
def test_loudness_maximization():
    """Check that `__maximize_volume` raises the loudness of a clip.

    The fixture `testing.wav` is exported to a scratch copy, the copy is
    reloaded and passed through `__maximize_volume`, and the returned segment
    must report a strictly higher dBFS than the reloaded copy did.
    """
    file_path = __get_file('/../data/testing.wav')
    # Resolved before the `try` so the cleanup can always refer to it.
    file_path_duplicate = __get_file('/../data/testing-max-volume.wav')
    try:
        audio_segment = AudioSegment.from_file(file_path)
        # Duplicate the audio file
        audio_segment.export(file_path_duplicate, format='wav')
        audio_segment_duplicate = AudioSegment.from_file(file_path_duplicate)
        init_loudness = audio_segment_duplicate.dBFS
        # Test that the volume was successfully maximized
        max_volume_chunk = __maximize_volume(audio_segment_duplicate)
        max_loudness = max_volume_chunk.dBFS
        # No blanket `except`: let pytest report the real error or assertion.
        assert init_loudness < max_loudness
    finally:
        # Cleanup; the copy is absent if the export itself failed.
        if os.path.exists(file_path_duplicate):
            os.remove(file_path_duplicate)
29 |
--------------------------------------------------------------------------------
/tests/censor/test_censor.py:
--------------------------------------------------------------------------------
1 | #pylint: skip-file
2 |
3 | import os
4 | from pathlib import Path
5 | from pydub import AudioSegment, silence
6 | from audio import ChunkWrapper
7 | from censor import Censor
8 |
9 | timestamps = [
10 | {'word': 'hi', 'start': 1000.0, 'end': 1500.0},
11 | {'word': 'bye', 'start': 1700.0, 'end': 1900.0},
12 | {'word': 'mute', 'start': 3800.0, 'end': 4500.0}
13 | ]
14 |
15 | explicits = ['mute']
16 |
def __get_file(file_path):
    """Return `file_path` appended to the directory holding this test file."""
    test_dir = os.path.dirname(os.path.realpath(__file__))
    return '{0}{1}'.format(test_dir, file_path)
19 |
def test_censor():
    """Check that only the explicit word's time range is muted.

    A scratch copy of `testing.wav` is run through the private
    `Censor.__mute_explicits` with the module-level `timestamps` and
    `explicits`. Silence detection (min 500 ms, threshold -50) must then find
    exactly one range, whose start and end lie within 5 ms of the 'mute'
    timestamp.
    """
    file_path = __get_file('/../data/testing.wav')
    # Resolved before the `try` so the cleanup can always refer to it.
    file_path_dup = __get_file('/../data/testing-censored-0.wav')
    try:
        audio_segment = AudioSegment.from_file(file_path)
        # Duplicate the audio file and begin muting the new file
        audio_segment.export(file_path_dup, format='wav')
        audio_segment_dup = AudioSegment.from_file(file_path_dup)

        # Test that the explicits were successfully removed
        wrapper = ChunkWrapper(audio_segment_dup)
        location = str(Path(__file__).parents[2]) + '/clean_file.wav'
        censor = Censor(explicits, 'wav', location)
        # Name-mangled access to the private method under test
        audio_segment_dup = censor._Censor__mute_explicits(
            file_path_dup, wrapper, timestamps).segment
        # Get the silence segments
        silent_ranges = silence.detect_silence(
            audio_segment_dup, min_silence_len=500, silence_thresh=-50)

        # Assert silence is only in the 'mute' timestamp
        assert len(silent_ranges) == 1
        beginning_diff = silent_ranges[0][0] - timestamps[2]['start']
        end_diff = silent_ranges[0][1] - timestamps[2]['end']

        # Less than 5 (milliseconds) to account for small inaccuracies
        assert abs(beginning_diff) < 5
        assert abs(end_diff) < 5
    finally:
        # Cleanup; the copy is absent if the export itself failed.
        if os.path.exists(file_path_dup):
            os.remove(file_path_dup)
51 |
52 |
--------------------------------------------------------------------------------
/tests/data/testing.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/tests/data/testing.wav
--------------------------------------------------------------------------------
/tests/explicits/test_user_explicits.py:
--------------------------------------------------------------------------------
1 | #pylint: skip-file
2 |
3 | # Setup import path to include the wordlist files
4 | import sys
5 | # Import modules to test
6 | from explicits import UserExplicits
7 | # Testing
8 | import pytest
9 |
10 | # Simplest case - newline-separated word list
11 | # tmpdir - A temporary directory path, supplied by pytest
def test_newline(tmpdir):
    """A newline-separated word file loads into the expected set."""
    expected_words = {"a", "b", "c"}
    # Write the words to a temporary file, one per line
    word_file = tmpdir.join("newlinefile")
    word_file.write("\n".join(expected_words))
    # Load the file through UserExplicits
    loaded = UserExplicits(str(word_file))
    # Everything written must be read back, and no empty entry may appear
    assert loaded.set == expected_words
    assert '' not in loaded.set
24 |
25 | # Nonexistent file given - exception expected
26 | # tmpdir - A temporary directory path, supplied by pytest
def test_nonexistent_file(tmpdir):
    """Building UserExplicits from a missing path raises FileNotFoundError."""
    tmp_path = tmpdir.join("nonexistent")
    # The `message=` keyword was dropped: it is deprecated since pytest 4.1
    # and removed in pytest 5.0, where passing it is an error.
    with pytest.raises(FileNotFoundError):
        # Try to read a nonexistent file
        UserExplicits(str(tmp_path))
32 |
33 | # Non-newline separator (CSV)
34 | # tmpdir - A temporary directory path, supplied by pytest
def test_csv_file(tmpdir):
    """A comma-separated word file loads into the expected set."""
    delimiter = ","
    expected_words = {"a", "b", "c"}
    # Write the words to a temporary file, separated by commas
    csv_file = tmpdir.join("test.csv")
    csv_file.write(delimiter.join(expected_words))
    # Load the file through UserExplicits, passing the separator explicitly
    loaded = UserExplicits(str(csv_file), delimiter)
    # Everything written must be read back
    assert loaded.set == expected_words
48 |
--------------------------------------------------------------------------------
/tests/utils/numbers/test_gcs_time_to_ms.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 |
3 | from utils import gcs_time_to_ms
4 | from google.protobuf.duration_pb2 import Duration
5 |
def test_gcs_time_to_ms_empty():
    """Both the empty string and None are expected to convert to 0."""
    assert gcs_time_to_ms('') == 0
    assert gcs_time_to_ms(None) == 0
8 |
def test_gcs_time_to_ms_just_nanos():
    """A Duration of 900000000 nanos is expected to convert to 900."""
    nanos_only = Duration(nanos=900000000)
    assert gcs_time_to_ms(nanos_only) == 900
13 |
def test_gcs_time_to_ms_just_seconds():
    """A Duration of 2 seconds is expected to convert to 2000."""
    seconds_only = Duration(seconds=2)
    assert gcs_time_to_ms(seconds_only) == 2000
18 |
def test_gcs_time_to_ms_nanos_and_seconds():
    """5 seconds plus 300000000 nanos is expected to convert to 5300."""
    combined = Duration(seconds=5, nanos=300000000)
    assert gcs_time_to_ms(combined) == 5300
24 |
--------------------------------------------------------------------------------
/tests/utils/numbers/test_is_number.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 |
3 | from utils import is_number
4 |
def test_is_number_empty():
    """The empty string must not be reported as a number."""
    verdict = is_number('')
    assert not verdict
7 |
def test_is_number_string():
    """A purely alphabetic string must not be reported as a number."""
    verdict = is_number('cleansio')
    assert not verdict
10 |
def test_is_number_string_and_num():
    """Letters followed by digits must not be reported as a number."""
    verdict = is_number('hello123')
    assert not verdict
13 |
def test_is_number_different_type():
    """An int passed directly (not as a string) must be reported as a number."""
    verdict = is_number(91327)
    assert verdict
16 |
def test_is_number_integer():
    """The string form of a positive integer must be reported as a number."""
    text = str(91327)
    assert is_number(text)
19 |
def test_is_number_negative():
    """The string form of a negative integer must be reported as a number."""
    text = str(-91327)
    assert is_number(text)
22 |
def test_is_number_float():
    """The string form of a positive float must be reported as a number."""
    text = str(91.327)
    assert is_number(text)
25 |
def test_is_number_negative_float():
    """A negative float must be reported as a number, literal or str()-built."""
    literal_text = '-91327.23'
    assert is_number(literal_text)
    converted_text = str(-91327.23)
    assert is_number(converted_text)
29 |
--------------------------------------------------------------------------------
/tests/utils/numbers/test_leading_zero.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 |
3 | from utils import leading_zero
4 |
def test_leading_zero_empty():
    """The empty string is expected to come back unchanged."""
    padded = leading_zero('')
    assert padded == ''
7 |
def test_leading_zero_int_single_digit():
    """The int 4 is expected to become the string '04'."""
    padded = leading_zero(4)
    assert padded == '04'
10 |
def test_leading_zero_int_double_digit():
    """The int 23 is expected to become the string '23' with no padding."""
    padded = leading_zero(23)
    assert padded == '23'
13 |
def test_leading_zero_string_single_digit():
    """The string '9' is expected to become '09'."""
    padded = leading_zero('9')
    assert padded == '09'
16 |
def test_leading_zero_string_triple_digit():
    """The string '504' is expected to come back unchanged."""
    padded = leading_zero('504')
    assert padded == '504'
19 |
--------------------------------------------------------------------------------
/tests/utils/test_cleanup.py:
--------------------------------------------------------------------------------
1 | #pylint: skip-file
2 |
3 | # Standard imports
4 | from uuid import uuid4
5 | from random import randint
6 | import os
7 | import pytest
8 |
9 | # Function to test
10 | from utils import cleanup
11 |
12 | # Tests the cleanup function
def test_cleanup():
    """Check that `cleanup` deletes the temp file and every chunk file.

    File names are taken from the CLEANSIO_TEMP_FILE and CLEANSIO_CHUNKS_LIST
    environment variables when they are set. Otherwise random names are
    generated and published through those variables before `cleanup` runs.
    `cleanup` is expected to end with SystemExit.
    """
    ## Temporary file ##
    if 'CLEANSIO_TEMP_FILE' in os.environ:  # Temp file var exists
        temp_file = os.environ.get('CLEANSIO_TEMP_FILE')
    else:  # Temp file variable doesn't exist
        # Random name in the current directory
        temp_file = "./{0}".format(uuid4().hex)
        os.environ['CLEANSIO_TEMP_FILE'] = temp_file  # For function

    create_temp_file(temp_file)

    ## Chunks ##
    if 'CLEANSIO_CHUNKS_LIST' in os.environ:  # Chunks exist
        chunks_list_env_var = os.environ['CLEANSIO_CHUNKS_LIST']
        # The value has the shape of str(list_of_names), as written by the
        # else-branch below: slice off the leading "['" and trailing "']",
        # then split on the "', '" between names. The original subscripted
        # the `split` method itself, which raises TypeError.
        chunks_list = chunks_list_env_var[2:-2].split('\', \'')
        for chunk_file in chunks_list:
            create_temp_file(chunk_file)
    else:  # Chunks don't exist
        chunks_list = []
        # Generate a random number of chunk files
        for _ in range(randint(3, 10) + 1):
            # Choose a random name for the chunk which we'll generate
            temp_chunk_name = uuid4().hex
            create_temp_file(temp_chunk_name)
            chunks_list.append(temp_chunk_name)
        os.environ['CLEANSIO_CHUNKS_LIST'] = str(chunks_list)

    with pytest.raises(SystemExit):  # Ignore sys.exit()
        cleanup()  # Run the function

    ## Checking whether or not the function worked ##
    # The temporary file shouldn't exist after the cleanup function runs
    assert not exists(temp_file)

    for chunk in chunks_list:  # None of the "chunk files" should exist
        assert not exists(chunk)
54 |
def exists(file_name):
    """ Reports whether a file with the given name can be opened for reading. """
    try:
        with open(file_name, "r"):
            pass
    except FileNotFoundError:
        return False
    return True
63 |
def create_temp_file(file_name):
    """ Creates a temp file with the given name, if it doesn't already exist

    The new file is filled with a random hex string. When `file_name` already
    exists it is left untouched and FileExistsError is raised.
    """
    try:
        # Mode "x" creates the file and fails if it is already there, so the
        # existence check and the creation are a single step. The `with`
        # block closes the handle even if the write raises.
        with open(file_name, "x") as new_file:
            new_file.write(uuid4().hex)
    except FileExistsError:
        err_str = "File {0} already exists, won't overwrite".format(file_name)
        raise FileExistsError(err_str) from None
74 |
--------------------------------------------------------------------------------
/tests/utils/test_files.py:
--------------------------------------------------------------------------------
1 | # pylint: skip-file
2 |
3 | from utils import file_name_no_ext, append_before_ext
4 |
5 | # file_name_no_ext
6 |
def test_file_name_no_ext():
    """An absolute path to audio.wav is expected to reduce to 'audio'."""
    stem = file_name_no_ext('/Users/bob/folder/audio.wav')
    assert stem == 'audio'
9 |
def test_file_name_no_ext_empty():
    """The empty string is expected to come back as the empty string."""
    stem = file_name_no_ext('')
    assert stem == ''
12 |
def test_file_name_no_ext_with_slash():
    """'/c/user/file.mp3' is expected to reduce to 'file'."""
    stem = file_name_no_ext('/c/user/file.mp3')
    assert stem == 'file'
15 |
def test_file_name_no_ext_only_file():
    """A bare file name with punctuation keeps everything before the extension."""
    stem = file_name_no_ext('file!!.mp3')
    assert stem == 'file!!'
18 |
19 | # append_before_ext
20 |
def test_append_before_ext_empty_string_empty_addition():
    """Empty name plus empty addition is expected to give the empty string."""
    combined = append_before_ext('', '')
    assert combined == ''
23 |
def test_append_before_ext_empty_string():
    """Empty name plus '-acc' is expected to give just '-acc'."""
    combined = append_before_ext('', '-acc')
    assert combined == '-acc'
26 |
def test_append_before_ext_empty_addition():
    """An empty addition is expected to leave 'cleansio.wav' unchanged."""
    combined = append_before_ext('cleansio.wav', '')
    assert combined == 'cleansio.wav'
29 |
def test_append_before_ext_no_extension_empty_string():
    """An empty addition is expected to leave an extensionless name unchanged."""
    combined = append_before_ext('cleansio', '')
    assert combined == 'cleansio'
32 |
def test_append_before_ext_no_extension():
    """With no extension present the addition is expected at the very end."""
    combined = append_before_ext('cleansio', 'extra')
    assert combined == 'cleansioextra'
35 |
def test_append_before_ext_extension_1_dot():
    """The addition is expected directly before the single extension."""
    combined = append_before_ext('cleansio.wav', '-acc')
    assert combined == 'cleansio-acc.wav'
38 |
def test_append_before_ext_extension_2_dots():
    """With two dots the addition is expected before the last dot only."""
    combined = append_before_ext('cleansio.w.av', '-acc')
    assert combined == 'cleansio.w-acc.av'
41 |
def test_append_before_ext_extension_3_dots():
    """With three dots the addition is expected before the last dot only."""
    combined = append_before_ext('cleansio.w.a.v', '-acc')
    assert combined == 'cleansio.w.a-acc.v'
44 |
--------------------------------------------------------------------------------