├── .gitignore
├── .pylintrc
├── .travis.yml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── bin
    ├── .travis_doc_url_fix
    ├── docker_push
    ├── generate_docs
    ├── lint
    └── test
├── cleansio
    ├── audio
    │   ├── __init__.py
    │   ├── accuracy.py
    │   ├── audio_file.py
    │   ├── chunk_wrapper.py
    │   └── convert.py
    ├── censor
    │   ├── __init__.py
    │   ├── censor.py
    │   ├── censor_file.py
    │   ├── censor_realtime.py
    │   └── censor_realtime_mac.py
    ├── cleansio.py
    ├── data
    │   ├── encoding-types
    │   └── explicits-list
    ├── explicits
    │   ├── __init__.py
    │   ├── explicits.py
    │   └── user_explicits.py
    ├── speech
    │   ├── __init__.py
    │   ├── timestamp.py
    │   └── transcribe.py
    └── utils
    │   ├── __init__.py
    │   ├── cleanup.py
    │   ├── cli.py
    │   ├── constants.py
    │   ├── env.py
    │   ├── files.py
    │   ├── mac.py
    │   └── numbers.py
├── docs
    ├── Makefile
    ├── _static
    │   └── .keep
    ├── _templates
    │   └── .keep
    ├── conf.py
    ├── index.rst
    └── make.bat
├── media
    ├── logo.png
    └── poster.png
├── requirements-dev.txt
├── requirements-mac.txt
├── requirements.txt
└── tests
    ├── accuracy
        └── test_loudness_maximization.py
    ├── censor
        └── test_censor.py
    ├── data
        └── testing.wav
    ├── explicits
        └── test_user_explicits.py
    └── utils
        ├── numbers
            ├── test_gcs_time_to_ms.py
            ├── test_is_number.py
            └── test_leading_zero.py
        ├── test_cleanup.py
        └── test_files.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | .pytest_cache
 3 | __pycache__/
 4 | *.bak
 5 | .cache/
 6 | 
 7 | # Miscellaneous
 8 | *.swp
 9 | 
10 | # Documentation
11 | docs/_build
12 | docs/*.rst
13 | !docs/index.rst
14 | docs/_static/classes.png
15 | docs/_static/packages.png
16 | 
17 | # Compiled Python files
18 | *.pyc
19 | 
20 | # VIM settings
21 | .vimrc
22 | 
23 | # Audio files
24 | *.ac3
25 | *.adx
26 | *.aiff
27 | *.alaw
28 | *.asf
29 | *.au
30 | *.avi
31 | *.eac3
32 | *.f32be
33 | *.f32le
34 | *.f64be
35 | *.f64le
36 | *.ffmetadata
37 | *.flac
38 | *.flv
39 | *.g722
40 | *.mmf
41 | *.mp3
42 | *.mpeg
43 | *.mpegts
44 | *.mulaw
45 | *.nut
46 | *.ogg
47 | *.rm
48 | *.rso
49 | *.s16be
50 | *.s16le
51 | *.s24be
52 | *.s24le
53 | *.s32be
54 | *.s32le
55 | *.s8
56 | *.smjpeg
57 | *.sox
58 | *.spdif
59 | *.swf
60 | *.u16be
61 | *.u16le
62 | *.u24be
63 | *.u24le
64 | *.u32be
65 | *.u32le
66 | *.u8
67 | *.voc
68 | *.wav
69 | 
70 | # Test audio file
71 | !tests/data/testing.wav
72 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=fixme,import-error,no-else-return,no-member,too-few-public-methods,too-many-arguments,useless-super-delegation,too-many-locals,too-many-instance-attributes,broad-except,no-self-use
3 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: required
 3 | services:
 4 |   - docker
 5 | python:
 6 |   - "3.5"
 7 |   - "3.6"
 8 | install:
 9 |   # Needed for the UML diagrams
10 |   - sudo apt-get install graphviz libportaudio2
11 |   - pip install -r requirements.txt -r requirements-dev.txt -r requirements-mac.txt
12 | script:
13 |   - ./bin/lint
14 |   - ./bin/test
15 |   - ./bin/generate_docs
16 |   - ./bin/.travis_doc_url_fix
17 | deploy:
18 |   - provider: script
19 |     script: bash bin/docker_push
20 |     on:
21 |       branch: master
22 |   - provider: pages
23 |     github-token: $GITHUB_TOKEN
24 |     keep-history: false
25 |     local-dir: docs/_build/html
26 |     on:
27 |       branch: master
28 |     skip-cleanup: true
29 |     target-branch: gh-pages
30 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | When contributing to this repository, please first discuss the change you wish to make via issue,
 4 | email, or any other method with the owners of this repository before making a change.
 5 | 
 6 | ## Pull Request Process
 7 | 
 8 | 1. Create an issue for what you want to contribute and create a pull request that links to that issue.
 9 | 2. Update the README.md with details of changes to the interface, this includes new libraries, new environment variables, exposed ports, useful file locations and container parameters.
10 | 3. For new functionality and new libraries, create new Wiki pages that explain them.
11 | 4. Merge the Pull Request once you have the approval of another developer (preferably using **Squash and merge**). If you do not have permission to do that, you may request the reviewer to merge it for you.
12 | 
13 | ## Code of Conduct
14 | 
15 | ### Our Pledge
16 | 
17 | In the interest of fostering an open and welcoming environment, we as
18 | contributors and maintainers pledge to making participation in our project and
19 | our community a harassment-free experience for everyone, regardless of age, body
20 | size, disability, ethnicity, gender identity and expression, level of experience,
21 | nationality, personal appearance, race, religion, or sexual identity and
22 | orientation.
23 | 
24 | ### Our Standards
25 | 
26 | Examples of behavior that contributes to creating a positive environment
27 | include:
28 | 
29 | * Using welcoming and inclusive language
30 | * Being respectful of differing viewpoints and experiences
31 | * Gracefully accepting constructive criticism
32 | * Focusing on what is best for the community
33 | * Showing empathy towards other community members
34 | 
35 | Examples of unacceptable behavior by participants include:
36 | 
37 | * The use of sexualized language or imagery and unwelcome sexual attention or
38 | advances
39 | * Trolling, insulting/derogatory comments, and personal or political attacks
40 | * Public or private harassment
41 | * Publishing others' private information, such as a physical or electronic
42 |   address, without explicit permission
43 | * Other conduct which could reasonably be considered inappropriate in a
44 |   professional setting
45 | 
46 | ### Our Responsibilities
47 | 
48 | Project maintainers are responsible for clarifying the standards of acceptable
49 | behavior and are expected to take appropriate and fair corrective action in
50 | response to any instances of unacceptable behavior.
51 | 
52 | Project maintainers have the right and responsibility to remove, edit, or
53 | reject comments, commits, code, wiki edits, issues, and other contributions
54 | that are not aligned to this Code of Conduct, or to ban temporarily or
55 | permanently any contributor for other behaviors that they deem inappropriate,
56 | threatening, offensive, or harmful.
57 | 
58 | ### Scope
59 | 
60 | This Code of Conduct applies both within project spaces and in public spaces
61 | when an individual is representing the project or its community. Examples of
62 | representing a project or community include using an official project e-mail
63 | address, posting via an official social media account, or acting as an appointed
64 | representative at an online or offline event. Representation of a project may be
65 | further defined and clarified by project maintainers.
66 | 
67 | ### Enforcement
68 | 
69 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
70 | reported by contacting the project team at cleanseaudio@gmail.com. All
71 | complaints will be reviewed and investigated and will result in a response that
72 | is deemed necessary and appropriate to the circumstances. The project team is
73 | obligated to maintain confidentiality with regard to the reporter of an incident.
74 | Further details of specific enforcement policies may be posted separately.
75 | 
76 | Project maintainers who do not follow or enforce the Code of Conduct in good
77 | faith may face temporary or permanent repercussions as determined by other
78 | members of the project's leadership.
79 | 
80 | ### Attribution
81 | 
82 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
83 | available at [http://contributor-covenant.org/version/1/4][version]
84 | 
85 | [homepage]: http://contributor-covenant.org
86 | [version]: http://contributor-covenant.org/version/1/4/
87 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.5
 2 | 
 3 | USER root
 4 | WORKDIR /root
 5 | 
 6 | COPY . cleansio
 7 | 
 8 | #===============================================================================
 9 | # Install Libraries
10 | #===============================================================================
11 | RUN apt-get update &&  \
12 |   apt-get -qqy install \
13 |   # For pydub
14 |   libav-tools          \
15 |   libavcodec-extra
16 | 
17 | WORKDIR cleansio
18 | 
19 | RUN pip install -r requirements.txt
20 | 
21 | #===============================================================================
22 | # Set Google Speech API
23 | #===============================================================================
24 | ENV GOOGLE_APPLICATION_CREDENTIALS=/google-cloud-speech-api.json
25 | 
26 | #===============================================================================
27 | # Execute cleansio
28 | #===============================================================================
29 | ENTRYPOINT ["python", "cleansio/cleansio.py"]
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Patrick Duncan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Cleansio
 2 | 
 3 | [![Build Status](https://travis-ci.com/PatrickDuncan/cleansio.svg?token=9iihWUtXPiNNfbJx3N13&branch=master)](https://travis-ci.com/PatrickDuncan/cleansio) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Documentation Status](https://readthedocs.org/projects/cleansio/badge/?version=latest)](https://cleansio.readthedocs.io/en/latest/?badge=latest)
 4 | 
 5 | 
 6 | <img src="media/logo.png" width="200px" alt="logo">
 7 | 
 8 | ## Usage
 9 | 
10 | ```sh
11 | python cleansio/cleansio.py --help
12 | ```
13 | 
14 | ### Requirements
15 | 
16 | 1. POSIX shell
17 | 2. Internet connection
18 | 
19 | ## Setup
20 | 
21 | 1. Check the Requirements
22 | 2. Clone this repo
23 | 3. Install Python 3.5+
24 |     - [Anaconda is recommended](https://www.anaconda.com/download/)
25 | 4. Download your Google Cloud Credentials JSON file
26 | 5. Set the following environment variables:
27 |     ```sh
28 |     export GOOGLE_APPLICATION_CREDENTIALS=<PATH_TO_JSON>
29 |     ```
30 | 6. Follow these additional steps:
31 |     - [ffmpeg set-up](https://github.com/jiaaro/pydub#getting-ffmpeg-set-up)
32 | 7. Install Cleansio's dependencies:
33 |     ```sh
34 |     pip install -r requirements.txt
35 |     ```
36 | 8. Follow the **[real-time setup instructions](https://github.com/PatrickDuncan/cleansio/wiki/Real-Time-Installation)**
37 | 9. _(OPTIONAL)_ If you're a developer run:
38 |     ```sh
39 |     pip install -r requirements-dev.txt
40 |     ```
41 | 10. _(OPTIONAL)_ Install the [Google Cloud SDK](https://cloud.google.com/sdk/docs/)
42 | 11. You're all set!
43 | 
44 | ## Documentation
45 | 
46 | [Technical Documentation](https://patrickdduncan.com/cleansio)
47 | 
48 | [Slideshow](https://patrickdduncan.com/clenasio-slideshow)
49 | 
50 | **Build Locally.** Available at _docs/\_build/html/index.html_
51 | ```sh
52 | ./bin/generate_docs
53 | ```
54 | 
55 | ### Linting
56 | 
57 | **Run**
58 | ```sh
59 | ./bin/lint
60 | ```
61 | 
62 | **Help**
63 | ```sh
64 | pylint --help-msg=<ID>
65 | ```
66 | 
67 | ### Testing
68 | 
69 | ```sh
70 | ./bin/test
71 | ```
72 | 
73 | ## Docker
74 | 
75 | **Run**
76 | ```sh
77 | docker run \
78 |   --tty \
79 |   --rm \
80 |   --volume <PATH_TO_MUSIC>:/music \
81 |   --volume <PATH_TO_GOOGLE_CLOUD_SPEECH_JSON>:/google-cloud-speech-api.json \
82 |   --name cleansio \
83 |   patrickduncan/cleansio \
84 |   /music/<MUSIC_FILE_NAME>
85 | ```
86 | 
87 | **Build**
88 | ```sh
89 | docker build --tag "cleansio:dev" .
90 | ```
91 | 
92 | ## Contributors
93 | 
94 | <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
95 | | [<img src="https://avatars.githubusercontent.com/u/6889074?v=3" width="100px;"/><br /><sub><b>Patrick D. Duncan</b></sub>](https://patrickduncan.co)<br /> [💻](https://github.com/patrickduncan/cleansio/commits?author=patrickduncan) | [<img src="https://avatars.githubusercontent.com/u/11710526?v=3" width="100px;"/><br /><sub><b>Levin Noronha</b></sub>](https://github.com/levin-noro)<br /> [💻](https://github.com/patrickduncan/cleansio/commits?author=levin-noro) | [<img src="https://avatars.githubusercontent.com/u/15528033?v=3" width="100px;"/><br /><sub><b>Corie Bain</b></sub>](https://github.com/c-bain)<br /> [💻](https://github.com/patrickduncan/cleansio/commits?author=c-bain) | [<img src="https://avatars.githubusercontent.com/u/1454713?v=3" width="100px;"/><br /><sub><b>Victor Carri</b></sub>](https://github.com/VictorCarri)<br /> [💻](https://github.com/patrickduncan/cleansio/commits?author=VictorCarri) |
96 | | :---: | :---: | :---: | :---: |
97 | <!-- ALL-CONTRIBUTORS-LIST:END -->
98 | 


--------------------------------------------------------------------------------
/bin/.travis_doc_url_fix:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Underscores break GitHub Pages URLs. Remove them from folder/file identifiers.
 4 | for file in $(find docs/_build/html -name "*_*" | sort -r); do
 5 |   dir=$(dirname "$file")
 6 |   mv "$file" "$dir/$(basename "$file" | tr -d _)"
 7 | done
 8 | 
 9 | # Remove underscores from the HTML files
10 | # TODO: Turn into 1 sed call
11 | sed -i 's/\(href[ ]*=[ ]*".*\)_/\1/g' docs/_build/html/*.html
12 | sed -i 's/\(href[ ]*=[ ]*".*\)_/\1/g' docs/_build/html/*.html
13 | sed -i 's/\(src[ ]*=[ ]*".*\)_/\1/g' docs/_build/html/*.html
14 | sed -i 's/\(src[ ]*=[ ]*".*\)_/\1/g' docs/_build/html/*.html
15 | 


--------------------------------------------------------------------------------
/bin/docker_push:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | docker build --tag "$DOCKER_USERNAME/cleansio:latest" .
4 | echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
5 | docker push "$DOCKER_USERNAME"/cleansio
6 | 


--------------------------------------------------------------------------------
/bin/generate_docs:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | echo 'WARNING: This will take a long time'
 4 | echo
 5 | # Generate the class and package diagrams
 6 | (cd cleansio && pyreverse -o png */*.py)
 7 | mv cleansio/classes.png cleansio/packages.png docs/_static
 8 | 
 9 | # Generate the documentation files
10 | find docs -name '*.rst' ! -name 'index.rst' -type f -exec rm -f {} +
11 | sphinx-apidoc -o docs cleansio
12 | (cd docs && make html)
13 | 


--------------------------------------------------------------------------------
/bin/lint:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Lints every Python file on its own; the exit status is 1 if any file fails.
 4 | # find passes the paths to the inner shell as arguments, so a path containing
 5 | # whitespace or glob characters reaches pylint in one piece.
 6 | find . -name "*.py" -exec sh -c '
 7 |   status=0
 8 |   for file in "$@"; do
 9 |     pylint --load-plugins pylint_quotes "$file" || status=1
10 |   done
11 |   exit "$status"' sh {} + || exit 1
12 | 


--------------------------------------------------------------------------------
/bin/test:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | export PYTHONPATH=cleansio && pytest
4 | 


--------------------------------------------------------------------------------
/cleansio/audio/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 | 
3 | from .audio_file import AudioFile
4 | from .chunk_wrapper import ChunkWrapper
5 | from .convert import read_and_convert_audio, convert_audio_segment, convert_and_write_chunk
6 | from .accuracy import improve_accuracy
7 | 


--------------------------------------------------------------------------------
/cleansio/audio/accuracy.py:
--------------------------------------------------------------------------------
 1 | """ Functions which preprocess an audio chunk to improve the accuracy of speech
 2 |     recognition. """
 3 | 
 4 | def __maximize_volume(chunk):
 5 |     return chunk - chunk.max_dBFS # Offsets the chunk by its own max_dBFS
 6 | 
 7 | def improve_accuracy(chunk):
 8 |     """ Returns the chunk after the steps meant to help speech recognition """
 9 |     return __maximize_volume(chunk)
10 | 


--------------------------------------------------------------------------------
/cleansio/audio/audio_file.py:
--------------------------------------------------------------------------------
 1 | """ Classifies an audio file that will be broken up into chunks """
 2 | 
 3 | import sys
 4 | from textwrap import dedent
 5 | from pydub import AudioSegment
 6 | from colorama import Fore
 7 | from utils import create_temp_dir, create_env_var, file_name_no_ext, \
 8 |     append_before_ext, CHUNK_LEN
 9 | from .accuracy import improve_accuracy
10 | from .convert import convert
11 | 
12 | class AudioFile:
13 |     """ Classifies an audio file """
14 |     def __init__(self, file_path):
15 |         try:
16 |             self.file_path = convert(file_path)
17 |         except FileNotFoundError:
18 |             self.__handle_file_not_found(file_path)
19 |         self.encoding = 'LINEAR16'
20 |         audio_segment = AudioSegment.from_file(self.file_path)
21 |         self.channels = audio_segment.channels
22 |         self.frame_rate = audio_segment.frame_rate
23 |         self.normal_chunks, self.overlapping_chunks = \
24 |             self.__init_create_chunks(audio_segment)
25 | 
26 |     def __init_create_chunks(self, audio_segment):
27 |         """ Breaks up the file into small chunks """
28 |         temp_dir = create_temp_dir()
29 |         normal_chunks = []
30 |         overlapping_chunks = []
31 |         # Create normal and overlapping chunks
32 |         self.__create_chunks(audio_segment, temp_dir, normal_chunks, False)
33 |         self.__create_chunks(audio_segment, temp_dir, overlapping_chunks, True)
34 |         # Add the list of chunk filepaths to an ENV variable for post cleanup
35 |         create_env_var(
36 |             'CLEANSIO_CHUNKS_LIST', str(normal_chunks + overlapping_chunks))
37 |         return normal_chunks, overlapping_chunks
38 | 
39 |     def __create_chunks(self, audio_segment, temp_dir, chunks_arr, overlapping):
40 |         start = 2500 if overlapping else 0
41 |         chunks = audio_segment[start::CHUNK_LEN]
42 |         for index, chunk in enumerate(chunks):
43 |             chunk_path = self.__create_chunk(
44 |                 index, chunk, 'wav', temp_dir, overlapping)
45 |             chunks_arr.append(chunk_path)
46 |         # Fix for when the last chunk isn't long enough to support overlapping
47 |         self.__last_overlapping_chunk(
48 |             audio_segment, temp_dir, chunks_arr, overlapping)
49 | 
50 |     def __create_chunk(self, index, chunk, extension, temp_dir, overlapping):
51 |         file_name = file_name_no_ext(self.file_path)
52 |         file_path = temp_dir + file_name + '-' + str(index) + '.' + extension
53 |         if overlapping:
54 |             file_path = append_before_ext(file_path, '-overlapping')
55 |         # Chunk that will be modified for accuracy's sake
56 |         accuracy_path = append_before_ext(file_path, '-accuracy')
57 |         with open(accuracy_path, 'wb') as chunk_file:
58 |             accuracy_chunk = improve_accuracy(chunk)
59 |             accuracy_chunk.export(chunk_file, format=extension)
60 |             if overlapping: # The normal overlapping chunk is not needed
61 |                 return chunk_file.name
62 |         # Chunk that will be censored and preserve audio quality
63 |         with open(file_path, 'wb') as chunk_file:
64 |             chunk.export(chunk_file, format=extension)
65 |             return chunk_file.name
66 | 
67 |     def __last_overlapping_chunk(
68 |             self, audio_segment, temp_dir, chunks_arr, overlapping):
69 |         """ Check if the chunk is long enough to support overlapping """
70 |         if overlapping and len(audio_segment) % CHUNK_LEN < 4000:
71 |             chunk_path = self.__create_chunk(
72 |                 len(audio_segment) // CHUNK_LEN, # Last index
73 |                 AudioSegment.silent(frame_rate=44100),   # Silent chunk
74 |                 'wav', temp_dir, overlapping)
75 |             chunks_arr.append(chunk_path)
76 | 
77 |     @classmethod
78 |     def __handle_file_not_found(cls, file_path):
79 |         print(dedent('''\
80 |             {0}Audio file '{1}{2}{0}' could not be found
81 |             Make sure the audio file path is correct.\
82 |         '''.format(Fore.RED, Fore.YELLOW, str(file_path))))
83 |         sys.exit(0)
84 | 


--------------------------------------------------------------------------------
/cleansio/audio/chunk_wrapper.py:
--------------------------------------------------------------------------------
 1 | """ Wrapper for pydub's AudioSegment, used to add new properties """
 2 | 
 3 | class ChunkWrapper():
 4 |     """ Wrapper for pydub's AudioSegment """
 5 | 
 6 |     def __init__(self, audio_segment, mute_next_chunk=0):
 7 |         super().__init__()
 8 |         self.segment = audio_segment
 9 |         self.mute_next_start = mute_next_chunk
10 | 


--------------------------------------------------------------------------------
/cleansio/audio/convert.py:
--------------------------------------------------------------------------------
 1 | """ Converts audio properties """
 2 | 
 3 | import os
 4 | from pydub import AudioSegment
 5 | from utils import create_temp_dir, time_filename
 6 | 
 7 | def __sample_rate(audio_segment):
 8 |     """ GCS requires at least 16 kHz. Either upscale or keep the same. """
 9 |     frame_rate = audio_segment.frame_rate
10 |     return 16000 if frame_rate < 16000 else frame_rate
11 | 
12 | def __create_converted_file(file_path, encoding):
13 |     """ LINEAR16 must be mono and 16 bits (2) """
14 |     audio_segment = AudioSegment.from_file(file_path)
15 |     audio_segment                                     \
16 |         .set_channels(1)                              \
17 |         .set_sample_width(2)                          \
18 |         .set_frame_rate(__sample_rate(audio_segment)) \
19 |         .export(os.environ['CLEANSIO_TEMP_FILE'], format=encoding)
20 | 
21 | def convert(file_path, encoding='wav'):
22 |     """ Converts an audio file's encoding, returns the file path """
23 |     temp_dir = create_temp_dir()
24 |     os.environ['CLEANSIO_TEMP_FILE'] = temp_dir + \
25 |         str(time_filename()) + '.' + encoding
26 |     __create_converted_file(file_path, encoding)
27 |     return os.environ['CLEANSIO_TEMP_FILE']
28 | 
29 | def read_and_convert_audio(file_path):
30 |     """ Create a GCS AudioSegment from the file_path """
31 |     audio_segment = AudioSegment.from_file(file_path)
32 |     audio_segment            \
33 |         .set_channels(1)     \
34 |         .set_sample_width(2) \
35 |         .set_frame_rate(__sample_rate(audio_segment))
36 |     return audio_segment
37 | 
38 | def convert_audio_segment(audio_segment):
39 |     """ Create a GCS AudioSegment """
40 |     audio_segment            \
41 |         .set_channels(1)     \
42 |         .set_sample_width(2) \
43 |         .set_frame_rate(__sample_rate(audio_segment))
44 |     return audio_segment
45 | 
46 | def convert_and_write_chunk(chunk, file_path, encoding):
47 |     """ Create a GCS AudioSegment and write to the file path """
48 |     chunk                      \
49 |         .set_channels(1)       \
50 |         .set_sample_width(2)   \
51 |         .set_frame_rate(44100) \
52 |         .export(file_path, format=encoding)
53 | 


--------------------------------------------------------------------------------
/cleansio/censor/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 | 
3 | from .censor_file import CensorFile
4 | from .censor_realtime import CensorRealtime
5 | from .censor import Censor
6 | 


--------------------------------------------------------------------------------
/cleansio/censor/censor.py:
--------------------------------------------------------------------------------
  1 | """ Censors audio chunks by muting explicit sections """
  2 | 
  3 | from multiprocessing import Lock
  4 | from pathlib import Path
  5 | from colorama import Fore
  6 | from pydub import AudioSegment
  7 | from utils import CHUNK_LEN
  8 | from audio import ChunkWrapper
  9 | from speech import Timestamp, Transcribe
 10 | 
 11 | class Censor():
 12 |     """ Superclass of CensorFile and CensorRealtime """
 13 |     lock = Lock()
 14 |     explicit_count = 0
 15 |     muted_timestamps = []
 16 | 
 17 |     def __init__(self, explicits, output_encoding, output_location):
 18 |         super().__init__()
 19 |         self.explicits = explicits
 20 |         self.encoding = self.__encoding(output_encoding)
 21 |         self.location = self.__location(output_location)
 22 | 
 23 |     def censor_audio_chunk(self, file_path):
 24 |         """ Common process to censor an audio chunk """
 25 |         audio_segment = AudioSegment.from_file(file_path)
 26 |         lyrics = self.__get_lyrics(file_path, audio_segment)
 27 |         timestamps = self.__get_timestamps(lyrics)
 28 |         wrapper = ChunkWrapper(audio_segment)
 29 |         if timestamps:
 30 |             return self.__mute_explicits(file_path, wrapper, timestamps)
 31 |         else: # No mute so just return the original file
 32 |             return wrapper
 33 | 
 34 |     def create_clean_file(self, clean_file):
 35 |         """ Write cleaned up AudioSegment object to an audio file """
 36 |         self.print_explicits_count()
 37 |         clean_file.export(self.location, format=self.encoding)
 38 |         print(Fore.CYAN + 'Successfully created clean file, it\'s located at:')
 39 |         print(Fore.YELLOW + self.location)
 40 | 
 41 |     def print_explicits_count(self):
 42 |         """ Display to user the number of explicts Cleansio detected """
 43 |         print('Cleansio found {1}{0}{2} explicit(s)!'.format(
 44 |             Censor.explicit_count, Fore.GREEN, Fore.RESET))
 45 | 
 46 |     def __mute_explicits(self, file_path, wrapper, timestamps):
 47 |         """ Go through each word, if its an explicit, mute the duration """
 48 |         for stamp in timestamps:
 49 |             if stamp['word'] in self.explicits: # Explicit found, mute
 50 |                 chunk_index = int(file_path.split('-')[-1].split('.')[0])
 51 |                 wrapper = self.__mute_explicit(wrapper, stamp)
 52 |                 self.__explicit_count(stamp, chunk_index * CHUNK_LEN)
 53 |         return wrapper
 54 | 
 55 |     def __location(self, location):
 56 |         if location:
 57 |             return location[0]
 58 |         current_dir = str(Path(__file__).parents[2])
 59 |         return current_dir + '/clean_file.' + self.encoding
 60 | 
 61 |     def __encoding(self, encoding):
 62 |         return encoding[0] if encoding else 'wav'
 63 | 
 64 |     @classmethod
 65 |     def __mute_explicit(cls, wrapper, timestamp):
 66 |         len_as = len(wrapper.segment)
 67 |         # Check if the timestamp is outside of this chunk (from overlapping)
 68 |         if timestamp['start'] > len_as:
 69 |             return wrapper
 70 |         beginning = wrapper.segment[:timestamp['start']]
 71 |         # The end of the timestamp cannot be longer than the file
 72 |         end_time = len_as if len_as < timestamp['end'] else timestamp['end']
 73 |         duration = end_time - timestamp['start']
 74 |         mute = AudioSegment.silent(duration=duration)
 75 |         end = wrapper.segment[end_time:]
 76 |         wrapper.segment = (beginning + mute + end)
 77 |         wrapper.mute_next_start = \
 78 |             cls.__mute_next_chunk(wrapper, timestamp['end'])
 79 |         return wrapper
 80 | 
 81 |     @classmethod
 82 |     def __mute_next_chunk(cls, wrapper, end_time):
 83 |         # Store how much the next chunk should mute from its beginning
 84 |         extra_time = end_time - CHUNK_LEN
 85 |         return max(extra_time, wrapper.mute_next_start)
 86 | 
 87 |     @classmethod
 88 |     def __get_lyrics(cls, file_path, audio_segment):
 89 |         return Transcribe(file_path, audio_segment.frame_rate).lyrics
 90 | 
 91 |     @classmethod
 92 |     def __get_timestamps(cls, lyrics):
 93 |         return Timestamp(lyrics).timestamps
 94 | 
 95 |     @classmethod
 96 |     def __explicit_count(cls, stamp, chunk_offset):
 97 |         """ Count the number of explicits safely """
 98 |         stamp['start'] += chunk_offset
 99 |         stamp['end'] += chunk_offset
100 |         new_stamp = True
101 |         Censor.lock.acquire()
102 |         for mut in Censor.muted_timestamps:
103 |             if cls.__duplicate_stamp(mut, stamp):
104 |                 new_stamp = False
105 |                 break
106 |         if new_stamp or not Censor.muted_timestamps:
107 |             Censor.explicit_count += 1
108 |             Censor.muted_timestamps.append(stamp)
109 |         Censor.lock.release()
110 | 
111 |     @classmethod
112 |     def __duplicate_stamp(cls, stamp1, stamp2):
113 |         """ If 2 timestamps are the same word and start and at relatively the
114 |             same time, then assume they're the same timestamp """
115 |         if stamp1['word'] == stamp2['word'] and            \
116 |           abs(stamp1['start'] - stamp2['start']) < 201 and \
117 |           abs(stamp1['end'] - stamp2['end']) < 201:
118 |             return True
119 |         return False
120 | 


--------------------------------------------------------------------------------
/cleansio/censor/censor_file.py:
--------------------------------------------------------------------------------
 1 | """ Creates a clean version of a file by removing explicits """
 2 | 
 3 | from itertools import repeat
 4 | from multiprocessing.dummy import Pool as ThreadPool
 5 | from colorama import Fore, Style
 6 | from tqdm import tqdm
 7 | from pydub import AudioSegment
 8 | from audio import AudioFile
 9 | from .censor import Censor
10 | 
11 | class CensorFile(Censor):
12 |     """ Removes explicits from a file """
13 |     def __init__(self, args, explicits):
14 |         super().__init__(explicits, args.output_encoding, args.output_location)
15 |         self.file_path = args.file_path
16 | 
17 |     def censor(self):
18 |         """ Creates a clean/new version of a file by removing explicits """
19 |         audio_file = AudioFile(self.file_path)
20 |         # Define the CLI progress bar
21 |         p_bar, p_bar_step = self.__progress_bar(audio_file.normal_chunks)
22 |         async_iter = zip(
23 |             repeat(p_bar),
24 |             repeat(p_bar_step),
25 |             audio_file.normal_chunks)
26 |         # Censor each audio chunk file asynchronously
27 |         censored_chunks = ThreadPool(6).map(self.__censor_chunk, async_iter)
28 |         clean_file = self.__create_clean_segment(censored_chunks)
29 |         p_bar.close()
30 |         self.create_clean_file(clean_file)
31 | 
32 |     def __censor_chunk(self, async_iter):
33 |         """ Censors a chunk and updates the progress bar """
34 |         p_bar, p_bar_step, chunk_file_path = async_iter
35 |         p_bar.update(p_bar_step)
36 |         return self.censor_audio_chunk(chunk_file_path)
37 | 
38 |     def __create_clean_file(self, clean_file):
39 |         exp = 'explicit' if Censor.explicit_count == 1 else 'explicits'
40 |         print('Cleansio found {1}{2}{0}{3} {4}!'.format(
41 |             Censor.explicit_count, Style.BRIGHT, Fore.GREEN, Fore.RESET, exp))
42 |         clean_file.export(self.location, format=self.encoding)
43 |         print(Fore.CYAN + 'Successfully created clean file, it\'s located at:')
44 |         print(Fore.YELLOW + self.location)
45 | 
46 |     @classmethod
47 |     def __create_clean_segment(cls, censored_chunks):
48 |         clean_file = AudioSegment.empty()
49 |         s_mute = 0
50 |         for wrapper in censored_chunks: # Join the chunks together
51 |             # Mute the start of a chunk based on the previous chunk
52 |             clean_file += \
53 |                 AudioSegment.silent(duration=s_mute) + wrapper.segment[s_mute:]
54 |             s_mute = wrapper.mute_next_start
55 |         return clean_file
56 | 
57 |     @classmethod
58 |     def __progress_bar(cls, normal_chunks):
59 |         progress_bar_total = 100
60 |         progress_bar = tqdm(
61 |             # Remove the detailed percentage stats
62 |             bar_format=Style.BRIGHT + Fore.GREEN + '{l_bar}{bar}' + Fore.RESET,
63 |             desc='Censoring file', # Description
64 |             leave=False,           # Remove bar after completion
65 |             ncols=42,              # Set width
66 |             total=progress_bar_total)
67 |         progress_bar_step = (1 / len(normal_chunks)) * progress_bar_total
68 |         return progress_bar, progress_bar_step
69 | 


--------------------------------------------------------------------------------
/cleansio/censor/censor_realtime.py:
--------------------------------------------------------------------------------
 1 | """ Censors audio chunks in a continuous stream """
 2 | 
 3 | import platform
 4 | from colorama import Fore
 5 | from utils import create_env_var
 6 | from .censor_realtime_mac import CensorRealtimeMac
 7 | 
 8 | class CensorRealtime():
 9 |     """ Filters audio stream in real-time """
10 |     def __init__(self, args, explicits):
11 |         super().__init__()
12 |         self.explicits = explicits
13 |         self.args = args
14 |         create_env_var('CLEANSIO_REALTIME', 'true')
15 | 
16 |     def censor(self):
17 |         """ Censors audio in real-time. Implementation dependent on OS """
18 |         system = platform.system()
19 |         if system == 'Darwin':
20 |             CensorRealtimeMac(self.args, self.explicits).censor()
21 |         else:
22 |             print(Fore.RED + 'Real-time does not support your OS' + Fore.RESET)
23 | 


--------------------------------------------------------------------------------
/cleansio/censor/censor_realtime_mac.py:
--------------------------------------------------------------------------------
  1 | """ Censors audio chunks in a continuous stream """
  2 | 
  3 | import os
  4 | import threading
  5 | import sounddevice as sd
  6 | import soundfile as sf
  7 | from pydub import AudioSegment
  8 | from audio import improve_accuracy, convert_and_write_chunk, \
  9 |     read_and_convert_audio
 10 | from utils import create_env_var, create_temp_dir, append_before_ext, \
 11 |     time_filename, MacUtil, CHUNK_LEN
 12 | from .censor import Censor
 13 | 
 14 | class CensorRealtimeMac(Censor):
 15 |     """ Removes explicits from audio stream in real-time """
 16 | 
 17 |     running = True
 18 | 
 19 |     def __init__(self, args, explicits):
 20 |         print('Initialzed realtime censor object')
 21 |         super().__init__(explicits, args.output_encoding, args.output_location)
 22 |         self.__switch_audio_source()
 23 |         create_env_var('CLEANSIO_CHUNKS_LIST', '[]')
 24 |         self.args = args
 25 |         self.directory = create_temp_dir()
 26 |         self.chunk_prefix = self.directory + time_filename() + '-'
 27 |         self.temp_chunk_filepath = self.directory + 'temp_chunk.wav'
 28 |         self.__update_env_chunks_list(self.temp_chunk_filepath)
 29 |         self.clean_file = AudioSegment.empty()
 30 |         self.processing_queue = []
 31 |         self.processing_lock = threading.Lock()
 32 |         self.playback_queue = []
 33 |         self.playback_lock = threading.Lock()
 34 |         self.samplerate = 44100 # Hertz
 35 |         self.duration = 5 # seconds
 36 | 
 37 |     def censor(self):
 38 |         """ Censors audio chunks in a continuous stream """
 39 | 
 40 |         # Start thread that will analyze and censor recorded chunks
 41 |         processing_thread = threading.Thread(target=self.run)
 42 |         processing_thread.daemon = True
 43 |         processing_thread.start()
 44 | 
 45 |         try:
 46 |             # Device indexes in sd.default.device should have already been set
 47 |             #   to Soundflower (2ch) for input and Built-in Output for output.
 48 |             #   Capture stream from Soundflower (2ch) & play to Built-in Output
 49 |             with sd.Stream(samplerate=self.samplerate,
 50 |                            blocksize=int(self.samplerate*self.duration),
 51 |                            channels=1, callback=self.callback,
 52 |                            finished_callback=self.finished_callback):
 53 |                 print('#' * 80)
 54 |                 print('Press Return to stop censoring')
 55 |                 print('#' * 80)
 56 |                 input()
 57 |         except KeyboardInterrupt:
 58 |             print('\nInterrupted by user')
 59 |             CensorRealtimeMac.running = False
 60 |         except Exception as exception:
 61 |             print(type(exception).__name__ + ': ' + str(exception))
 62 |             CensorRealtimeMac.running = False
 63 | 
 64 |     def callback(self, indata, outdata, _, __, status):
 65 |         """ Process audio data from Stream  """
 66 |         if status:
 67 |             print(status)
 68 | 
 69 |         # Add to processing_queue
 70 |         with self.processing_lock:
 71 |             self.processing_queue.append(indata.copy())
 72 | 
 73 |         # Consume playback_queue
 74 |         with self.playback_lock:
 75 |             if self.playback_queue:
 76 |                 outdata[:] = self.playback_queue.pop(0)
 77 |             else:
 78 |                 outdata.fill(0)
 79 | 
 80 |     def finished_callback(self):
 81 |         """ Once stream is inactive, output cleaned recordings to audio file """
 82 |         if self.args.store_recording:
 83 |             trailing_audio_length = len(self.playback_queue) * CHUNK_LEN
 84 |             if trailing_audio_length > 0:
 85 |                 self.clean_file = self.clean_file[:-trailing_audio_length]
 86 |             self.create_clean_file(self.clean_file)
 87 |         else:
 88 |             self.print_explicits_count()
 89 | 
 90 |     def run(self):
 91 |         """ Process 10 seconds of captured audio data at a time """
 92 |         index = 0
 93 |         leftover_mute = 0
 94 | 
 95 |         while True:
 96 |             if not CensorRealtimeMac.running:
 97 |                 break
 98 | 
 99 |             with self.processing_lock:
100 |                 processing_queue_length = len(self.processing_queue)
101 | 
102 |             if processing_queue_length >= 2:
103 |                 with self.processing_lock:
104 |                     frames_to_process = self.processing_queue.pop(0)
105 |                     next_frames = self.processing_queue[0]
106 | 
107 |                 # Convert next two recordings into chunks
108 |                 recorded_chunk, file_path = \
109 |                     self.__convert_frames_to_chunk(frames_to_process, index)
110 |                 next_recorded_chunk, _ = \
111 |                     self.__convert_frames_to_chunk(next_frames, index+1)
112 | 
113 |                 overlapping_chunk, overlapping_path = \
114 |                     self.__create_overlapping_chunk(recorded_chunk,
115 |                                                     next_recorded_chunk,
116 |                                                     file_path)
117 | 
118 |                 # Create accuracy chunk for current chunk and overlapping chunk
119 |                 self.__create_accuracy_chunk(recorded_chunk, file_path)
120 |                 self.__create_accuracy_chunk(overlapping_chunk, overlapping_path)
121 | 
122 |                 # Censor current chunk and also mute any spillover explicits
123 |                 #   from previous chunk
124 |                 clean_chunk_wrapper = self.censor_audio_chunk(file_path)
125 |                 clean_chunk = AudioSegment.silent(duration=leftover_mute) \
126 |                     + clean_chunk_wrapper.segment[leftover_mute:]
127 | 
128 |                 # Remember to mute any overlapping explicit in the next chunk
129 |                 leftover_mute = clean_chunk_wrapper.mute_next_start
130 | 
131 |                 # Convert current chunk into frames and add it to the playback
132 |                 #   queue
133 |                 clean_frames = self.__convert_clean_chunk_to_frames(clean_chunk)
134 |                 with self.playback_lock:
135 |                     self.playback_queue.append(clean_frames)
136 | 
137 |                 if self.args.store_recording:
138 |                     self.clean_file += clean_chunk
139 | 
140 |                 index += 1
141 | 
142 |     def __convert_frames_to_chunk(self, frames, index):
143 |         file_path = self.chunk_prefix + str(index) +'.wav'
144 |         sf.write(file_path, frames, self.samplerate)
145 |         self.__update_env_chunks_list(file_path)
146 |         recorded_chunk = read_and_convert_audio(file_path)
147 |         return recorded_chunk, file_path
148 | 
149 |     def __convert_clean_chunk_to_frames(self, chunk):
150 |         chunk.export(self.temp_chunk_filepath, format='wav')
151 |         clean_frames, _ = sf.read(self.temp_chunk_filepath,
152 |                                   dtype='float32',
153 |                                   fill_value=0.0,
154 |                                   frames=int(self.samplerate*self.duration),
155 |                                   always_2d=True)
156 |         return clean_frames
157 | 
158 |     def __create_overlapping_chunk(self, chunk1, chunk2, file_path):
159 |         overlapping_chunk = chunk1[2500:] + chunk2[:2500]
160 |         overlapping_path = append_before_ext(file_path, '-overlapping')
161 |         convert_and_write_chunk(overlapping_chunk, overlapping_path, 'wav')
162 |         self.__update_env_chunks_list(overlapping_path)
163 |         return overlapping_chunk, overlapping_path
164 | 
165 |     def __create_accuracy_chunk(self, chunk, file_path):
166 |         accuracy_chunk_file_path = append_before_ext(file_path, '-accuracy')
167 |         accuracy_chunk = improve_accuracy(chunk)
168 |         convert_and_write_chunk(accuracy_chunk, accuracy_chunk_file_path, 'wav')
169 | 
170 |     @classmethod
171 |     def __switch_audio_source(cls):
172 |         create_env_var('CLEANSIO_OLD_SOUND_OUT', MacUtil.audio_source('output'))
173 |         create_env_var('CLEANSIO_OLD_SOUND_IN', MacUtil.audio_source('input'))
174 |         MacUtil.switch_audio_source('output', 'Soundflower (2ch)')
175 |         MacUtil.switch_audio_source('input', 'Soundflower (2ch)')
176 |         cls.__set_default_device('Soundflower (2ch)', 'Built-in Output')
177 | 
178 |     @classmethod
179 |     def __set_default_device(cls, input_device_name, output_device_name):
180 |         device_index = 0
181 |         input_device_index = 2 # Soundflower (2ch) is usually no. 2
182 |         output_device_index = 1 # Built-in Output is usually no. 1
183 |         for device in sd.query_devices():
184 |             if device['name'] == input_device_name:
185 |                 input_device_index = device_index
186 |             if device['name'] == output_device_name:
187 |                 output_device_index = device_index
188 |             device_index += 1
189 |         sd.default.device = (input_device_index, output_device_index)
190 | 
191 |     @classmethod
192 |     def __update_env_chunks_list(cls, file_path):
193 |         """ Call after every write for later cleanup """
194 |         env_list = os.environ['CLEANSIO_CHUNKS_LIST']
195 |         beginning = '[\'' if env_list[:-1] == '[' else env_list[:-1] + ', \''
196 |         create_env_var(
197 |             'CLEANSIO_CHUNKS_LIST', beginning + file_path + '\']')
198 | 


--------------------------------------------------------------------------------
/cleansio/cleansio.py:
--------------------------------------------------------------------------------
 1 | """ Censors explicits from an audio file or from a real-time audio stream """
 2 | 
 3 | # Imports from our modules
 4 | from censor import CensorFile, CensorRealtime
 5 | from utils import setup_cleanup, setup_cli_args
 6 | from explicits import Explicits
 7 | 
 8 | def is_file_mode():
 9 |     """ Validates if user is running file mode """
10 |     return ARGS.file_path
11 | 
12 | if __name__ == '__main__':
13 |     setup_cleanup()
14 |     ARGS = setup_cli_args()
15 |     EXPLICITS = Explicits(ARGS).set
16 |     if is_file_mode():
17 |         CensorFile(ARGS, EXPLICITS).censor()
18 |     else:
19 |         CensorRealtime(ARGS, EXPLICITS).censor()
20 | 


--------------------------------------------------------------------------------
/cleansio/data/encoding-types:
--------------------------------------------------------------------------------
 1 | ac3
 2 | adx
 3 | aiff
 4 | alaw
 5 | asf
 6 | au
 7 | avi
 8 | eac3
 9 | f32be
10 | f32le
11 | f64be
12 | f64le
13 | ffmetadata
14 | flac
15 | flv
16 | g722
17 | mmf
18 | mp3
19 | mpeg
20 | mpegts
21 | mulaw
22 | nut
23 | ogg
24 | rm
25 | rso
26 | s16be
27 | s16le
28 | s24be
29 | s24le
30 | s32be
31 | s32le
32 | s8
33 | smjpeg
34 | sox
35 | spdif
36 | swf
37 | u16be
38 | u16le
39 | u24be
40 | u24le
41 | u32be
42 | u32le
43 | u8
44 | voc
45 | wav
46 | 


--------------------------------------------------------------------------------
/cleansio/data/explicits-list:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/cleansio/data/explicits-list


--------------------------------------------------------------------------------
/cleansio/explicits/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 | 
3 | from .explicits import Explicits
4 | from .user_explicits import UserExplicits
5 | 


--------------------------------------------------------------------------------
/cleansio/explicits/explicits.py:
--------------------------------------------------------------------------------
 1 | """ Loads list of explicits from an encrypted file """
 2 | 
 3 | from Crypto.Cipher import AES
 4 | import yaml
 5 | from utils import relative_path
 6 | from .user_explicits import UserExplicits
 7 | 
 8 | class Explicits():
 9 |     """ Object representing set of explicits"""
10 |     def __init__(self, args):
11 |         """ Decrypt encrypted list of explicits and return a set """
12 |         self.set = self.__set(args)
13 | 
14 |     def __set(self, args):
15 |         explicit_set = {}
16 |         if args.user_list and args.combine_lists:
17 |             internal_set = self.__internal_set()
18 |             user_set = self.__user_set(args.user_list[0])
19 |             explicit_set = internal_set.union(user_set)
20 |         elif args.user_list:
21 |             explicit_set = self.__user_set(args.user_list[0])
22 |         else:
23 |             explicit_set = self.__internal_set()
24 |         return set(map(lambda e: e.lower(), explicit_set))
25 | 
26 |     def __internal_set(self):
27 |         with open(self.__get_explicits_path(), 'rb') as file:
28 |             decrypted_content = self.__get_decrypted_content(file)
29 | 
30 |         yaml_content = yaml.load(decrypted_content)
31 |         return set(yaml_content['explicits'])
32 | 
33 |     @classmethod
34 |     def __user_set(cls, user_list):
35 |         return UserExplicits(user_list).set
36 | 
37 |     @classmethod
38 |     def __get_explicits_path(cls):
39 |         """ Return path of encrypted explicits file """
40 |         path_to_enc_file = relative_path('../data/explicits-list')
41 |         return path_to_enc_file
42 | 
43 |     @classmethod
44 |     def __get_decrypted_content(cls, encrypted_file):
45 |         """ Decrypt the encrypted file and return content as string """
46 |         decryptor = AES.new('cleansio_sym_key', AES.MODE_CBC, 'cleansioCensorIV')
47 |         content = ''
48 |         while True:
49 |             block = encrypted_file.read(16)
50 |             if not block:
51 |                 break
52 |             content += decryptor.decrypt(block).decode('utf-8')
53 |         return content
54 | 


--------------------------------------------------------------------------------
/cleansio/explicits/user_explicits.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     Loads a list of words from a file.
 3 |     It is assumed that the words are separated by the given separator.
 4 | """
 5 | 
 6 | class UserExplicits():
 7 |     """
 8 |         Loads a list of words from a file.
 9 |         It is assumed that the words are separated by the given separator.
10 |     """
11 |     def __init__(self, filename, sep='\n'):
12 |         # Create the set which stores words
13 |         self.set = {}
14 |         with open(filename, 'r') as uel:
15 |             # Assume that the words are separated by sep
16 |             word_list = uel.read().strip().split(sep)
17 |             self.set = set(filter(lambda x: x != '', word_list)) # Remove ''
18 | 


--------------------------------------------------------------------------------
/cleansio/speech/__init__.py:
--------------------------------------------------------------------------------
1 | """ Makes the directory a package. Acts as a public interface. """
2 | 
3 | from .transcribe import Transcribe
4 | from .timestamp import Timestamp
5 | 


--------------------------------------------------------------------------------
/cleansio/speech/timestamp.py:
--------------------------------------------------------------------------------
 1 | """ Locates where words are located in an audio chunk """
 2 | 
 3 | from utils import gcs_time_to_ms
 4 | 
 5 | class Timestamp():
 6 |     """ Words are located by either assessing silence or by estimatingself.
 7 |         Timestamps in the form of {word:, start:, end:} """
 8 |     def __init__(self, lyrics):
 9 |         super().__init__()
10 |         self.lyrics = lyrics
11 |         self.timestamps = self.__compute_timestamps()
12 | 
13 |     def __compute_timestamps(self):
14 |         """ Goes through each word in the chunk and computes the timestamps """
15 |         if not self.lyrics:
16 |             return None
17 |         return self.__parse_timestamps()
18 | 
19 |     def __parse_timestamps(self):
20 |         """ Parses GCS's output and returns [{word:str, start:ms, end:ms},].
21 |             O(n) """
22 |         timestamps = []
23 |         for word in self.lyrics:
24 |             start = max(0, gcs_time_to_ms(word.start_time) - 50)
25 |             timestamps.append({
26 |                 'word': word.word.lower(),
27 |                 'start': start,
28 |                 'end': gcs_time_to_ms(word.end_time) + 50
29 |             })
30 |         return timestamps
31 | 


--------------------------------------------------------------------------------
/cleansio/speech/transcribe.py:
--------------------------------------------------------------------------------
 1 | """ Convert audio to text using Google Cloud Speech """
 2 | 
 3 | from itertools import repeat
 4 | from multiprocessing.dummy import Pool as ThreadPool
 5 | from google.cloud.speech import enums, SpeechClient, types
 6 | from utils import append_before_ext
 7 | 
 8 | class Transcribe():
 9 |     """ Transcribes the lyrics from the vocals """
10 |     def __init__(self, file_path, frame_rate, encoding='LINEAR16'):
11 |         super().__init__()
12 |         self.lyrics = self.__transcribe_chunks(encoding, frame_rate, file_path)
13 | 
14 |     def __transcribe_chunks(self, frame_rate, encoding, file_path):
15 |         file_paths = [file_path, append_before_ext(file_path, '-overlapping')]
16 |         async_iter = zip(repeat(frame_rate), repeat(encoding), file_paths)
17 |         transcripts = ThreadPool(2).map(self.__transcribe_chunk, async_iter)
18 |         return self.__combine_transcripts(transcripts)
19 | 
20 |     def __transcribe_chunk(self, async_iter):
21 |         """ Accesses Google Cloud Speech and print the lyrics for each chunk """
22 |         frame_rate, encoding, file_path = async_iter
23 |         accuracy_chunk_path = append_before_ext(file_path, '-accuracy')
24 |         with open(accuracy_chunk_path, 'rb') as audio_content:
25 |             content = audio_content.read()
26 |         config = self.__get_config(encoding, frame_rate)
27 |         audio = types.RecognitionAudio(content=content)
28 |         return SpeechClient().recognize(config, audio)
29 | 
30 |     @classmethod
31 |     def __get_config(cls, frame_rate, encoding):
32 |         params = {
33 |             'encoding': enums.RecognitionConfig.AudioEncoding[encoding],
34 |             'sample_rate_hertz': frame_rate,
35 |             'language_code': 'en-US',
36 |             'enable_word_time_offsets': True,
37 |             'profanity_filter': False
38 |         }
39 |         return types.RecognitionConfig(**params)
40 | 
41 |     @classmethod
42 |     def __combine_transcripts(cls, transcripts):
43 |         """ Combine the words from the normal and overlapping chunks """
44 |         words = []
45 |         if transcripts[0].results: # Normal chunk
46 |             words += transcripts[0].results[0].alternatives[0].words
47 |         if transcripts[1].results: # Overlapping chunk
48 |             overlapping = transcripts[1].results[0].alternatives[0].words
49 |             shifted_time = list(map(cls.__shift_time, overlapping))
50 |             words += shifted_time
51 |         return None if words == [] else words
52 | 
53 |     @classmethod
54 |     def __shift_time(cls, word):
55 |         """ Increment the time relative to the normal chunk """
56 |         word.start_time.seconds += 2
57 |         word.start_time.nanos += 500000000
58 |         word.end_time.seconds += 2
59 |         word.end_time.nanos += 500000000
60 |         return word
61 | 


--------------------------------------------------------------------------------
/cleansio/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Makes the directory a package. Acts as a public interface. """
 2 | 
 3 | from .cleanup import cleanup, remove_chunks, remove_conversions, setup_cleanup
 4 | from .cli import setup_cli_args
 5 | from .env import create_env_var
 6 | from .files import create_temp_dir, file_name_no_ext, current_dir, \
 7 |     relative_path, append_before_ext, time_filename
 8 | from .numbers import gcs_time_to_ms, is_number, leading_zero
 9 | from .constants import CHUNK_LEN
10 | from .mac import MacUtil
11 | 


--------------------------------------------------------------------------------
/cleansio/utils/cleanup.py:
--------------------------------------------------------------------------------
 1 | """ Cleans up temporary files after the program runs """
 2 | 
 3 | from atexit import register
 4 | from os import environ, remove
 5 | import platform
 6 | from signal import signal, SIGABRT, SIGILL, SIGINT, SIGSEGV, SIGTERM
 7 | import sys
 8 | from .files import append_before_ext
 9 | from .mac import MacUtil
10 | 
11 | # Cleans up files on normal or abnormal exit
12 | # The arguments are unused - signal handlers are called with them, atexit is not.
13 | def cleanup(_sig_num=None, _cur_stack_frame=None):
14 |     """ Removes temporary files """
15 |     remove_conversions()
16 |     remove_chunks()
17 |     system = platform.system()
18 |     if system == 'Darwin':
19 |         MacUtil.clean()
20 |     sys.exit(0)
21 | 
22 | def setup_cleanup():
23 |     """ Always call cleanup on any type of exit by creating triggers """
24 |     # Set the cleanup handler for each signal which we want to catch
25 |     for sig in (SIGABRT, SIGILL, SIGINT, SIGSEGV, SIGTERM):
26 |         signal(sig, cleanup)
27 |     # Register the cleanup function to be called if the program exits normally
28 |     register(cleanup)
29 | 
30 | def remove_conversions():
31 |     """ Removes converted WAV file """
32 | 
33 |     if 'CLEANSIO_TEMP_FILE' in environ:
34 |         temp_file = environ.get('CLEANSIO_TEMP_FILE')
35 |         try:
36 |             remove(temp_file)
37 |         except FileNotFoundError:
38 |             pass
39 | 
40 | def remove_chunks():
41 |     """ Removes each chunk of the converted WAV file """
42 | 
43 |     if 'CLEANSIO_CHUNKS_LIST' in environ:
44 |         slices_list_env_var = environ['CLEANSIO_CHUNKS_LIST']
45 |         chunks_list = slices_list_env_var[2:-2].split('\', \'')
46 |         for chunk_file in chunks_list:
47 |             try:
48 |                 remove(append_before_ext(chunk_file, '-accuracy'))
49 |             except FileNotFoundError:
50 |                 pass
51 |             try:
52 |                 remove(chunk_file)
53 |             except FileNotFoundError:
54 |                 pass
55 | 


--------------------------------------------------------------------------------
/cleansio/utils/cli.py:
--------------------------------------------------------------------------------
  1 | """ Utility functions for command line interfaces """
  2 | 
  3 | import argparse
  4 | import sys
  5 | from colorama import Fore
  6 | from .files import relative_path
  7 | 
  8 | def setup_cli_args():
  9 |     """ Defines the different CLI arguments """
 10 |     parser = argparse.ArgumentParser(description='Real-time music censoring.')
 11 |     parser = __set_file_path(parser)
 12 |     parser = __set_user_list(parser)
 13 |     parser = __set_combine_list(parser)
 14 |     parser = __set_output_path(parser)
 15 |     parser = __set_output_encoding(parser)
 16 |     parser = __set_output_encoding_list(parser)
 17 |     parser = __set_store_recording(parser)
 18 |     args = parser.parse_args() # NOTE: Cannot add args after calling parse_args
 19 |     __exiting_args(args)
 20 |     __validate_args(args, parser)
 21 |     return args
 22 | 
 23 | def __set_file_path(parser):
 24 |     parser.add_argument(
 25 |         'file_path',
 26 |         nargs='?',
 27 |         help='enables file mode which creates a clean version of the file. \
 28 |             Relative or full path')
 29 |     return parser
 30 | 
 31 | def __set_user_list(parser):
 32 |     """ Sets the arguments which control the user list of explicit words """
 33 |     parser.add_argument(
 34 |         '-u',
 35 |         '--user-list',
 36 |         nargs=1,
 37 |         action='store',
 38 |         help='takes a path which points to a custom list of words which you \
 39 |         would like to mark as explicit.')
 40 |     return parser
 41 | 
 42 | def __set_combine_list(parser):
 43 |     """ Allows the user to combine their list with the internal list """
 44 |     parser.add_argument(
 45 |         '-c',
 46 |         '--combine-lists',
 47 |         action='store_true',
 48 |         help='the list which you provide with the \'-u\' option replaces the \
 49 |         program\'s internal list by default. However, you can pass \
 50 |         this option in addition to -u to have your list combined with the \
 51 |         internal list.')
 52 |     return parser
 53 | 
 54 | def __set_store_recording(parser):
 55 |     """ Allows the user to determine where the clean file is created """
 56 |     parser.add_argument(
 57 |         '-s',
 58 |         '--store-recording',
 59 |         action='store_true',
 60 |         help='save the clean realtime audio as a file in the output location')
 61 |     return parser
 62 | 
 63 | def __set_output_path(parser):
 64 |     """ Allows the user to determine where the clean file is created """
 65 |     parser.add_argument(
 66 |         '-o',
 67 |         '--output-location',
 68 |         nargs=1,
 69 |         action='store',
 70 |         help='takes a path which will overwrite the default location of where \
 71 |         the clean file will be created. If the file already exists it will be \
 72 |         overwritten.')
 73 |     return parser
 74 | 
 75 | def __set_output_encoding(parser):
 76 |     """ Allows the user to determine the audio encoding of the clean file """
 77 |     parser.add_argument(
 78 |         '-e',
 79 |         '--output-encoding',
 80 |         nargs=1,
 81 |         action='store',
 82 |         help='specify the audio encoding type of the output file. The file \
 83 |         extension of --output-location is not sufficient. Default is wav.')
 84 |     return parser
 85 | 
 86 | def __set_output_encoding_list(parser):
 87 |     """ Allows the user to determine the different audio encoding types """
 88 |     parser.add_argument(
 89 |         '--output-encoding-list',
 90 |         action='store_true',
 91 |         help='list the possible audio encoding types for the output file.')
 92 |     return parser
 93 | 
 94 | def __exiting_args(args):
 95 |     """ Handles arguments that simply print and exit """
 96 |     if args.output_encoding_list:
 97 |         with open(__encoding_types_path()) as types:
 98 |             __exit(types.read())
 99 | 
100 | def __validate_args(args, parser):
101 |     """ Validates user input """
102 |     __validate_combine_list(args, parser)
103 |     __validate_output_encoding(args, parser)
104 | 
def __validate_combine_list(args, parser):
    """ Reports an error when -c is given without -u """
    options = vars(args) # Namespace viewed as a plain dict
    if not options['combine_lists']:
        return # -c absent: nothing to check
    if options['user_list']:
        return # -c together with -u is fine
    __error(parser, 'The -c option requires -u!')
110 | 
def __validate_output_encoding(args, parser):
    """ Reports an error when the requested output encoding is not listed
    in the encoding types file. Does nothing when no encoding was given. """
    requested = args.output_encoding
    if not requested:
        return
    encoding_choice = requested[0]
    with open(__encoding_types_path()) as types:
        # One encoding per line; stop at the first line that matches
        supported = any(line.strip() == encoding_choice for line in types)
    if not supported:
        __error(parser, encoding_choice + ' is not supported!')
124 | 
def __encoding_types_path():
    """ Location of the encoding-types data file, via relative_path """
    types_file = '../data/encoding-types'
    return relative_path(types_file)
127 | 
def __error(parser, message):
    """ Hands message, prefixed with Fore.RED, to parser.error """
    # NOTE(review): Fore is presumably colorama's foreground colour table;
    # its import is outside this view - confirm.
    coloured = Fore.RED + message
    parser.error(coloured)
130 | 
def __exit(message):
    """ Prints message without surrounding whitespace, then exits with 0 """
    trimmed = message.strip()
    print(trimmed)
    sys.exit(0)
134 | 


--------------------------------------------------------------------------------
/cleansio/utils/constants.py:
--------------------------------------------------------------------------------
1 | """ Stores constants used across the project """
2 | 
# Chunk length shared across the project. The value is expressed in
# milliseconds, so 5000 corresponds to 5 seconds.
CHUNK_LEN = 5000 # In milliseconds
4 | 


--------------------------------------------------------------------------------
/cleansio/utils/env.py:
--------------------------------------------------------------------------------
""" Utility functions for environment variables """
2 | 
3 | import os
4 | 
def create_env_var(name, value):
    """ Set the environment variable `name` to `value` for this process """
    os.environ.update({name: value})
8 | 


--------------------------------------------------------------------------------
/cleansio/utils/files.py:
--------------------------------------------------------------------------------
""" Utility functions for File I/O """
 2 | 
 3 | from errno import EEXIST
 4 | import os
 5 | from os.path import basename, expanduser
 6 | import time
 7 | from .env import create_env_var
 8 | 
 9 | def create_temp_dir():
10 |     """ Create directory to store all temporary files """
11 |     create_env_var('CLEANSIO_TEMP_DIR', expanduser('~') + '/.cleansio-temp/')
12 |     try:
13 |         os.makedirs(os.environ['CLEANSIO_TEMP_DIR'])
14 |     except OSError as os_error:
15 |         # Ignore the error if it's just that the directory exists
16 |         if os_error.errno != EEXIST:
17 |             raise # Don't ignore errors other than the directory existing
18 |     return os.environ['CLEANSIO_TEMP_DIR']
19 | 
def file_name_no_ext(file_path):
    """ Get a file name with no extension from a file path.

    Only the last extension is removed, so 'a.b.mp3' gives 'a.b'. A name
    that has no extension is returned unchanged (this includes names such
    as '.bashrc', whose leading dot does not start an extension). """
    # splitext removes just the final suffix. Splitting on every '.' and
    # re-joining used to drop interior dots and returned '' for
    # extension-less names.
    return os.path.splitext(basename(file_path))[0]
23 | 
def current_dir():
    """ Directory part of this module's __file__ (the utils directory) """
    module_dir, _ = os.path.split(__file__)
    return module_dir
27 | 
def relative_path(path):
    """ Join `path` onto the directory returned by current_dir() """
    base = current_dir()
    return os.path.join(base, path)
31 | 
def append_before_ext(path, addition):
    """ Add a string between the file name and its extension.

    path     : file name or path, e.g. 'cleansio.wav'
    addition : text to insert, e.g. '-acc' -> 'cleansio-acc.wav'

    When the last path component has no extension the addition goes at
    the very end. A name that consists only of a leading dot and letters
    ('.bashrc') counts as having no extension. """
    # splitext looks at the last path component only. Searching the whole
    # string for '.' used to insert the addition into a directory name
    # such as 'dir.v2/file'.
    stem, extension = os.path.splitext(path)
    return stem + addition + extension
38 | 
def time_filename():
    """ Current time in whole milliseconds, as a string.
    Used for chunk file names """
    now_ms = time.time() * 1000
    return str(int(round(now_ms)))
42 | 


--------------------------------------------------------------------------------
/cleansio/utils/mac.py:
--------------------------------------------------------------------------------
 1 | """ Utility functions for macOS """
 2 | 
 3 | from os import environ
 4 | from subprocess import run
 5 | 
 6 | class MacUtil():
 7 |     """ Utility functions for macOS """
 8 |     def __init__(self):
 9 |         super().__init__()
10 | 
11 |     @classmethod
12 |     def switch_audio_source(cls, interface, device_name):
13 |         """ Switch the system's audio source
14 |             interface : [input|output] """
15 |         run(['SwitchAudioSource', '-t', interface, '-s', device_name],
16 |             capture_output=True, # Ignore output by capturing it
17 |             check=True)          # Throw an error if command fails
18 | 
19 |     @classmethod
20 |     def audio_source(cls, interface):
21 |         """ Returns the system's audio source
22 |             interface : [input|output] """
23 |         raw_device_name = run(
24 |             ['SwitchAudioSource', '-c', '-t', interface],
25 |             capture_output=True, # Return output
26 |             check=True)          # Throw an error if command fails
27 |         return raw_device_name.stdout.decode('utf-8').replace('\n', '')
28 | 
29 |     @classmethod
30 |     def clean(cls):
31 |         """ Resets the system's state """
32 |         if 'CLEANSIO_REALTIME' in environ and       \
33 |             'CLEANSIO_OLD_SOUND_OUT' in environ and \
34 |             'CLEANSIO_OLD_SOUND_IN' in environ:
35 |             cls.switch_audio_source('output', environ['CLEANSIO_OLD_SOUND_OUT'])
36 |             cls.switch_audio_source('input', environ['CLEANSIO_OLD_SOUND_IN'])
37 | 


--------------------------------------------------------------------------------
/cleansio/utils/numbers.py:
--------------------------------------------------------------------------------
""" Utility functions for dealing with numbers """
 2 | 
 3 | import re
 4 | from google.protobuf.duration_pb2 import Duration
 5 | 
 6 | def is_number(num):
 7 |     """ Validates if a string is a number. Can be negative or a float """
 8 |     return re.match(r'^-?\d+(\.\d+|\d*)$', str(num))
 9 | 
def leading_zero(num):
    """ Pads a single digit with a leading 0. Always returns a string;
    anything that is not made up only of digits comes back unchanged """
    text = str(num)
    # zfill(2) only adds a '0' when the text is shorter than two characters
    return text.zfill(2) if text.isdigit() else text
18 | 
def gcs_time_to_ms(time):
    """ Converts a Duration's seconds and nanos to milliseconds.

    Returns 0 when time is not a Duration or when a non-zero field fails
    is_number. For a Duration the result is a float, because the nanos
    part is floor-divided by the float 1e6. """
    if not isinstance(time, Duration):
        return 0
    if time.nanos and not is_number(time.nanos):
        return 0
    if time.seconds and not is_number(time.seconds):
        return 0
    total_ms = time.seconds * 1000 if time.seconds else 0
    total_ms += time.nanos // 1e6
    return total_ms
28 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = .
 8 | BUILDDIR      = _build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 | 


--------------------------------------------------------------------------------
/docs/_static/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/docs/_static/.keep


--------------------------------------------------------------------------------
/docs/_templates/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/docs/_templates/.keep


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # pylint: skip-file
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Configuration file for the Sphinx documentation builder.
  5 | #
  6 | # This file does only contain a selection of the most common options. For a
  7 | # full list see the documentation:
  8 | # http://www.sphinx-doc.org/en/master/config
  9 | 
 10 | # -- Path setup --------------------------------------------------------------
 11 | 
 12 | # If extensions (or modules to document with autodoc) are in another directory,
 13 | # add these directories to sys.path here. If the directory is relative to the
 14 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 15 | #
 16 | import os
 17 | import sys
 18 | sys.path.insert(0, os.path.abspath('../cleansio'))
 19 | 
 20 | 
 21 | # -- Project information -----------------------------------------------------
 22 | 
 23 | project = 'Cleansio'
 24 | copyright = '2019, Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain'
 25 | author = 'Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain'
 26 | 
 27 | # The short X.Y version
 28 | version = ''
 29 | # The full version, including alpha/beta/rc tags
 30 | release = ''
 31 | 
 32 | 
 33 | # -- General configuration ---------------------------------------------------
 34 | 
 35 | # If your documentation needs a minimal Sphinx version, state it here.
 36 | #
 37 | # needs_sphinx = '1.0'
 38 | 
 39 | # Add any Sphinx extension module names here, as strings. They can be
 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 41 | # ones.
 42 | extensions = [
 43 |     'sphinx.ext.autodoc',
 44 |     'sphinx.ext.todo',
 45 |     'sphinx.ext.coverage',
 46 |     'sphinx.ext.viewcode',
 47 |     'sphinxcontrib.github_ribbon'
 48 | ]
 49 | 
 50 | # Add any paths that contain templates here, relative to this directory.
 51 | templates_path = ['_templates']
 52 | 
 53 | # The suffix(es) of source filenames.
 54 | # You can specify multiple suffix as a list of string:
 55 | #
 56 | # source_suffix = ['.rst', '.md']
 57 | source_suffix = '.rst'
 58 | 
 59 | # The master toctree document.
 60 | master_doc = 'index'
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | #
 65 | # This is also used if you do content translation via gettext catalogs.
 66 | # Usually you set "language" from the command line for these cases.
 67 | language = None
 68 | 
 69 | # List of patterns, relative to source directory, that match files and
 70 | # directories to ignore when looking for source files.
 71 | # This pattern also affects html_static_path and html_extra_path.
 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 73 | 
 74 | # The name of the Pygments (syntax highlighting) style to use.
 75 | pygments_style = None
 76 | 
 77 | 
 78 | # -- Options for HTML output -------------------------------------------------
 79 | 
 80 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 81 | # a list of builtin themes.
 82 | #
 83 | html_theme = 'alabaster'
 84 | 
 85 | # Theme options are theme-specific and customize the look and feel of a theme
 86 | # further.  For a list of options available for each theme, see the
 87 | # documentation.
 88 | #
 89 | # html_theme_options = {}
 90 | 
 91 | # Add any paths that contain custom static files (such as style sheets) here,
 92 | # relative to this directory. They are copied after the builtin static files,
 93 | # so a file named "default.css" will overwrite the builtin "default.css".
 94 | html_static_path = ['_static']
 95 | 
 96 | # Custom sidebar templates, must be a dictionary that maps document names
 97 | # to template names.
 98 | #
 99 | # The default sidebars (for documents that don't match any pattern) are
100 | # defined by theme itself.  Builtin themes are using these templates by
101 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
102 | # 'searchbox.html']``.
103 | #
104 | # html_sidebars = {}
105 | 
106 | 
107 | # -- Options for HTMLHelp output ---------------------------------------------
108 | 
109 | # Output file base name for HTML help builder.
110 | htmlhelp_basename = 'Cleansiodoc'
111 | 
112 | 
113 | # -- Options for LaTeX output ------------------------------------------------
114 | 
115 | latex_elements = {
116 |     # The paper size ('letterpaper' or 'a4paper').
117 |     #
118 |     # 'papersize': 'letterpaper',
119 | 
120 |     # The font size ('10pt', '11pt' or '12pt').
121 |     #
122 |     # 'pointsize': '10pt',
123 | 
124 |     # Additional stuff for the LaTeX preamble.
125 |     #
126 |     # 'preamble': '',
127 | 
128 |     # Latex figure (float) alignment
129 |     #
130 |     # 'figure_align': 'htbp',
131 | }
132 | 
133 | # Grouping the document tree into LaTeX files. List of tuples
134 | # (source start file, target name, title,
135 | #  author, documentclass [howto, manual, or own class]).
136 | latex_documents = [
137 |     (master_doc, 'Cleansio.tex', 'Cleansio Documentation',
138 |      'Patrick Duncan, Victor Carri, Levin Noronha, Corie Bain', 'manual'),
139 | ]
140 | 
141 | 
142 | # -- Options for manual page output ------------------------------------------
143 | 
144 | # One entry per manual page. List of tuples
145 | # (source start file, name, description, authors, manual section).
146 | man_pages = [
147 |     (master_doc, 'cleansio', 'Cleansio Documentation',
148 |      [author], 1)
149 | ]
150 | 
151 | 
152 | # -- Options for Texinfo output ----------------------------------------------
153 | 
154 | # Grouping the document tree into Texinfo files. List of tuples
155 | # (source start file, target name, title, author,
156 | #  dir menu entry, description, category)
157 | texinfo_documents = [
158 |     (master_doc, 'Cleansio', 'Cleansio Documentation',
159 |      author, 'Cleansio', 'One line description of project.',
160 |      'Miscellaneous'),
161 | ]
162 | 
163 | 
164 | # -- Options for Epub output -------------------------------------------------
165 | 
166 | # Bibliographic Dublin Core info.
167 | epub_title = project
168 | 
169 | # The unique identifier of the text. This can be a ISBN number
170 | # or the project homepage.
171 | #
172 | # epub_identifier = ''
173 | 
174 | # A unique identification for the text.
175 | #
176 | # epub_uid = ''
177 | 
178 | # A list of files that should not be packed into the epub file.
179 | epub_exclude_files = ['search.html']
180 | 
181 | 
182 | # -- Extension configuration -------------------------------------------------
183 | 
184 | github_ribbon_repo = 'PatrickDuncan/cleansio'
185 | github_ribbon_position = 'right'
186 | github_ribbon_color = 'darkblue'
187 | 
188 | # -- Options for todo extension ----------------------------------------------
189 | 
190 | # If true, `todo` and `todoList` produce output, else they produce nothing.
191 | todo_include_todos = True
192 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to Cleansio's Technical Documentation!
 2 | =======================================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: Contents:
 7 | 
 8 | .. image:: ../media/logo.png
 9 |  :width: 150 px
10 |  :alt: Logo
11 |  :target: https://github.com/PatrickDuncan/cleansio
12 | 
13 | Indices and tables
14 | ------------------
15 | * :ref:`genindex`
16 | * :ref:`modindex`
17 | * :ref:`search`
18 | 
19 | .. image:: _static/classes.png
20 |  :width: 400 px
21 |  :alt: Classes Diagram
22 | 
23 | .. image:: _static/packages.png
24 |  :width: 400 px
25 |  :alt: Packages Diagram
26 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/media/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/media/logo.png


--------------------------------------------------------------------------------
/media/poster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/media/poster.png


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pylint>=2.2.2
2 | pylint-quotes>=0.2.0
3 | pytest==4.0.2
4 | sphinx==1.8.3
5 | sphinxcontrib-github_ribbon==0.9.0
6 | 


--------------------------------------------------------------------------------
/requirements-mac.txt:
--------------------------------------------------------------------------------
1 | sounddevice>=0.3.12
2 | soundfile>=0.10.2
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama>=0.4.1
2 | google-cloud-speech>=0.36.1
3 | pycrypto>=2.6.1
4 | pydub>=0.23.0
5 | pyyaml>=3.13
6 | tqdm>=4.29.0
7 | 


--------------------------------------------------------------------------------
/tests/accuracy/test_loudness_maximization.py:
--------------------------------------------------------------------------------
 1 | # pylint: skip-file
 2 | 
 3 | import os
 4 | from pydub import AudioSegment
 5 | from audio.accuracy import __maximize_volume
 6 | from audio.accuracy import improve_accuracy
 7 | 
 8 | def __get_file(file_path):
 9 |     return os.path.dirname(os.path.realpath(__file__)) + file_path
10 | 
def test_loudness_maximization():
    """ Maximizing the volume of a copy of the sample must raise its dBFS.

    Any exception is allowed to propagate so pytest reports the real cause
    instead of a bare `assert False`. """
    file_path = __get_file('/../data/testing.wav')
    # Resolved before the try so the cleanup can always refer to it
    file_path_duplicate = __get_file('/../data/testing-max-volume.wav')
    try:
        audio_segment = AudioSegment.from_file(file_path)
        # Duplicate the audio file. export() hands back an open file
        # handle, which is closed right away so it does not leak.
        audio_segment.export(file_path_duplicate, format='wav').close()
        audio_segment_duplicate = AudioSegment.from_file(file_path_duplicate)
        init_loudness = audio_segment_duplicate.dBFS
        # Test that the volume was successfully maximized
        max_volume_chunk = __maximize_volume(audio_segment_duplicate)
        max_loudness = max_volume_chunk.dBFS
        assert init_loudness < max_loudness
    finally:
        # Cleanup - the copy may not exist if an early step failed
        if os.path.exists(file_path_duplicate):
            os.remove(file_path_duplicate)
29 | 


--------------------------------------------------------------------------------
/tests/censor/test_censor.py:
--------------------------------------------------------------------------------
 1 | #pylint: skip-file
 2 | 
 3 | import os
 4 | from pathlib import Path
 5 | from pydub import AudioSegment, silence
 6 | from audio import ChunkWrapper
 7 | from censor import Censor
 8 | 
# Word timings handed to the censor in test_censor. 'start' and 'end' are
# offsets into the audio in milliseconds: the test compares them against
# detected silence ranges with a tolerance stated in milliseconds.
timestamps = [
    {'word': 'hi', 'start': 1000.0, 'end': 1500.0},
    {'word': 'bye', 'start': 1700.0, 'end': 1900.0},
    {'word': 'mute', 'start': 3800.0, 'end': 4500.0}
]

# Explicit words passed to Censor; only the third timestamp entry matches
explicits = ['mute']
16 | 
def __get_file(file_path):
    """ Prefix file_path with the resolved directory of this test module """
    this_file = os.path.realpath(__file__)
    return os.path.dirname(this_file) + file_path
19 | 
def test_censor():
    """ Only the explicit word's time range may be silent after censoring.

    Any exception is allowed to propagate so pytest reports the real cause
    instead of a bare `assert False`. """
    file_path = __get_file('/../data/testing.wav')
    # Resolved before the try so the cleanup can always refer to it
    file_path_dup = __get_file('/../data/testing-censored-0.wav')
    try:
        audio_segment = AudioSegment.from_file(file_path)
        # Duplicate the audio file and begin muting the new file. export()
        # hands back an open file handle, which is closed right away so it
        # does not leak.
        audio_segment.export(file_path_dup, format='wav').close()
        audio_segment_dup = AudioSegment.from_file(file_path_dup)

        # Test that the explicits were successfully removed
        wrapper = ChunkWrapper(audio_segment_dup)
        location = str(Path(__file__).parents[2]) + '/clean_file.wav'
        audio_segment_dup = Censor(explicits, 'wav', location)._Censor__mute_explicits(
            file_path_dup, wrapper, timestamps).segment
        # Get the silence segments
        silent_ranges = silence.detect_silence(
            audio_segment_dup, min_silence_len=500, silence_thresh=-50)

        # Assert silence is only in the 'mute' timestamp
        assert len(silent_ranges) == 1
        beginning_diff = silent_ranges[0][0] - timestamps[2]['start']
        end_diff = silent_ranges[0][1] - timestamps[2]['end']

        # Less than 5 (milliseconds) to account for small inaccuracies
        assert abs(beginning_diff) < 5
        assert abs(end_diff) < 5
    finally:
        # Cleanup - the copy may not exist if an early step failed
        if os.path.exists(file_path_dup):
            os.remove(file_path_dup)
51 | 
52 | 


--------------------------------------------------------------------------------
/tests/data/testing.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickDuncan/cleansio/eb2be14d486cc14b32ceb7bd3f87dfb1f08417fa/tests/data/testing.wav


--------------------------------------------------------------------------------
/tests/explicits/test_user_explicits.py:
--------------------------------------------------------------------------------
 1 | #pylint: skip-file
 2 | 
 3 | # Setup import path to include the wordlist files
 4 | import sys
 5 | # Import modules to test
 6 | from explicits import UserExplicits
 7 | # Testing
 8 | import pytest
 9 | 
10 | # Simplest case - newline-separated word list
11 | # tmpdir - A temporary directory path, supplied by pytest
def test_newline(tmpdir):
    """ Words written one per line are loaded back as the same set """
    expected_words = {"a", "b", "c"}
    # File inside pytest's temporary directory, one word per line
    word_file = tmpdir.join("newlinefile")
    word_file.write("\n".join(expected_words))
    loaded = UserExplicits(str(word_file))
    # What was read must equal what was written, with no empty entry
    assert loaded.set == expected_words
    assert '' not in loaded.set
24 | 
# Nonexistent file given - exception expected
# tmpdir - A temporary directory path, supplied by pytest
def test_nonexistent_file(tmpdir):
    """ Loading a word list from a missing path raises FileNotFoundError """
    missing_path = tmpdir.join("nonexistent")
    # pytest.raises' `message` keyword is deprecated (pytest 4.1) and was
    # later removed, so it is not passed; the expectation is unchanged.
    with pytest.raises(FileNotFoundError):
        # Try to read a nonexistent file
        UserExplicits(str(missing_path))
32 | 
# Non-newline separator (CSV)
# tmpdir - A temporary directory path, supplied by pytest
def test_csv_file(tmpdir):
    """ Words joined by a custom separator are loaded back as the same set """
    delimiter = ","
    expected_words = {"a", "b", "c"}
    # File inside pytest's temporary directory, words separated by commas
    csv_file = tmpdir.join("test.csv")
    csv_file.write(delimiter.join(expected_words))
    loaded = UserExplicits(str(csv_file), delimiter)
    # What was read must equal what was written
    assert loaded.set == expected_words
48 | 


--------------------------------------------------------------------------------
/tests/utils/numbers/test_gcs_time_to_ms.py:
--------------------------------------------------------------------------------
 1 | # pylint: skip-file
 2 | 
 3 | from utils import gcs_time_to_ms
 4 | from google.protobuf.duration_pb2 import Duration
 5 | 
 6 | def test_gcs_time_to_ms_empty():
 7 |     assert gcs_time_to_ms('') == 0 and gcs_time_to_ms(None) == 0
 8 | 
 9 | def test_gcs_time_to_ms_just_nanos():
10 |     duration = Duration()
11 |     duration.nanos = 900000000
12 |     assert gcs_time_to_ms(duration) == 900
13 | 
def test_gcs_time_to_ms_just_seconds():
    """ 2 seconds alone convert to 2000 ms """
    seconds_only = Duration(seconds=2)
    assert gcs_time_to_ms(seconds_only) == 2000
18 | 
def test_gcs_time_to_ms_nanos_and_seconds():
    """ 5 seconds plus 300,000,000 nanoseconds convert to 5300 ms """
    both_fields = Duration(seconds=5, nanos=300000000)
    assert gcs_time_to_ms(both_fields) == 5300
24 | 


--------------------------------------------------------------------------------
/tests/utils/numbers/test_is_number.py:
--------------------------------------------------------------------------------
 1 | # pylint: skip-file
 2 | 
 3 | from utils import is_number
 4 | 
 5 | def test_is_number_empty():
 6 |     assert not is_number('')
 7 | 
 8 | def test_is_number_string():
 9 |     assert not is_number('cleansio')
10 | 
def test_is_number_string_and_num():
    """ Letters followed by digits are not a number """
    verdict = is_number('hello123')
    assert not verdict
13 | 
def test_is_number_different_type():
    """ An int (not a string) is accepted as a number """
    as_int = 91327
    assert is_number(as_int)
16 | 
def test_is_number_integer():
    """ The string form of a positive integer is a number """
    text = str(91327)
    assert is_number(text)
19 | 
def test_is_number_negative():
    """ The string form of a negative integer is a number """
    text = str(-91327)
    assert is_number(text)
22 | 
def test_is_number_float():
    """ The string form of a positive float is a number """
    text = str(91.327)
    assert is_number(text)
25 | 
def test_is_number_negative_float():
    """ A negative float is a number, as a literal and via str() """
    for text in ('-91327.23', str(-91327.23)):
        assert is_number(text)
29 | 


--------------------------------------------------------------------------------
/tests/utils/numbers/test_leading_zero.py:
--------------------------------------------------------------------------------
 1 | # pylint: skip-file
 2 | 
 3 | from utils import leading_zero
 4 | 
 5 | def test_leading_zero_empty():
 6 |     assert leading_zero('') == ''
 7 | 
 8 | def test_leading_zero_int_single_digit():
 9 |     assert leading_zero(4) == '04'
10 | 
def test_leading_zero_int_double_digit():
    """ A two-digit int is only converted to a string """
    padded = leading_zero(23)
    assert padded == '23'
13 | 
def test_leading_zero_string_single_digit():
    """ A one-digit string is padded to two characters """
    padded = leading_zero('9')
    assert padded == '09'
16 | 
def test_leading_zero_string_triple_digit():
    """ A three-digit string comes back unchanged """
    padded = leading_zero('504')
    assert padded == '504'
19 | 


--------------------------------------------------------------------------------
/tests/utils/test_cleanup.py:
--------------------------------------------------------------------------------
 1 | #pylint: skip-file
 2 | 
 3 | # Standard imports
 4 | from uuid import uuid4
 5 | from random import randint
 6 | import os
 7 | import pytest
 8 | 
 9 | # Function to test
10 | from utils import cleanup
11 | 
12 | # Tests the cleanup function
def test_cleanup():
    """ cleanup() must delete the temp file and every chunk file.

    The files are created here, named either by the CLEANSIO_TEMP_FILE and
    CLEANSIO_CHUNKS_LIST environment variables (when already set) or by
    random names that are then published through those variables. """
    ## Temporary file ##

    if 'CLEANSIO_TEMP_FILE' in os.environ: # Temp file var exists
        # Fetch the name of the temp file
        temp_file = os.environ.get('CLEANSIO_TEMP_FILE')
    else: # Temp file variable doesn't exist
        temp_file_name = str(uuid4().hex) # Fetch a random temporary file name
        # Create a path to the temporary file
        temp_file = "./{0}".format(temp_file_name)
        os.environ['CLEANSIO_TEMP_FILE'] = temp_file # For function

    create_temp_file(temp_file)

    ## Chunks ##
    chunks_list = [] # Holds names of chunk files. Used later on in test.

    if 'CLEANSIO_CHUNKS_LIST' in os.environ: # Chunks exist
        chunks_list_env_var = os.environ['CLEANSIO_CHUNKS_LIST']
        # The value is expected to look like str(list), as written by the
        # else branch below: slice off the leading "['" and trailing "']"
        # from the string itself, then split on the "', '" separators.
        chunks_list = chunks_list_env_var[2:-2].split('\', \'')
        for chunk_file in chunks_list:
            create_temp_file(chunk_file)
    else: # Chunks don't exist
        # Generate a random number of chunk files
        for _ in range(0, randint(3, 10) + 1):
            # Choose a random name for the chunk which we'll generate
            temp_chunk_name = str(uuid4().hex)
            create_temp_file(temp_chunk_name)
            chunks_list.append(temp_chunk_name)
        os.environ['CLEANSIO_CHUNKS_LIST'] = str(chunks_list)

    with pytest.raises(SystemExit): # Ignore sys.exit()
        cleanup() # Run the function

    ## Checking whether or not the function worked ##
    # The temporary file shouldn't exist after the cleanup function runs
    assert(not exists(temp_file))

    for chunk in chunks_list: # None of the "chunk files" should exist
        assert(not exists(chunk))
54 | 
def exists(file_name):
    """ Checks whether a file with the given name can be opened for reading.
    Only a missing file counts as "does not exist" """
    try:
        with open(file_name, "r"):
            pass
    except FileNotFoundError:
        return False
    return True
63 | 
def create_temp_file(file_name):
    """ Creates a temp file with the given name, if it doesn't already exist.

    The file is filled with a random hex string. Raises FileExistsError,
    without touching the existing file, when file_name is already taken. """
    try:
        # Mode "x" creates the file exclusively, so checking for an
        # existing file and creating the new one happen in a single step;
        # the with-block closes the handle even if the write fails.
        with open(file_name, "x") as new_file:
            new_file.write(str(uuid4().hex))
    except FileExistsError:
        err_str = "File {0} already exists, won't overwrite".format(file_name)
        raise FileExistsError(err_str)
74 | 


--------------------------------------------------------------------------------
/tests/utils/test_files.py:
--------------------------------------------------------------------------------
 1 | # pylint: skip-file
 2 | 
 3 | from utils import file_name_no_ext, append_before_ext
 4 | 
 5 | # file_name_no_ext
 6 | 
 7 | def test_file_name_no_ext():
 8 |     assert file_name_no_ext('/Users/bob/folder/audio.wav') == 'audio'
 9 | 
def test_file_name_no_ext_empty():
    """ An empty path gives an empty name """
    name = file_name_no_ext('')
    assert name == ''
12 | 
def test_file_name_no_ext_with_slash():
    """ Leading directories separated by slashes are dropped """
    name = file_name_no_ext('/c/user/file.mp3')
    assert name == 'file'
15 | 
def test_file_name_no_ext_only_file():
    """ A bare file name keeps its punctuation and loses its extension """
    name = file_name_no_ext('file!!.mp3')
    assert name == 'file!!'
18 | 
19 | # append_before_ext
20 | 
def test_append_before_ext_empty_string_empty_addition():
    """ Empty path plus empty addition stays empty """
    result = append_before_ext('', '')
    assert result == ''
23 | 
def test_append_before_ext_empty_string():
    """ With an empty path the result is just the addition """
    result = append_before_ext('', '-acc')
    assert result == '-acc'
26 | 
def test_append_before_ext_empty_addition():
    """ An empty addition leaves a name with an extension untouched """
    result = append_before_ext('cleansio.wav', '')
    assert result == 'cleansio.wav'
29 | 
def test_append_before_ext_no_extension_empty_string():
    """ An empty addition leaves a name without extension untouched """
    result = append_before_ext('cleansio', '')
    assert result == 'cleansio'
32 | 
def test_append_before_ext_no_extension():
    """ Without an extension the addition goes at the end """
    result = append_before_ext('cleansio', 'extra')
    assert result == 'cleansioextra'
35 | 
def test_append_before_ext_extension_1_dot():
    """ The addition lands just before the single extension """
    result = append_before_ext('cleansio.wav', '-acc')
    assert result == 'cleansio-acc.wav'
38 | 
def test_append_before_ext_extension_2_dots():
    """ With two dots the addition lands before the last one """
    result = append_before_ext('cleansio.w.av', '-acc')
    assert result == 'cleansio.w-acc.av'
41 | 
def test_append_before_ext_extension_3_dots():
    """ With three dots the addition lands before the last one """
    result = append_before_ext('cleansio.w.a.v', '-acc')
    assert result == 'cleansio.w.a-acc.v'
44 | 


--------------------------------------------------------------------------------