├── .gitignore ├── README.md ├── agent-video-generator └── functions │ ├── AddCaptionsToVideoFFMPEG.py │ ├── AddCaptionsToVideoMoviepy.py │ ├── AddCaptionsToVideoOpenCV.py │ ├── AudioTranscriptionToSentences.py │ ├── CommandsExecution.py │ ├── ConvertSrtToAss.py │ ├── MindsflowAgent.py │ ├── MusicGeneration.py │ ├── PromptImagesToVideo.py │ ├── ShowFonts.py │ ├── UploadResultZipS3.py │ ├── addAudioSegmentsToVideo.py │ ├── addSoundToVideo.py │ ├── addTextToImage.py │ ├── cloneVoiceValleX.py │ ├── cloneVoiceVits.py │ ├── deleteFilesByExtension.py │ ├── deleteFolders.py │ ├── extractVideoAudioComponents.py │ ├── generateAudioSegmentsFromJson.py │ ├── generateSrtFromJson.py │ ├── generateVideoScript.py │ ├── generateVoiceVits.py │ ├── loadJsonAndReturnKeys.py │ ├── preprocessTrainData.py │ ├── returnInputParameters.py │ ├── setEpochInJsonFile.py │ ├── splitVoiceMusic.py │ ├── textToSpeech.py │ ├── transcribeAudio.py │ ├── translateCaptionsJson.py │ ├── translateSrtFile.py │ ├── translateTargetToSource.py │ └── uploadYoutubeVideo.py ├── results ├── flow │ ├── part1.png │ ├── part2.png │ ├── part3.png │ └── translation │ │ ├── part1.png │ │ ├── part2.png │ │ └── part3.png ├── into_video_transl.png ├── intro.jpg └── videos │ ├── video1.mp4 │ ├── video1_transl.mp4 │ ├── video2.mp4 │ ├── video2_transl.mp4 │ ├── video3.mp4 │ ├── video3_transl.mp4 │ ├── video4.mp4 │ └── video4_transl.mp4 ├── video_translation.md └── voice_clone ├── functions ├── clone_voice_vits.py ├── generate_voice_vits.py └── set_epoch_in_json_config.py └── voice_clone_api ├── functions.py ├── infer.py ├── infer_api.py ├── test_api ├── run_make_prompt.py └── run_voice_clone.py ├── train.py └── train_api.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | secrets/ 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | .idea/ 163 | *.DS_Store 164 | 165 | replicate_models/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI video generator agent 2 | 3 | ![AI Video Generator](./results/intro.jpg) 4 | 5 | This AI-agent utilizes generative AI to automatically generate short videos and post them on social platforms. 6 | It integrates several AI domains such as script generation, image generation, music generation, speech generation, automatic captioning, special effects, automatic upload and video composition to create engaging and high-quality videos. 7 | The agent is hosted on [Mindsflow.ai](https://mindsflow.ai/). 8 | 9 | ## Features 10 | 11 | - **Script Generation**: uses [GPT4](https://openai.com/gpt-4) to generate compelling scripts for your videos. 12 | 13 | - **Image Generation**: Based on the script, it generates relevant and visually appealing frames using [StableDiffusionXL](https://replicate.com/stability-ai/sdxl). 14 | 15 | - **Music Generation**: The system can create original, fitting background music to enhance the mood and tone of the video. 
It leverages a [music generation](https://replicate.com/meta/musicgen) model. 16 | 17 | - **Speech Generation**: Using the [Azure text-to-speech API](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech), the agent can also generate human-like narrations for the scripts. It supports multiple languages. 18 | 19 | - **Automatic Captioning**: This feature ensures accessibility by providing accurate captions for the generated speech. Captions are burned into the video with [ffmpeg](https://ffmpeg.org/about.html). 20 | 21 | - **Special Effects**: The agent can apply various special effects to the video to make it more engaging, using [moviepy](https://pypi.org/project/moviepy/). 22 | 23 | - **Video Composition**: The agent is based on a [flow-based programming](https://en.wikipedia.org/wiki/Flow-based_programming) model to assemble the different AI and algorithmic components into a complete video. The flow is developed and hosted on [Mindsflow.ai](https://mindsflow.ai/). All the blocks of the flow are available [here](agent-video-generator/functions). 24 | 25 | - **Automatic upload**: Once the video is ready, the agent can automatically upload it to your favourite social media platform. 26 | 27 | **Note**: running this agent requires an OpenAI key, a [Replicate](https://replicate.com/explore) key, and an Azure API key. 28 | 29 | ## Results 30 | 31 | You can check out some sample videos at the following links: 32 | 33 | 1. [https://www.instagram.com/inspiration_daily_tales/](https://www.instagram.com/inspiration_daily_tales/) 34 | 35 | 2. [https://www.tiktok.com/@inspiration_tales_daily](https://www.tiktok.com/@inspiration_tales_daily) 36 | 37 | These samples provide a glimpse of what the video-generator agent is capable of. Happy viewing! 38 | 39 | ## Flow 40 | 41 | | Part 1 | Part 2 | Part 3 | 42 | |-------------------------------------|-------------------------------------|-------------------------------------| 43 | | ![Flow part 1](./results/flow/part1.png) | ![Flow part 2](./results/flow/part2.png) | ![Flow part 3](./results/flow/part3.png) | 44 | 45 | For more details, see the full images [here](./results/flow/). 
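For orientation before the full field reference below: a minimal request only needs the compulsory `topic` field, and every other field falls back to its default value. A minimal sketch of such a request:

```
{
    "topic": "benefits of eating mango"
}
```
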
46 | 47 | ## Input format 48 | 49 | ``` 50 | { 51 | "topic": "topic of the video", # example: benefits of eating mango 52 | "language": "en", # narration language 53 | "speaker": "en-US-GuyNeural", # (full list of voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts) 54 | "voice_speed": 1, # (<1 slower voice, >1 faster voice) 55 | "font_size": 30, # font size in pixels 56 | "font_name": "SourceSerif4", # font type 57 | "font_size_title": 50, # title font size in pixels 58 | "text_color": "white", # subtitles color 59 | "text_bg_color": "", # subtitles bg color, "" or "none" means no bg color 60 | "text_bg_opacity": 0, # subtitles bg opacity, in [0-1], 0->transparent, 1->full 61 | "text_border_color": "none", # subtitles border color 62 | "text_border_size": 0, # subtitles border size, 0 means no border 63 | "caption_position": "center", # center, top, bottom 64 | "height": 1024, # video height in pixels 65 | "width": 576, # video width in pixels 66 | "fps": 16, # video fps 67 | "image_model": "sdxl", # model to generate frames: sd or sdxl 68 | "music_volume": 0.5, # volume of bg music, in [0-1], 0->no bg music 69 | "transition_time": 1, # frame transition time, 0->instant transition 70 | "zoom": 1.1, # frame zoom in/out strength, 1->no zoom 71 | "account_name": "mindsflow.ai", # account name, only needed if you want the video to be automatically uploaded to your platform 72 | "upload": false, # whether to upload the video to social media 73 | "image_duration": 6 # duration of each image 74 | } 75 | ``` 76 | 77 | **Note**: The only compulsory field is `topic`. All other fields, if not specified, will be set to their default values. 78 | 79 | ## Output format 80 | 81 | The output of the agent is structured in the following way: 82 | 83 | ``` 84 | { 85 | "result": "link to result" 86 | } 87 | ``` 88 | In this output, `result` is a link pointing to a ZIP file. This ZIP file contains: 89 | 90 | - The generated video in mp4 format 91 | - A thumbnail image for the video 92 | - The video script in text format 93 | - The captions file in srt format 94 | 95 | ## Extra 96 | 97 | Try out more AI agents at [https://chat.mindsflow.ai/en-US/explore](https://chat.mindsflow.ai/en-US/explore). -------------------------------------------------------------------------------- /agent-video-generator/functions/AddCaptionsToVideoFFMPEG.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "AddCaptionsToVideoFFMPEG", 4 | "displayName": "", 5 | "description": "This method receives an SRT or ASS subtitle file URL and an MP4 video file URL as inputs. Using FFmpeg, it burns the subtitles into the video and outputs the URL of the combined video. SRT files are handled through the ffmpeg-python bindings, ASS files through the ffmpeg command line.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "video_url", 10 | "captions_url" 11 | ], 12 | "properties": { 13 | "video_url": { 14 | "type": "string", 15 | "description": "URL of the MP4 video file." 16 | }, 17 | "captions_url": { 18 | "type": "string", 19 | "description": "URL of the SRT or ASS (Advanced SubStation Alpha) subtitle file." 
20 | } 21 | } 22 | }, 23 | "outputPattern": { 24 | "type": "object", 25 | "required": [ 26 | "video_url" 27 | ], 28 | "properties": { 29 | "video_url": { 30 | "type": "string", 31 | "description": "URL of the video file after merging with subtitles" 32 | } 33 | } 34 | }, 35 | "tag": "VideoCaptions", 36 | "testCases": [ 37 | { 38 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1701831655_ypexkwiz.mp4", 39 | "captions_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/WQ7TEooutput_file.ass" 40 | } 41 | ], 42 | "aiPrompt": "AddCaptionsToVideoFFMPEG", 43 | "greeting": "" 44 | } 45 | $""" 46 | 47 | import os 48 | import ffmpeg 49 | import requests 50 | import boto3 51 | import random, string 52 | import subprocess 53 | 54 | def download_file(url, filename): 55 | response = requests.get(url) 56 | file = open(filename, 'wb') 57 | file.write(response.content) 58 | file.close() 59 | 60 | s3_client = boto3.client('s3') 61 | 62 | def upload_to_aws(filename: str) -> str: 63 | # Uses your AWS credentials to access the service 64 | bucket_name = os.environ.get('bucket_name') 65 | region = os.environ.get('region') 66 | # Create a session using the provided credentials 67 | session = boto3.Session( 68 | aws_access_key_id=os.environ.get('access_key_id'), 69 | aws_secret_access_key=os.environ.get('secret_access_key') 70 | ) 71 | # Create an S3 client 72 | s3_client = session.client('s3') 73 | bucket_path = 'ai-video' 74 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 75 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 76 | url = f'{s3_base_url}{bucket_path}/{filename}' 77 | return url 78 | 79 | def merge_subtitle_and_video(subtitle_path: str, mp4_path: str, output_path: str): 80 | # determine file type from extension 81 | _, file_extension = os.path.splitext(subtitle_path) 82 | 83 | if file_extension.lower() == ".srt": 84 | ffmpeg.input(mp4_path).output(output_path, vf='subtitles=' + subtitle_path).run(overwrite_output=True) 85 | elif file_extension.lower() == ".ass": 86 | command = f"ffmpeg -i {mp4_path} -vf 'ass={subtitle_path}' {output_path}" 87 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True) 88 | output = process.stdout 89 | else: 90 | raise ValueError(f"Unsupported subtitle file type: {file_extension}")  # fail fast instead of surfacing later as a missing output file 91 | 92 | def mindsflow_function(event, context) -> dict: 93 | # get the srt path from the event 94 | caption_url = event.get("captions_url") 95 | # get the mp4 path from the event 96 | video_url = event.get("video_url") 97 | 98 | command = 'apt install -y ffmpeg'  # -y keeps apt non-interactive 99 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True) 100 | 101 | mp4_path = video_url.split('/')[-1] 102 | caption_path = caption_url.split('/')[-1] 103 | download_file(video_url, mp4_path) 104 | download_file(caption_url, caption_path) 105 | 106 | # Set output path 107 | output_path = "video_with_captions_{}.mp4".format(''.join(random.choices(string.ascii_letters + string.digits, k=5))) 108 | 109 | # Merge the downloaded subtitle file into the video 110 | merge_subtitle_and_video(caption_path, mp4_path, output_path) 111 | 112 | upload_url = upload_to_aws(output_path) 113 | os.remove(output_path) 114 | 115 | # define result 116 | result = { 117 | 'video_url': upload_url 118 | } 119 | 120 | return result 121 | 122 | --------------------------------------------------------------------------------
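As a quick illustration of how these function files can be exercised outside the platform, here is a minimal local smoke test for the captioning function above. This is a sketch, not part of the repository: it assumes the file is importable as a module, a Debian-style environment where the function's `apt install` step can run, and that the `bucket_name`/`region`/`access_key_id`/`secret_access_key` environment variables used by `upload_to_aws` are set. A plain dict stands in for the platform's event object, which is sufficient here because only `event.get` is called.

```python
# Hypothetical local test harness for AddCaptionsToVideoFFMPEG (not part of the repo).
from AddCaptionsToVideoFFMPEG import mindsflow_function

# URLs taken from the function's own testCases block above.
event = {
    "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1701831655_ypexkwiz.mp4",
    "captions_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/WQ7TEooutput_file.ass",
}

result = mindsflow_function(event, context=None)  # context is unused by this function
print(result["video_url"])  # S3 URL of the subtitled video
```
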
/agent-video-generator/functions/AddCaptionsToVideoMoviepy.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "AddCaptionsToVideoMoviepy", 4 | "displayName": "", 5 | "description": "Add captions to video with moviepy", 6 | "inputPattern": { 7 | "type": "object", 8 | "properties": { 9 | "font_name": { 10 | "type": "string", 11 | "description": "" 12 | }, 13 | "font_size": { 14 | "type": "number", 15 | "description": "" 16 | }, 17 | "video_url": { 18 | "type": "string", 19 | "description": "" 20 | }, 21 | "text_color": { 22 | "type": "string", 23 | "description": "" 24 | }, 25 | "caption_url": { 26 | "type": "string", 27 | "description": "" 28 | }, 29 | "text_bg_color": { 30 | "type": "string", 31 | "description": "" 32 | }, 33 | "highlight_color": { 34 | "type": "string", 35 | "description": "" 36 | }, 37 | "text_bg_opacity": { 38 | "type": "number", 39 | "description": "" 40 | }, 41 | "caption_position": { 42 | "type": "string", 43 | "description": "" 44 | }, 45 | "text_border_size": { 46 | "type": "number", 47 | "description": "" 48 | }, 49 | "text_border_color": { 50 | "type": "string", 51 | "description": "" 52 | } 53 | }, 54 | "required": [ 55 | "video_url", 56 | "caption_url" 57 | ] 58 | }, 59 | "outputPattern": { 60 | "type": "object", 61 | "properties": { 62 | "video_url": { 63 | "type": "string", 64 | "description": "" 65 | } 66 | }, 67 | "required": [ 68 | "video_url" 69 | ] 70 | }, 71 | "tag": "VideoCaptions", 72 | "testCases": [ 73 | { 74 | "font_name": "Heebo", 75 | "font_size": 30, 76 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/teacher_comic.mp4", 77 | "text_color": "white", 78 | "caption_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/test.srt", 79 | "text_bg_color": "black", 80 | "highlight_color": "yellow", 81 | "text_bg_opacity": 0.5, 82 | "caption_position": "bottom", 83 | "text_border_size": 0, 84 | "text_border_color": "" 85 | } 86 | ], 87 | "aiPrompt": "", 88 | "greeting": "" 89 | } 90 | $""" 91 | 92 | import pysrt 93 | from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip, ColorClip, concatenate_videoclips 94 | import os 95 | import requests 96 | import boto3 97 | import random, string 98 | import ast 99 | 100 | font_dir = os.environ['font_dir'] 101 | 102 | color_dict = { 103 | 'red': (255, 0, 0), 104 | 'blue': (0, 0, 255), 105 | 'green': (0, 255, 0), 106 | 'white': (255, 255, 255), 107 | 'black': (0, 0, 0), 108 | 'yellow': (255, 255, 0), 109 | 'cyan': (0, 255, 255), 110 | 'magenta': (255, 0, 255), 111 | 'grey': (128, 128, 128), 112 | 'pink': (255, 192, 203), 113 | 'purple': (128, 0, 128), 114 | 'orange': (255, 165, 0), 115 | 'brown': (165, 42, 42) 116 | } 117 | 118 | def download_file(url, filename): 119 | response = requests.get(url) 120 | file = open(filename, 'wb') 121 | file.write(response.content) 122 | file.close() 123 | 124 | def random_color(): 125 | return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) 126 | 127 | def rgb_to_hex(rgb): 128 | return "#{:02x}{:02x}{:02x}".format(*rgb) 129 | 130 | s3_client = boto3.client('s3') 131 | 132 | def upload_to_aws(filename: str) -> str: 133 | # Uses your AWS credentials to access the service 134 | bucket_name = os.environ.get('bucket_name') 135 | region = os.environ.get('region') 136 | # Create a session using the provided credentials 137 | session = boto3.Session( 138 | aws_access_key_id=os.environ.get('access_key_id'), 139 | 
aws_secret_access_key=os.environ.get('secret_access_key') 140 | ) 141 | # Create an S3 client 142 | s3_client = session.client('s3') 143 | bucket_path = 'ai-video' 144 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 145 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 146 | url = f'{s3_base_url}{bucket_path}/{filename}' 147 | return url 148 | 149 | 150 | def time_to_seconds(time_obj): 151 | return time_obj.hours * 3600 + time_obj.minutes * 60 + time_obj.seconds + time_obj.milliseconds / 1000 152 | 153 | 154 | llm_prompt = 'Given the input text, choose some important and meaningful words to highlight. Max 1-2 words per sentence. Return them as a python list.\nTEXT: {}' 155 | def highlight_words(input_str: str, event) -> str: 156 | input_str = llm_prompt.format(input_str) 157 | data = { 158 | "style": "LLM-Only", 159 | "stream": False, 160 | "messageContent": input_str, 161 | "agentId": 1548 162 | } 163 | resp = event.chat.messages(data=data) 164 | return resp 165 | 166 | 167 | def create_subtitle_clips(subtitles, videosize, fontsize=24, font='fonts/Caveat.ttf', color='yellow', bg_color='black', border_size=1.5, border_color="black", caption_position='bottom', bg_opacity=0.5, highlight_color=None, event=None): 168 | subtitle_clips = [] 169 | 170 | for subtitle in subtitles: 171 | start_time = time_to_seconds(subtitle.start) 172 | end_time = time_to_seconds(subtitle.end) 173 | duration = end_time - start_time 174 | 175 | video_width, video_height = videosize 176 | 177 | if border_size == 0 or border_size == 0.: 178 | border_color = None 179 | 180 | method = 'caption' 181 | if highlight_color is not None: 182 | # wrap the LLM-chosen words in Pango markup (https://docs.gtk.org/Pango/pango_markup.html) so they render in the highlight color 183 | important_words = ast.literal_eval(highlight_words(subtitle.text, event)) 184 | print('Important words:', important_words) 185 | for word in important_words: 186 | subtitle.text = subtitle.text.replace(word, f'<span foreground="{highlight_color}">{word}</span>') 187 | method = 'pango' 188 | subtitle.text = f'<span>{subtitle.text}</span>' 189 | 190 | text_clip = TextClip(subtitle.text, fontsize=fontsize, font=font, color=color, size=(video_width*3/4, None), method=method, stroke_color=border_color, stroke_width=border_size).set_start(start_time).set_duration(duration) 191 | 192 | # add bg color 193 | if bg_color in color_dict.keys(): 194 | im_width, im_height = text_clip.size 195 | color_clip = ColorClip(size=(int(im_width), int(im_height)), color=color_dict[bg_color]) 196 | color_clip = color_clip.set_opacity(bg_opacity).set_start(start_time).set_duration(duration) 197 | text_clip = CompositeVideoClip([color_clip, text_clip]) 198 | 199 | subtitle_x_position = 'center' 200 | y_position_dict = { 201 | 'center': 'center', 202 | 'bottom': video_height * 4/5, 203 | 'top': video_height * 1/5, 204 | } 205 | subtitle_y_position = y_position_dict[caption_position] 206 | 207 | text_position = (subtitle_x_position, subtitle_y_position) 208 | subtitle_clips.append(text_clip.set_position(text_position)) 209 | 210 | return subtitle_clips 211 | 212 | 213 | def mindsflow_function(event, context) -> dict: 214 | 215 | caption_url = event.get("caption_url") 216 | video_url = event.get("video_url") 217 | fontsize = event.get("font_size", 24) 218 | fontname = event.get("font_name", "SourceSerif4") 219 | text_color = event.get('text_color', 'white') 220 | bg_color = event.get('text_bg_color', 'black') 221 | bg_opacity = event.get('text_bg_opacity', 0.5) 222 | border_size = event.get('text_border_size', 1.)
223 | border_color = event.get('text_border_color', None) 224 | caption_position = event.get('caption_position', 'center') 225 | highlight_color = event.get('highlight_color', None) 226 | fontname = f'{font_dir}/{fontname}.ttf' 227 | 228 | mp4_path = video_url.split('/')[-1] 229 | caption_path = caption_url.split('/')[-1] 230 | download_file(video_url, mp4_path) 231 | download_file(caption_url, caption_path) 232 | 233 | if highlight_color not in color_dict.keys(): 234 | highlight_color = None 235 | 236 | # Load video and SRT file 237 | video = VideoFileClip(mp4_path) 238 | subtitles = pysrt.open(caption_path) 239 | 240 | # Set output path 241 | output_path = "video_with_captions_{}.mp4".format(''.join(random.choices(string.ascii_letters + string.digits, k=5))) 242 | 243 | # Create subtitle clips 244 | subtitle_clips = create_subtitle_clips(subtitles, video.size, fontsize, fontname, text_color, bg_color, border_size, border_color, caption_position, bg_opacity, highlight_color, event) 245 | 246 | # Add subtitles to the video 247 | final_video = CompositeVideoClip([video] + subtitle_clips) 248 | 249 | # Write output video file 250 | final_video.write_videofile(output_path) 251 | 252 | upload_url = upload_to_aws(output_path) 253 | os.remove(output_path) 254 | os.remove(caption_path) 255 | os.remove(mp4_path) 256 | 257 | result = { 258 | 'video_url': upload_url 259 | } 260 | 261 | return result 262 | 263 | -------------------------------------------------------------------------------- /agent-video-generator/functions/AddCaptionsToVideoOpenCV.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "AddCaptionsToVideoOpenCV", 4 | "displayName": "", 5 | "description": "The Python method is intended to download a video from a given URL, add captions to that downloaded video, upload the updated video to an S3 bucket, and return a URL for accessing the newly uploaded video.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "video_url", 10 | "json_caption" 11 | ], 12 | "properties": { 13 | "margin": { 14 | "type": "number", 15 | "description": "" 16 | }, 17 | "font_size": { 18 | "type": "number", 19 | "description": "" 20 | }, 21 | "font_type": { 22 | "type": "string", 23 | "description": "" 24 | }, 25 | "video_url": { 26 | "type": "string", 27 | "description": "URL of the video to be downloaded" 28 | }, 29 | "text_color": { 30 | "type": "string", 31 | "description": "" 32 | }, 33 | "border_color": { 34 | "type": "string", 35 | "description": "" 36 | }, 37 | "json_caption": { 38 | "type": "string", 39 | "description": "Captions to be added to the video" 40 | }, 41 | "max_caption_len": { 42 | "type": "number", 43 | "description": "" 44 | }, 45 | "caption_position": { 46 | "type": "string", 47 | "description": "" 48 | } 49 | } 50 | }, 51 | "outputPattern": { 52 | "type": "object", 53 | "required": [ 54 | "video_url" 55 | ], 56 | "properties": { 57 | "video_url": { 58 | "type": "string", 59 | "description": "The URL of the video uploaded to S3" 60 | } 61 | } 62 | }, 63 | "tag": "VideoCaptions", 64 | "testCases": [ 65 | { 66 | "margin": 0.1, 67 | "font_size": 30, 68 | "font_type": "default", 69 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/a78d8376-a5f9-413c-9624-b4eb7680357e_video_no_audio.mp4", 70 | "text_color": "white", 71 | "border_color": "black", 72 | "json_caption": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedBDbUzJ.json", 73 | 
"max_caption_len": 40, 74 | "caption_position": "threequarter" 75 | } 76 | ], 77 | "aiPrompt": "", 78 | "greeting": "" 79 | } 80 | $""" 81 | 82 | import json 83 | import cv2 84 | from moviepy.editor import VideoFileClip 85 | import boto3 86 | import os 87 | import time 88 | import random 89 | import string 90 | import requests 91 | import numpy as np 92 | from PIL import ImageFont, ImageDraw, Image 93 | 94 | 95 | def upload_to_aws(filename: str) -> str: 96 | # Uses your AWS credentials to access the service 97 | bucket_name = os.environ.get('bucket_name') 98 | region = os.environ.get('region') 99 | 100 | # Create a session using the provided credentials 101 | session = boto3.Session( 102 | aws_access_key_id=os.environ.get('access_key_id'), 103 | aws_secret_access_key=os.environ.get('secret_access_key') 104 | ) 105 | 106 | # Create an S3 client 107 | s3_client = session.client('s3') 108 | 109 | bucket_path = 'ai-video' 110 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 111 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 112 | url = f'{s3_base_url}{bucket_path}/{filename}' 113 | 114 | return url 115 | 116 | def download_file(url, save_path): 117 | response = requests.get(url) 118 | with open(save_path, 'wb') as file: 119 | file.write(response.content) 120 | 121 | def get_random_string(): 122 | letters = string.ascii_lowercase 123 | result_str = ''.join(random.choice(letters) for _ in range(8)) 124 | timestamp = int(time.time()) 125 | random_str = str(timestamp) + '_' + result_str 126 | return random_str 127 | 128 | # Define color dictionary for known colors 129 | color_dict = { 130 | 'black': (0, 0, 0), 131 | 'white': (255, 255, 255), 132 | 'red': (0, 0, 255), # Remember, in OpenCV it's BGR not RGB 133 | 'green': (0, 255, 0), 134 | 'blue': (255, 0, 0), 135 | 'yellow': (0, 255, 255) 136 | } 137 | 138 | 139 | # Define the dictionary for known font types 140 | font_dict = { 141 | 'chinese': 'NotoSansSC', 142 | 'default': 'SourceSerif4', 143 | } 144 | 145 | 146 | def wrap_text(caption, frame_width, font): 147 | words = caption.split(' ') 148 | lines = [words.pop(0)] # Initial 149 | for word in words: 150 | box = font.getbbox(lines[-1] + ' ' + word) 151 | text_width, text_height = box[2] - box[0], box[3] - box[1] 152 | if text_width > frame_width: 153 | lines.append(word) 154 | else: 155 | lines[-1] += ' ' + word 156 | return lines 157 | 158 | def add_captions(video_path, json_file_path, border_size=2, border_color='black', text_color='white', 159 | font_size=30, font_type='DUPLEX', caption_position='bottom', outfile="out.mp4", margin=0.1, 160 | font_dir=''): 161 | # Load video 162 | cap = cv2.VideoCapture(video_path) 163 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 164 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 165 | fps = int(cap.get(cv2.CAP_PROP_FPS)) 166 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 167 | 168 | # Load the JSON file with caption details 169 | with open(json_file_path, 'r') as f: 170 | captions = json.load(f) 171 | print(captions) 172 | 173 | # Get the specified color tuples 174 | border_color = color_dict[border_color.lower()] 175 | text_color = color_dict[text_color.lower()] 176 | # Get the specified font 177 | if font_type is None: 178 | font_type = 'default' 179 | if font_type in font_dict.keys(): 180 | font_type = font_dict[font_type] 181 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size) 182 | 183 | # Define the codec and create a VideoWriter object 184 | 
#fourcc_code = int(cap.get(cv2.CAP_PROP_FOURCC)) 185 | #fourcc_code = "".join([chr((fourcc_code >> 8 * i) & 0xFF) for i in range(4)]) 186 | fourcc_code = "vp90"  # VP9; whether it can be muxed into .mp4 depends on the local OpenCV/FFmpeg build ("mp4v" is a common fallback) 187 | fourcc = cv2.VideoWriter_fourcc(*fourcc_code) 188 | out = cv2.VideoWriter(outfile, fourcc, fps, (width, height)) 189 | 190 | frame_counter = 0 191 | caption_index = 0 192 | print('fps', fps) 193 | while(cap.isOpened()): 194 | ret, frame = cap.read() 195 | if ret: 196 | current_time = frame_counter * (1e7/fps) # Current timestamp in 100-nanosecond ticks, the unit used by the caption offsets 197 | print(current_time / 1e7, captions[caption_index], caption_index) 198 | print(frame_counter, caption_index) 199 | if current_time >= captions[caption_index]['end_time']: 200 | caption_index += 1 201 | # Check if there are no more captions 202 | if caption_index >= len(captions): 203 | break # If no more captions, exit loop 204 | 205 | img_pil = Image.fromarray(frame) 206 | draw = ImageDraw.Draw(img_pil) 207 | 208 | margin_rate = int(width * margin) 209 | 210 | lines = wrap_text(captions[caption_index]['sentence'], width - 2 * margin_rate, font) 211 | for i, line in enumerate(lines): 212 | box = font.getbbox(line) 213 | text_width, text_height = box[2] - box[0], box[3] - box[1] 214 | text_height = font_size * 1.3 215 | 216 | # Center the text 217 | textX = (width - text_width - margin_rate * 2) // 2 + margin_rate 218 | total_lines = len(lines) 219 | total_text_height = total_lines * text_height # The total height of text block 220 | 221 | # Position text as per given caption_position 222 | if caption_position.lower() == 'top': 223 | textY = margin_rate + (i * text_height) 224 | elif caption_position.lower() == 'bottom': 225 | textY = height - margin_rate - (len(lines) - i) * text_height 226 | elif caption_position.lower() == 'threequarter': 227 | three_quarter_height = height * 0.75 228 | textY = three_quarter_height - ((total_lines - i) * text_height) 229 | elif caption_position.lower() == 'onequarter': 230 | one_quarter_height = height * 0.25 231 | textY = one_quarter_height + ((i + 1) * text_height) 232 | else: # Default to center if unknown value 233 | textY = ((height - total_text_height) // 2) + (i * text_height) 234 | 235 | for k in range(-border_size, border_size+1): 236 | for j in range(-border_size, border_size+1): 237 | draw.text((textX+j, textY+k), line, font = font, fill = border_color) 238 | draw.text((textX, textY), line, font = font, fill = text_color) 239 | 240 | out.write(np.array(img_pil)) 241 | 242 | frame_counter += 1 243 | 244 | else: 245 | break 246 | 247 | cap.release() 248 | out.release() 249 | 250 | def mindsflow_function(event, context) -> dict: 251 | # get the video url and caption from the event 252 | video_url = event.get("video_url") 253 | captions_url = event.get("json_caption") 254 | caption_position = event.get("caption_position", "bottom") 255 | border_color = event.get("border_color", "black") 256 | text_color = event.get("text_color", "white") 257 | font_size = event.get("font_size", 30) 258 | max_caption_len = event.get("max_caption_len", 30) 259 | margin = event.get("margin", 0.1) 260 | font_type = event.get("font_type", 'default') 261 | 262 | download_path = "video_" + get_random_string() + ".mp4" 263 | out_path = "video_" + get_random_string() + ".mp4" 264 | download_file(video_url, download_path) 265 | 266 | json_path = "caption_" + get_random_string() + ".json" 267 | download_file(captions_url, json_path) 268 | 269 | # write the captioned video 270 | add_captions(download_path, 271 | json_file_path=json_path, 272 | outfile=out_path, 273
| caption_position=caption_position, 274 | border_color=border_color, 275 | text_color=text_color, 276 | font_size=font_size, 277 | margin=margin, 278 | font_type=font_type, 279 | font_dir = os.environ.get('font_dir') 280 | ) 281 | 282 | # upload the combined image to aws and save the url 283 | url = upload_to_aws(out_path) 284 | 285 | # define result 286 | result = { 287 | 'video_url': url 288 | } 289 | 290 | if os.path.exists(download_path): 291 | os.remove(download_path) 292 | if os.path.exists(json_path): 293 | os.remove(json_path) 294 | 295 | return result 296 | 297 | 298 | -------------------------------------------------------------------------------- /agent-video-generator/functions/AudioTranscriptionToSentences.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "AudioTranscriptionToSentences", 4 | "displayName": "", 5 | "description": "This method downloads a JSON file containing the transcription of an audio, including the start and duration of each word. It further splits the transcription into sentences and uses the JSON transcription to map the start and duration of each sentence.", 6 | "inputPattern": { 7 | "type": "object", 8 | "properties": { 9 | "add_punctuation": { 10 | "type": "boolean", 11 | "description": "" 12 | }, 13 | "split_all_punctuation": { 14 | "type": "boolean", 15 | "description": "" 16 | }, 17 | "transcription_json_url": { 18 | "type": "string", 19 | "description": "URL from where to download the json file." 20 | } 21 | }, 22 | "required": [ 23 | "split_all_punctuation", 24 | "transcription_json_url" 25 | ] 26 | }, 27 | "outputPattern": { 28 | "type": "object", 29 | "properties": { 30 | "text": { 31 | "type": "string", 32 | "description": "" 33 | }, 34 | "n_splits": { 35 | "type": "number", 36 | "description": "" 37 | }, 38 | "sentences_json_url": { 39 | "type": "string", 40 | "description": "URL to download JSON" 41 | } 42 | }, 43 | "required": [ 44 | "text", 45 | "n_splits", 46 | "sentences_json_url" 47 | ] 48 | }, 49 | "tag": "DataPreprocessing", 50 | "testCases": [ 51 | { 52 | "add_punctuation": false, 53 | "split_all_punctuation": false, 54 | "transcription_json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/audio_transcription_1703468432_yelpditk.json" 55 | } 56 | ], 57 | "aiPrompt": "", 58 | "greeting": "" 59 | } 60 | $""" 61 | 62 | import json 63 | import requests 64 | import boto3 65 | import time 66 | import random 67 | import string 68 | import os 69 | import nltk 70 | import jieba 71 | import re 72 | import regex 73 | 74 | def download_file(url, save_path): 75 | response = requests.get(url) 76 | with open(save_path, 'wb') as file: 77 | file.write(response.content) 78 | 79 | def get_random_string(): 80 | letters = string.ascii_lowercase 81 | result_str = ''.join(random.choice(letters) for _ in range(8)) 82 | timestamp = int(time.time()) 83 | random_str = str(timestamp) + '_' + result_str 84 | return random_str 85 | 86 | def upload_to_aws(filename: str) -> str: 87 | # Uses your AWS credentials to access the service 88 | bucket_name = os.environ.get('bucket_name') 89 | region = os.environ.get('region') 90 | 91 | # Create a session using the provided credentials 92 | session = boto3.Session( 93 | aws_access_key_id=os.environ.get('access_key_id'), 94 | aws_secret_access_key=os.environ.get('secret_access_key') 95 | ) 96 | 97 | # Create an S3 client 98 | s3_client = session.client('s3') 99 | 100 | bucket_path = 'ai-video' 101 | 
s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 102 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 103 | url = f'{s3_base_url}{bucket_path}/{filename}' 104 | 105 | return url 106 | 107 | 108 | light_punctuation = [',', ","] 109 | 110 | 111 | def divide_string(words, words2, split_all_punctuation=True): 112 | substrings = [] 113 | substrings2 = [] 114 | substring_len = [] 115 | 116 | current_substring = "" 117 | current_substring2 = "" 118 | cur_substring_len = 0 119 | 120 | punctuation = [".", "!", "?", ";", "。", "!", "?", ";"] 121 | if split_all_punctuation is True: 122 | punctuation += light_punctuation 123 | for i, word in enumerate(words): 124 | if word[-1] in punctuation: 125 | #print(word, current_substring) 126 | cur_substring_len += 1 127 | if regex.match(r'\p{Script=Han}', word): 128 | current_substring += "" + word 129 | current_substring2 += "" + words2[i] 130 | else: 131 | current_substring += " " + word 132 | current_substring2 += " " + words2[i] 133 | substrings.append(current_substring.strip()) 134 | substrings2.append(current_substring2.strip()) 135 | current_substring = "" 136 | current_substring2 = "" 137 | substring_len.append(cur_substring_len) 138 | cur_substring_len = 0 139 | else: 140 | cur_substring_len += 1 141 | if regex.match(r'\p{Script=Han}', word): 142 | current_substring += "" + word 143 | current_substring2 += "" + words2[i] 144 | else: 145 | current_substring += " " + word 146 | current_substring2 += " " + words2[i] 147 | 148 | if current_substring: # If there's anything left, append it to the list 149 | substrings.append(current_substring.strip()) 150 | substrings2.append(current_substring2.strip()) 151 | substring_len.append(cur_substring_len) 152 | 153 | return substrings, substrings2, substring_len 154 | 155 | llm_prompt = '''split this text into smaller sentences 156 | TEXT: {}''' 157 | def llm_add_puntuaction(input_str: str, event) -> str: 158 | data = { 159 | "style": "LLM-Only", 160 | "stream": False, 161 | "messageContent": llm_prompt.format(input_str), 162 | "agentId": 964 163 | } 164 | resp = event.chat.messages(data=data) 165 | return resp 166 | 167 | def get_sentence_time(json_file_path, event, split_all_punctuation=True, add_punctuation=False): 168 | # Load JSON data from a file 169 | with open(json_file_path, 'r') as f: 170 | data = json.load(f) 171 | 172 | # Get display text and split into sentences 173 | display_lexical = data['Lexical'].strip() 174 | display_text = data['Display'].strip().replace('.', '. 
') 175 | 176 | if add_punctuation: 177 | display_text = llm_add_puntuaction(display_text, event) # to test 178 | 179 | lexical_list = display_lexical.split() 180 | text_list = display_text.split() 181 | print(len(lexical_list), lexical_list) 182 | print(len(text_list), text_list) 183 | 184 | def n_split_str(str_, n): 185 | words = str_.split() 186 | return [' '.join(words[i:i+n]) for i in range(0, len(words), n)] 187 | def count_words(sentences): 188 | return [len(sentence.split()) for sentence in sentences] 189 | 190 | if len(lexical_list) != len(text_list): 191 | sentences_text = n_split_str(display_lexical, 10) 192 | sentences_clean = sentences_text 193 | substring_len_text = count_words(sentences_text) 194 | substring_len_lexical = substring_len_text 195 | else: 196 | sentences_text, sentences_clean, substring_len_text = divide_string(text_list, lexical_list, split_all_punctuation) 197 | substring_len_lexical = substring_len_text 198 | 199 | print(sentences_clean) 200 | print(substring_len_text ,sentences_text) 201 | 202 | # Map words to their times 203 | words = [{'Word': w['Word'], 'Index': index, 'Offset': w['Offset'], 'Duration': w['Duration']} for index, w in enumerate(data['Words'])] 204 | #print(words) 205 | 206 | sentence_times = [] 207 | 208 | index = 0 209 | for i, sentence in enumerate(sentences_text): 210 | start_time = words[index]['Offset'] 211 | index += substring_len_lexical[i] - 1 212 | end_time = words[index]['Offset'] + words[index]['Duration'] 213 | duration = end_time - start_time 214 | index += 1 215 | #print(duration) 216 | final_sentence = sentences_text[i] 217 | while final_sentence[-1] in light_punctuation: 218 | final_sentence = final_sentence[:-1] 219 | 220 | sentence_times.append({ 221 | 'sentence': final_sentence, 222 | 'start_time': start_time, 223 | 'end_time': end_time, 224 | 'duration': duration 225 | }) 226 | 227 | return sentence_times, display_text 228 | 229 | 230 | def mindsflow_function(event, context) -> dict: 231 | 232 | url = event.get('transcription_json_url') 233 | split_all_punctuation = event.get('split_all_punctuation', True) 234 | add_punctuation = event.get('add_punctuation', False) 235 | 236 | transcription_path = 'transcript_{}.json'.format(get_random_string()) 237 | download_file(url, transcription_path) 238 | 239 | sentence_times, text = get_sentence_time(transcription_path, event, split_all_punctuation, add_punctuation) 240 | 241 | output_file = 'sentence_times_{}.json'.format(get_random_string()) 242 | with open(output_file, 'w') as f: 243 | json.dump(sentence_times, f) 244 | 245 | url = upload_to_aws(output_file) 246 | 247 | result = { 248 | 'sentences_json_url': url, 249 | 'text': text, 250 | 'n_splits': len(sentence_times) 251 | } 252 | 253 | if os.path.exists(transcription_path): 254 | os.remove(transcription_path) 255 | if os.path.exists(output_file): 256 | os.remove(output_file) 257 | 258 | return result 259 | 260 | -------------------------------------------------------------------------------- /agent-video-generator/functions/CommandsExecution.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "CommandsExecution", 4 | "displayName": "", 5 | "description": "CommandsExecution", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [], 9 | "properties": {} 10 | }, 11 | "outputPattern": { 12 | "type": "object", 13 | "required": [], 14 | "properties": {} 15 | }, 16 | "tag": "Example", 17 | "testCases": [ 18 | {} 19 | ], 20 | "aiPrompt": "", 21 | "greeting": "" 22 
| } 23 | $""" 24 | 25 | import json 26 | 27 | def mindsflow_function(event, context) -> dict: 28 | """ 29 | This is the main function that processes an event within a given context. 30 | 31 | Args: 32 | event (class Event): Containing mindsflow internal api and request information. 33 | case1: event.get("param") # inference parameters 34 | case2: event.chat.messages(data) # call mindsflow api 35 | context (class Context): Containing execution context and additional environment information. 36 | 37 | Returns: 38 | dict: A result dictionary meeting the Output Pattern. 39 | """ 40 | import zipfile 41 | import subprocess 42 | 43 | '''def unzip_folder(path_to_zip_file, directory_to_extract_to): 44 | with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref: 45 | zip_ref.extractall(directory_to_extract_to)''' 46 | 47 | # usage 48 | #unzip_folder("fonts.zip", "fonts") 49 | 50 | def execute_command(command): 51 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE) 52 | output, error = process.communicate() 53 | execute_command("pip uninstall spleeter") 54 | 55 | result = { 56 | 'data': 'Hello, MindsFlow User!' 57 | } 58 | 59 | return result 60 | 61 | -------------------------------------------------------------------------------- /agent-video-generator/functions/ConvertSrtToAss.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "ConvertSrtToAss", 4 | "displayName": "", 5 | "description": "Converts srt file to ass file", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "srt_url" 10 | ], 11 | "properties": { 12 | "shadow": { 13 | "type": "number", 14 | "description": "" 15 | }, 16 | "marginl": { 17 | "type": "integer", 18 | "description": "" 19 | }, 20 | "marginr": { 21 | "type": "integer", 22 | "description": "" 23 | }, 24 | "marginv": { 25 | "type": "integer", 26 | "description": "" 27 | }, 28 | "outline": { 29 | "type": "integer", 30 | "description": "" 31 | }, 32 | "srt_url": { 33 | "type": "string", 34 | "description": "" 35 | }, 36 | "fontname": { 37 | "type": "string", 38 | "description": "arial" 39 | }, 40 | "fontsize": { 41 | "type": "integer", 42 | "description": "" 43 | } 44 | } 45 | }, 46 | "outputPattern": { 47 | "type": "object", 48 | "required": [ 49 | "ass_url" 50 | ], 51 | "properties": { 52 | "ass_url": { 53 | "type": "string", 54 | "description": "" 55 | } 56 | } 57 | }, 58 | "tag": "VideoCaptions", 59 | "testCases": [ 60 | { 61 | "shadow": 0, 62 | "marginl": 0, 63 | "marginr": 0, 64 | "marginv": 0, 65 | "outline": 0, 66 | "srt_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/89z79p.srt", 67 | "fontname": "文泉驿正黑", 68 | "fontsize": 0 69 | } 70 | ], 71 | "aiPrompt": "", 72 | "greeting": "" 73 | } 74 | $""" 75 | 76 | import json 77 | import boto3 78 | import os 79 | import uuid 80 | import requests 81 | import pysubs2 82 | 83 | 84 | def download_file(url, filename): 85 | response = requests.get(url) 86 | file = open(filename, 'wb') 87 | file.write(response.content) 88 | file.close() 89 | 90 | s3_client = boto3.client('s3') 91 | 92 | def upload_to_aws(filename: str) -> str: 93 | # Uses your AWS credentials to access the service 94 | bucket_name = os.environ.get('bucket_name') 95 | region = os.environ.get('region') 96 | # Create a session using the provided credentials 97 | session = boto3.Session( 98 | aws_access_key_id=os.environ.get('access_key_id'), 99 | aws_secret_access_key=os.environ.get('secret_access_key') 100 | ) 101 | # Create an S3 client 102 | 
s3_client = session.client('s3') 103 | bucket_path = 'ai-video' 104 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 105 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 106 | url = f'{s3_base_url}{bucket_path}/{filename}' 107 | return url 108 | 109 | 110 | def convert_srt_to_ass(srt_path: str, ass_path: str, fontname='Arial', fontsize=16, marginl=10, marginv=10, marginr=10, outline=0, shadow=0): 111 | subs = pysubs2.load(srt_path, encoding="utf-8") 112 | for line in subs: 113 | line.style = "my_style" 114 | subs.styles["my_style"] = pysubs2.SSAStyle(fontname=fontname, fontsize=fontsize, 115 | marginl=marginl, marginr=marginr, 116 | marginv=marginv, outline=outline, 117 | shadow=shadow) 118 | subs.save(ass_path) 119 | 120 | 121 | def mindsflow_function(event, context) -> dict: 122 | srt_url = event.get("srt_url") 123 | fontname = event.get("fontname", "Arial") 124 | fontsize = event.get("fontsize", 10) 125 | marginl = event.get("marginl", 20) 126 | marginr = event.get("marginr", 20) 127 | marginv = event.get("marginv", 10) 128 | outline = event.get("outline", 1) 129 | shadow = event.get("shadow", 0) 130 | 131 | file_name = srt_url.split('/')[-1].split('.')[0] 132 | srt_file = f"{file_name}.srt" 133 | ass_file = f"{file_name}.ass" 134 | download_file(srt_url, srt_file) 135 | 136 | convert_srt_to_ass(srt_file, ass_file, fontname, fontsize, marginl, marginv, marginr, outline, shadow) 137 | 138 | upload_url = upload_to_aws(ass_file) 139 | 140 | os.remove(srt_file) 141 | os.remove(ass_file) 142 | 143 | result = { 144 | 'ass_url': upload_url 145 | } 146 | 147 | return result 148 | 149 | -------------------------------------------------------------------------------- /agent-video-generator/functions/MindsflowAgent.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "MindsflowAgent", 4 | "displayName": "", 5 | "description": "Example of how to invoke Mindsflow agent", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "input_str" 10 | ], 11 | "properties": { 12 | "input_str": { 13 | "type": "string", 14 | "description": "The input string to be translated" 15 | } 16 | } 17 | }, 18 | "outputPattern": { 19 | "type": "object", 20 | "required": [ 21 | "translated_text" 22 | ], 23 | "properties": { 24 | "translated_text": { 25 | "type": "string", 26 | "description": "translation result" 27 | } 28 | } 29 | }, 30 | "tag": "Example", 31 | "testCases": [ 32 | { 33 | "input_str": "hello" 34 | } 35 | ], 36 | "aiPrompt": "aa", 37 | "greeting": "" 38 | } 39 | $""" 40 | 41 | import json 42 | 43 | def translate_text(input_str: str, event) -> str: 44 | data = { 45 | "style": "LLM-Only", 46 | "stream": False, 47 | "messageContent": input_str, 48 | "agentId": 739 49 | } 50 | 51 | resp = event.chat.messages(data=data) 52 | 53 | return resp 54 | 55 | def mindsflow_function(event, context) -> dict: 56 | # get the input string from the event 57 | input_str = event.get("input_str") 58 | 59 | # get the translation result 60 | translated_text = translate_text(input_str, event) 61 | 62 | # define result 63 | result = { 64 | 'translated_text': translated_text 65 | } 66 | 67 | return result 68 | 69 | -------------------------------------------------------------------------------- /agent-video-generator/functions/MusicGeneration.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "MusicGeneration", 4 | "displayName": "", 5 | "description": 
"Generate music from prompt", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "music_prompt" 10 | ], 11 | "properties": { 12 | "seed": { 13 | "type": "integer", 14 | "description": "" 15 | }, 16 | "duration": { 17 | "type": "number", 18 | "description": "" 19 | }, 20 | "temperature": { 21 | "type": "number", 22 | "description": "" 23 | }, 24 | "music_prompt": { 25 | "type": "string", 26 | "description": "" 27 | } 28 | } 29 | }, 30 | "outputPattern": { 31 | "type": "object", 32 | "required": [ 33 | "music_url" 34 | ], 35 | "properties": { 36 | "music_url": { 37 | "type": "string", 38 | "description": "" 39 | } 40 | } 41 | }, 42 | "tag": "VideoGeneration", 43 | "testCases": [ 44 | { 45 | "seed": -1, 46 | "duration": 4.9, 47 | "temperature": 1, 48 | "music_prompt": "Create a classical music piece" 49 | } 50 | ], 51 | "aiPrompt": "", 52 | "greeting": "" 53 | } 54 | $""" 55 | 56 | import json 57 | from os import path 58 | import math 59 | 60 | '''import subprocess 61 | command = 'pip install replicate' 62 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True)''' 63 | import replicate 64 | 65 | 66 | # Function to create a short music 67 | def create_music(prompt: str, duration: int=15, temperature: float=1, seed: int=-1) -> str: 68 | output = replicate.run( 69 | "meta/musicgen:7be0f12c54a8d033a0fbd14418c9af98962da9a86f5ff7811f9b3423a1f0b7d7", 70 | input={"model_version": "large", 71 | "prompt": prompt, 72 | "duration": duration, 73 | "temperature": temperature, 74 | "seed": seed} 75 | ) 76 | 77 | return output 78 | 79 | def mindsflow_function(event, context) -> dict: 80 | # get the prompt from the event 81 | prompt = event.get("music_prompt") 82 | duration = event.get("duration", 15) 83 | duration = min(duration, 28) 84 | temperature = event.get("temperature", 1) 85 | seed = event.get("seed", -1) 86 | if isinstance(duration, float): 87 | duration = math.ceil(duration) # Convert to int and approximate by excess 88 | 89 | # get the music URL 90 | music_url = create_music(prompt, duration, temperature, seed) 91 | 92 | # define result 93 | result = { 94 | 'music_url': music_url 95 | } 96 | 97 | return result 98 | 99 | -------------------------------------------------------------------------------- /agent-video-generator/functions/ShowFonts.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "ShowFonts", 4 | "displayName": "", 5 | "description": "Show fonts", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [], 9 | "properties": {} 10 | }, 11 | "outputPattern": { 12 | "type": "object", 13 | "required": [], 14 | "properties": {} 15 | }, 16 | "tag": "VideoCaptions", 17 | "testCases": [ 18 | {} 19 | ], 20 | "aiPrompt": "", 21 | "greeting": "" 22 | } 23 | $""" 24 | 25 | import json 26 | from moviepy.editor import TextClip 27 | import os 28 | from PIL import Image, ImageDraw, ImageFont 29 | from moviepy.editor import concatenate 30 | 31 | FOLDER = 'fonts' # specify the correct path 32 | 33 | def mindsflow_function(event, context) -> dict: 34 | 35 | WIDTH, HEIGHT = 500, 500 # specify dimensions of each image 36 | BG_COLOR = (0, 0, 0) # background color 37 | 38 | # Create images with each font 39 | images = [] 40 | for file in os.listdir(FOLDER): 41 | if file.endswith(".ttf"): 42 | font = ImageFont.truetype(os.path.join(FOLDER, file), 50) 43 | image = Image.new('RGB', (WIDTH, HEIGHT), color=BG_COLOR) 44 | draw = ImageDraw.Draw(image) 45 | 46 | text = 
'{}'.format(file.replace('.ttf', '')) 47 | x = 10 48 | y = 150 49 | 50 | draw.text((x, y), text, fill=(255,255,255), font=font) 51 | images.append(image) 52 | 53 | # Calculate the grid size - 6 images per row 54 | rows = len(images) // 6 55 | if len(images) % 6: 56 | rows += 1 57 | 58 | # Concatenate all images into grid 59 | concat_image = Image.new('RGB', (WIDTH * 6, HEIGHT * rows), BG_COLOR) 60 | 61 | x_offset = 0 62 | y_offset = 0 63 | for i, img in enumerate(images): 64 | concat_image.paste(img, (x_offset, y_offset)) 65 | if (i+1) % 6 == 0: 66 | x_offset = 0 67 | y_offset += HEIGHT 68 | else: 69 | x_offset += WIDTH 70 | concat_image.save(f'{FOLDER}/fonts.jpg') 71 | 72 | result = { 73 | 'fonts': f'{FOLDER}/fonts.jpg' 74 | } 75 | 76 | return result 77 | 78 | -------------------------------------------------------------------------------- /agent-video-generator/functions/UploadResultZipS3.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "UploadResultZipS3", 4 | "displayName": "", 5 | "description": "UploadResultZipS3", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "video_url", 10 | "title", 11 | "first_frame_url", 12 | "script", 13 | "description" 14 | ], 15 | "properties": { 16 | "title": { 17 | "type": "string", 18 | "description": "" 19 | }, 20 | "script": { 21 | "type": "string", 22 | "description": "" 23 | }, 24 | "video_url": { 25 | "type": "string", 26 | "description": "" 27 | }, 28 | "description": { 29 | "type": "string", 30 | "description": "" 31 | }, 32 | "first_frame_url": { 33 | "type": "string", 34 | "description": "" 35 | }, 36 | "video_url_no_music": { 37 | "type": "string", 38 | "description": "" 39 | } 40 | } 41 | }, 42 | "outputPattern": { 43 | "type": "object", 44 | "required": [ 45 | "result_url" 46 | ], 47 | "properties": { 48 | "result_url": { 49 | "type": "string", 50 | "description": "" 51 | } 52 | } 53 | }, 54 | "tag": "UploadVideo", 55 | "testCases": [ 56 | { 57 | "script": "hello", 58 | "title": "title of the 对的 video 沙发", 59 | "first_frame_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/img_1697717278_uakysssz.png", 60 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1697717270_bmvbdbul.mp4", 61 | "description": "a video about something", 62 | "video_url_no_music": "" 63 | } 64 | ], 65 | "aiPrompt": "Upload result S3", 66 | "greeting": "" 67 | } 68 | $""" 69 | 70 | import os 71 | import urllib.request 72 | import json 73 | import shutil 74 | import boto3 75 | import unicodedata 76 | import random 77 | import string 78 | 79 | # Auxiliary function to download video and image 80 | def download_file(url, path): 81 | try: 82 | urllib.request.urlretrieve(url, path) 83 | return True 84 | except Exception as e: 85 | print(f"An error occurred while downloading the file. Error: {str(e)}") 86 | return False 87 | 88 | # Auxiliary function to write description and title in txt files 89 | def write_txt_file(content, path): 90 | try: 91 | with open(path, 'w') as f: 92 | f.write(content) 93 | return True 94 | except Exception as e: 95 | print(f"An error occurred while writing the text file. 
Error: {str(e)}") 96 | return False 97 | 98 | def upload_to_aws(filename: str) -> str: 99 | # Uses your AWS credentials to access the service 100 | bucket_name = os.environ.get('bucket_name') 101 | region = os.environ.get('region') 102 | 103 | # Create a session using the provided credentials 104 | session = boto3.Session( 105 | aws_access_key_id=os.environ.get('access_key_id'), 106 | aws_secret_access_key=os.environ.get('secret_access_key') 107 | ) 108 | 109 | # Create an S3 client 110 | s3_client = session.client('s3') 111 | 112 | bucket_path = 'video-results' 113 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 114 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 115 | url = f'{s3_base_url}{bucket_path}/{filename}' 116 | 117 | return url 118 | 119 | # Auxiliary function to create a folder, download the files and then zip the folder 120 | def prepare_files(event): 121 | video_url = event.get("video_url") 122 | image_url = event.get("first_frame_url") 123 | video_title = event.get("title") 124 | video_description = event.get("description") 125 | text = event.get("script") 126 | video_url_no_music = event.get("video_url_no_music", None) 127 | 128 | print(video_title) 129 | video_title_original = video_title 130 | if not video_title.isascii(): 131 | video_title = ''.join(random.choices(string.ascii_letters + string.digits, k=12)) 132 | if len(video_title) > 30: 133 | video_title = video_title[:30] 134 | video_title = video_title.replace(" ", "_") 135 | print(video_title) 136 | 137 | if not os.path.exists(video_title): 138 | os.makedirs(video_title) 139 | 140 | video_path = f"{video_title}/video.{video_url.split('.')[-1]}" 141 | download_file(video_url, video_path) 142 | img_path = f"{video_title}/first_frame.{image_url.split('.')[-1]}" 143 | download_file(image_url, img_path) 144 | 145 | write_txt_file(video_description, f"{video_title}/description.txt") 146 | write_txt_file(video_title_original, f"{video_title}/{video_title}.txt") 147 | write_txt_file(text, f"{video_title}/text.txt") 148 | if video_url_no_music is not None: 149 | write_txt_file(video_url_no_music, f"{video_title}/video_url_no_music.txt") 150 | 151 | shutil.make_archive(video_title, 'zip', video_title) 152 | url = upload_to_aws(f"{video_title}.zip") 153 | 154 | os.remove(video_path) 155 | os.remove(img_path) 156 | os.remove(f"{video_title}.zip") 157 | shutil.rmtree(video_title) 158 | 159 | return url 160 | 161 | # Main function 162 | def mindsflow_function(event, context) -> dict: 163 | 164 | # prepare files and upload to S3 165 | s3_url = prepare_files(event) 166 | 167 | # define result 168 | result = { 169 | 'result_url': s3_url 170 | } 171 | 172 | return result 173 | -------------------------------------------------------------------------------- /agent-video-generator/functions/addAudioSegmentsToVideo.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "addAudioSegmentsToVideo", 4 | "displayName": "", 5 | "description": "Add audio segments to video", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "voice", 10 | "json_url", 11 | "video_url", 12 | "audio_folder", 13 | "use_original_voice" 14 | ], 15 | "properties": { 16 | "voice": { 17 | "type": "string", 18 | "description": "" 19 | }, 20 | "json_url": { 21 | "type": "string", 22 | "description": "" 23 | }, 24 | "video_url": { 25 | "type": "string", 26 | "description": "" 27 | }, 28 | "audio_folder": { 29 | "type": "string", 30 | "description": 
"" 31 | }, 32 | "use_original_voice": { 33 | "type": "boolean", 34 | "description": "" 35 | } 36 | } 37 | }, 38 | "outputPattern": { 39 | "type": "object", 40 | "required": [ 41 | "video_url" 42 | ], 43 | "properties": { 44 | "video_url": { 45 | "type": "string", 46 | "description": "" 47 | } 48 | } 49 | }, 50 | "tag": "TextToSpeech", 51 | "testCases": [ 52 | { 53 | "voice": "d8369f1b-588b-40b2-8009-3511630bff13_audio", 54 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAOjUGH.json", 55 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/0ea5ed8d-795e-4120-993d-62bb9ba70920_video_no_audio.mp4", 56 | "audio_folder": "test", 57 | "use_original_voice": false 58 | }, 59 | { 60 | "voice": "d8369f1b-588b-40b2-8009-3511630bff13_audio", 61 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAEAQmF.json", 62 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/a53247d5-055c-464e-bdc3-242369f1ff46_video_no_audio.mp4", 63 | "audio_folder": "test", 64 | "use_original_voice": false 65 | }, 66 | { 67 | "voice": "zh-CN-YunfengNeural", 68 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedDLYYSi.json", 69 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/combine_8d43656c-2e0c-48cd-a4b6-d8c1c0336740.mp4", 70 | "audio_folder": "test", 71 | "use_original_voice": false 72 | } 73 | ], 74 | "aiPrompt": "", 75 | "greeting": "" 76 | } 77 | $""" 78 | 79 | import json 80 | import moviepy.editor as mpy 81 | import os 82 | import requests 83 | from pydub import AudioSegment 84 | import shutil 85 | import boto3 86 | 87 | 88 | def download_file(url, filename): 89 | if not os.path.exists(filename): 90 | res = requests.get(url) 91 | with open(filename, "wb") as f: 92 | f.write(res.content) 93 | else: 94 | print(f"The file {filename} already exists.") 95 | 96 | 97 | def get_captions_from_url(url): 98 | filename = f"{url.split('/')[-1]}" 99 | # download the json file 100 | download_file(url, filename) 101 | # read the contents 102 | with open(filename, 'r', encoding='utf-8') as f: 103 | captions = json.load(f) 104 | return captions, filename 105 | 106 | 107 | def upload_to_aws(filename: str, bucket_path = None) -> str: 108 | bucket_name = os.environ.get('bucket_name') 109 | region = os.environ.get('region') 110 | session = boto3.Session( 111 | aws_access_key_id=os.environ.get('access_key_id'), 112 | aws_secret_access_key=os.environ.get('secret_access_key') 113 | ) 114 | s3_client = session.client('s3') 115 | if bucket_path is None: 116 | bucket_path = 'ai-video' 117 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 118 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 119 | url = f'{s3_base_url}{bucket_path}/{filename}' 120 | return url 121 | 122 | 123 | def delete_from_aws(filename: str, bucket_path=None): 124 | bucket_name = os.environ.get('bucket_name') 125 | session = boto3.Session( 126 | aws_access_key_id=os.environ.get('access_key_id'), 127 | aws_secret_access_key=os.environ.get('secret_access_key') 128 | ) 129 | s3_client = session.client('s3') 130 | if bucket_path is None: 131 | bucket_path = 'ai-video' 132 | # Now delete the file after upload 133 | s3_client.delete_object(Bucket=bucket_name, Key=f"{bucket_path}/{filename}") 134 | 135 | 136 | unit_time = 10000000 137 | 138 | 139 | def 
combine_video_audio(video_path: str, captions, audio_folder: str, api_data: dict, voice_clone_url: str, use_original_voice: bool = False) -> str: 140 | # get the video 141 | video = mpy.VideoFileClip(video_path) 142 | audio_tracks = [] 143 | 144 | # loop over all the start times 145 | for i, cap in enumerate(captions): 146 | # start time of audio 147 | start_time = cap['start_time'] / unit_time 148 | audio_path = f"{audio_folder}/audio_segment_{i+1}.wav" 149 | 150 | print(f'Processing audio {i+1} | Start time {start_time} | {audio_path}') 151 | 152 | if use_original_voice: 153 | audio_url = upload_to_aws(audio_path, bucket_path='temp-audio') 154 | headers = {'Content-Type': 'application/json'} 155 | api_data['audio_url'] = audio_url 156 | response = requests.post(voice_clone_url, data=json.dumps(api_data), headers=headers) 157 | if response.status_code != 200: 158 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}') 159 | audio_path = f'{audio_folder}/gen_voice_{i+1}.wav' 160 | print('use original voice', audio_path) 161 | with open(audio_path, 'wb') as file: 162 | file.write(response.content) 163 | delete_from_aws(f"{audio_folder}/audio_segment_{i+1}.wav", bucket_path='temp-audio') # delete the uploaded source segment, not the generated file 164 | 165 | # load newly created voice track as an AudioFileClip 166 | new_audio = mpy.AudioFileClip(audio_path) 167 | # set start time for this audio segment 168 | new_audio = new_audio.set_start(start_time) 169 | # add this audio to the audio_tracks list 170 | audio_tracks.append(new_audio) 171 | 172 | print('Writing video...') 173 | # composite all the generated audio tracks into a single track 174 | final_audio = mpy.CompositeAudioClip(audio_tracks) 175 | # build final video with new audio track 176 | video = video.set_audio(final_audio) 177 | new_video_path = f"combine_{video_path}" 178 | if '_video_no_audio' in new_video_path: 179 | new_video_path = new_video_path.replace('_video_no_audio', '') 180 | video.write_videofile(new_video_path, audio_codec='aac') 181 | return new_video_path 182 | 183 | 184 | def mindsflow_function(event, context) -> dict: 185 | video_url = event.get("video_url") 186 | json_url = event.get("json_url") 187 | audio_folder = event.get("audio_folder") 188 | voice = event.get('voice') 189 | use_original_voice = event.get('use_original_voice') 190 | api_ip = os.environ.get('api_ip') 191 | 192 | video_path = video_url.split('/')[-1] 193 | download_file(video_url, video_path) 194 | print(f'Video downloaded from {video_url}') 195 | captions, json_name = get_captions_from_url(json_url) 196 | 197 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/" 198 | 199 | api_data = { 200 | "audio_url": None, 201 | "voice": voice, 202 | "clean_noise": False 203 | } 204 | 205 | # combine the audio segments with the video 206 | new_video_path = combine_video_audio(video_path, captions, audio_folder, api_data, voice_clone_url, use_original_voice) 207 | result_video = upload_to_aws(new_video_path) 208 | 209 | # delete local files after use 210 | os.remove(video_path) 211 | os.remove(new_video_path) 212 | os.remove(json_name) 213 | #shutil.rmtree(audio_folder) 214 | 215 | # define result 216 | result = { 217 | 'video_url': result_video 218 | } 219 | 220 | return result 221 | -------------------------------------------------------------------------------- /agent-video-generator/functions/addSoundToVideo.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "addSoundToVideo", 4 | "displayName": "", 5 | "description": "The method is designed to add sound to 
a video file.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "audio_url", 10 | "video_url" 11 | ], 12 | "properties": { 13 | "volume": { 14 | "type": "number", 15 | "description": "" 16 | }, 17 | "audio_url": { 18 | "type": "string", 19 | "description": "URL of the audio to be downloaded" 20 | }, 21 | "video_url": { 22 | "type": "string", 23 | "description": "URL of the video to be downloaded" 24 | }, 25 | "repeat_audio": { 26 | "type": "boolean", 27 | "description": "" 28 | } 29 | } 30 | }, 31 | "outputPattern": { 32 | "type": "object", 33 | "required": [ 34 | "video_url" 35 | ], 36 | "properties": { 37 | "video_url": { 38 | "type": "string", 39 | "description": "The URL of the video file with background music added and uploaded to S3" 40 | } 41 | } 42 | }, 43 | "tag": "DataPreprocessing", 44 | "testCases": [ 45 | { 46 | "volume": 0.5, 47 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/7d670407-7729-4db1-b468-6ca545051de5_audio/accompaniment.wav", 48 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/combine_e37e383a-00db-46d9-a5fa-d9dbfa5e760c.mp4", 49 | "repeat_audio": false 50 | } 51 | ], 52 | "aiPrompt": "addSoundToVideo", 53 | "greeting": "" 54 | } 55 | $""" 56 | 57 | import json 58 | import requests 59 | import moviepy.editor as mpy 60 | import boto3 61 | import time 62 | import random 63 | import string 64 | import os 65 | from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip, concatenate_audioclips 66 | from moviepy.audio.AudioClip import AudioArrayClip 67 | import numpy as np 68 | from moviepy.audio.fx.all import volumex 69 | 70 | def download_file(url: str, save_as: str) -> None: 71 | response = requests.get(url, stream=True) 72 | with open(save_as, 'wb') as f: 73 | for chunk in response.iter_content(chunk_size=1024): 74 | if chunk: 75 | f.write(chunk) 76 | 77 | def get_random_string(): 78 | letters = string.ascii_lowercase 79 | result_str = ''.join(random.choice(letters) for _ in range(8)) 80 | timestamp = int(time.time()) 81 | random_str = str(timestamp) + '_' + result_str 82 | return random_str 83 | 84 | def upload_to_aws(filename: str) -> str: 85 | # Uses your AWS credentials to access the service 86 | bucket_name = os.environ.get('bucket_name') 87 | region = os.environ.get('region') 88 | 89 | # Create a session using the provided credentials 90 | session = boto3.Session( 91 | aws_access_key_id=os.environ.get('access_key_id'), 92 | aws_secret_access_key=os.environ.get('secret_access_key') 93 | ) 94 | 95 | # Create an S3 client 96 | s3_client = session.client('s3') 97 | 98 | bucket_path = 'ai-video' 99 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 100 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 101 | url = f'{s3_base_url}{bucket_path}/{filename}' 102 | 103 | return url 104 | 105 | def add_background_music(video_file_path: str, audio_file_path: str, output_file_path: str, repeat_audio: bool=True, pause: float=1.0, volume: float=1.0) -> None: 106 | video = VideoFileClip(video_file_path) # Existing Video File 107 | existing_audio = video.audio # Existing Audio in Video File 108 | new_audio = AudioFileClip(audio_file_path) # New Audio File 109 | new_audio = new_audio.fx(volumex, volume) # Adjusting the volume of the new audio 110 | if repeat_audio: 111 | # Duration for the silent clip 112 | fps = 44100 113 | audio_array = np.zeros((int(pause*fps), 2)) 114 | cl_silent = AudioArrayClip(audio_array, 
fps=fps) 115 | cl_silent.write_audiofile('silent.wav') 116 | audio_clips = [new_audio] 117 | silent_audio = AudioFileClip('silent.wav') 118 | # append audio clips until their total duration is greater than the video 119 | while sum(clip.duration for clip in audio_clips) < video.duration: 120 | audio_clips.extend([new_audio, silent_audio]) 121 | new_audio = concatenate_audioclips(audio_clips) 122 | 123 | # If the new audio is longer than the video, limit its duration to that of the video. 124 | if new_audio.duration > video.duration: 125 | new_audio = new_audio.subclip(0, video.duration) 126 | elif video.duration > new_audio.duration: 127 | video = video.subclip(0, new_audio.duration) 128 | 129 | # If the video also has audio, we will overlay the new audio onto the existing audio 130 | if existing_audio is not None: 131 | audio = CompositeAudioClip([existing_audio, new_audio]) 132 | else: 133 | audio = new_audio # If the video has no audio, just set the new audio as the video's audio 134 | 135 | final_clip = video.set_audio(audio) # Set the audio track of the video to the audio clip created above 136 | final_clip.write_videofile(output_file_path, audio_codec='aac') # Write the output 137 | 138 | def mindsflow_function(event, context) -> dict: 139 | video_url = event.get("video_url") 140 | audio_url = event.get("audio_url") 141 | volume = event.get("volume", 1.0) 142 | repeat_audio = event.get("repeat_audio", False) 143 | 144 | video_file_path = "temp_video.mp4" 145 | audio_file_path = "temp_audio.wav" 146 | random_str = get_random_string() 147 | output_file_path = f"output_{random_str}.mp4" 148 | 149 | # Step 1: download files 150 | download_file(video_url, video_file_path) 151 | download_file(audio_url, audio_file_path) 152 | 153 | # Step 2: add background music to video 154 | if volume > 0: 155 | add_background_music(video_file_path, audio_file_path, output_file_path, volume=volume, repeat_audio=repeat_audio) 156 | else: 157 | print('audio not added because specified volume was <= 0') 158 | os.rename(video_file_path, output_file_path) # keep the original video so the upload below still has a file to send 159 | # Step 3: upload file to S3 160 | url = upload_to_aws(output_file_path) 161 | 162 | result = { 163 | 'video_url': url 164 | } 165 | return result 166 | 167 | -------------------------------------------------------------------------------- /agent-video-generator/functions/addTextToImage.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "addTextToImage", 4 | "displayName": "", 5 | "description": "This Python method downloads an image from a provided URL, adds a given title to the image, uploads the modified image to an S3 bucket, and then returns the new image's URL.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "text", 10 | "image_url" 11 | ], 12 | "properties": { 13 | "text": { 14 | "type": "string", 15 | "description": "" 16 | }, 17 | "margin": { 18 | "type": "number", 19 | "description": "" 20 | }, 21 | "font_name": { 22 | "type": "string", 23 | "description": "" 24 | }, 25 | "font_size": { 26 | "type": "number", 27 | "description": "" 28 | }, 29 | "image_url": { 30 | "type": "string", 31 | "description": "URL of the image to be downloaded" 32 | }, 33 | "text_color": { 34 | "type": "string", 35 | "description": "" 36 | }, 37 | "caption_position": { 38 | "type": "string", 39 | "description": "" 40 | }, 41 | "text_border_size": { 42 | "type": "number", 43 | "description": "" 44 | }, 45 | "text_border_color": { 46 | "type": "string", 47 | "description": "" 48 | } 49 | } 50 | }, 51 | "outputPattern": { 52 | "type": 
"object", 53 | "required": [ 54 | "image_url" 55 | ], 56 | "properties": { 57 | "image_url": { 58 | "type": "string", 59 | "description": "The presigned URL for the image uploaded to the S3 bucket" 60 | } 61 | } 62 | }, 63 | "tag": "VideoGeneration", 64 | "testCases": [ 65 | { 66 | "text": "", 67 | "margin": 0, 68 | "font_name": "", 69 | "font_size": 0, 70 | "image_url": "", 71 | "text_color": "", 72 | "caption_position": "", 73 | "text_border_size": 0, 74 | "text_border_color": "" 75 | } 76 | ], 77 | "aiPrompt": "", 78 | "greeting": "" 79 | } 80 | $""" 81 | 82 | import json 83 | import cv2 84 | from moviepy.editor import VideoFileClip 85 | import boto3 86 | import os 87 | import time 88 | import random 89 | import string 90 | import requests 91 | import numpy as np 92 | from PIL import ImageFont, ImageDraw, Image 93 | 94 | font_url = 'https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/fonts/{}.ttf' 95 | 96 | def upload_to_aws(filename: str) -> str: 97 | # Uses your AWS credentials to access the service 98 | bucket_name = os.environ.get('bucket_name') 99 | region = os.environ.get('region') 100 | 101 | # Create a session using the provided credentials 102 | session = boto3.Session( 103 | aws_access_key_id=os.environ.get('access_key_id'), 104 | aws_secret_access_key=os.environ.get('secret_access_key') 105 | ) 106 | 107 | # Create an S3 client 108 | s3_client = session.client('s3') 109 | 110 | bucket_path = 'ai-video' 111 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 112 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 113 | url = f'{s3_base_url}{bucket_path}/{filename}' 114 | 115 | return url 116 | 117 | def download_file(url, save_path): 118 | response = requests.get(url) 119 | with open(save_path, 'wb') as file: 120 | file.write(response.content) 121 | 122 | def get_random_string(): 123 | letters = string.ascii_lowercase 124 | result_str = ''.join(random.choice(letters) for _ in range(6)) 125 | timestamp = int(time.time()) 126 | random_str = str(timestamp) + '_' + result_str 127 | return random_str 128 | 129 | # Define color dictionary for known colors 130 | color_dict = { 131 | 'black': (0, 0, 0), 132 | 'white': (255, 255, 255), 133 | 'red': (0, 0, 255), # Remember, in OpenCV it's BGR not RGB 134 | 'green': (0, 255, 0), 135 | 'blue': (255, 0, 0), 136 | 'yellow': (0, 255, 255), 137 | 'cyan': (255, 255, 0), 138 | 'magenta': (255, 0, 255), 139 | 'light gray': (211, 211, 211), 140 | 'dark gray': (169, 169, 169), 141 | 'pink': (147, 20, 255), 142 | 'purple': (128, 0, 128), 143 | 'orange': (0, 165, 255), 144 | 'brown': (42, 42, 165) 145 | } 146 | 147 | # Define the dictionary for known font types 148 | font_dict = { 149 | 'chinese': 'NotoSansSC', 150 | 'default': 'SourceSerif4', 151 | } 152 | 153 | def wrap_text(caption, frame_width, font): 154 | words = caption.split(' ') 155 | lines = [words.pop(0)] # Initial 156 | for word in words: 157 | box = font.getbbox(lines[-1] + ' ' + word) 158 | text_width, text_height = box[2] - box[0], box[3] - box[1] 159 | if text_width > frame_width: 160 | lines.append(word) 161 | else: 162 | lines[-1] += ' ' + word 163 | return lines 164 | 165 | def add_title_to_img(image_path, caption, outfile='out.jpg', border_size=2, border_color='black', text_color='white', 166 | font_size=30, font_type='DUPLEX', caption_position='bottom', margin=0.1, font_dir=''): 167 | # Load image 168 | img_pil = Image.open(image_path) 169 | draw = ImageDraw.Draw(img_pil) 170 | 171 | width, height = img_pil.size 172 | 
173 | # Get the specified font 174 | if font_type is None: 175 | font_type = 'default' 176 | if font_type in font_dict.keys(): 177 | font_type = font_dict[font_type] 178 | try: 179 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size) 180 | except: 181 | if not os.path.exists(font_dir): 182 | os.makedirs(font_dir) 183 | download_file(font_url.format(font_type), f'{os.path.join(font_dir, font_type)}.ttf') 184 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size) 185 | 186 | margin_rate = int(width * margin) 187 | 188 | lines = wrap_text(caption, width - 2 * margin_rate, font) 189 | for i, line in enumerate(lines): 190 | box = font.getbbox(line) 191 | text_width, text_height = box[2] - box[0], box[3] - box[1] 192 | text_height = font_size * 1.3 193 | 194 | # Center the text 195 | textX = (width - text_width - margin_rate * 2) // 2 + margin_rate 196 | total_lines = len(lines) 197 | total_text_height = total_lines * text_height # The total height of text block 198 | 199 | # Position text as per given caption_position 200 | if caption_position.lower() == 'top': 201 | textY = margin_rate + (i * text_height) 202 | elif caption_position.lower() == 'bottom': 203 | textY = height - margin_rate - (len(lines) - i) * text_height 204 | elif caption_position.lower() == 'threequarter': 205 | three_quarter_height = height * 0.75 206 | textY = three_quarter_height - ((total_lines - i) * text_height) 207 | elif caption_position.lower() == 'onequarter': 208 | one_quarter_height = height * 0.25 209 | textY = one_quarter_height + ((i + 1) * text_height) 210 | else: # Default to center if unknown value 211 | textY = ((height - total_text_height) // 2) + (i * text_height) 212 | 213 | # Draw the outline 214 | for k in range(-border_size, border_size + 1): 215 | for j in range(-border_size, border_size + 1): 216 | draw.text((textX + j, textY + k), line, font=font, fill=border_color) 217 | # Draw the text 218 | draw.text((textX, textY), line, font=font, fill=text_color) 219 | 220 | # save the image with caption 221 | img_pil.save(outfile) 222 | 223 | def mindsflow_function(event, context) -> dict: 224 | img_url = event.get("image_url") 225 | text = event.get("text") 226 | caption_position = event.get("caption_position", "bottom") 227 | border_color = event.get("text_border_color", "black") 228 | text_color = event.get("text_color", "white") 229 | font_size = event.get("font_size", 30) 230 | margin = event.get("margin", 0.1) 231 | font_type = event.get("font_name", 'default') 232 | border_size = event.get("text_border_size", 2) 233 | 234 | download_path = "img_" + get_random_string() + ".png" 235 | out_path = "img_" + get_random_string() + ".png" 236 | download_file(img_url, download_path) 237 | # add title to the image 238 | add_title_to_img(download_path, 239 | text, 240 | outfile=out_path, 241 | caption_position=caption_position, 242 | border_color=border_color, 243 | text_color=text_color, 244 | font_size=font_size, 245 | margin=margin, 246 | font_type=font_type, 247 | border_size=border_size, 248 | font_dir = os.environ.get('font_dir') 249 | ) 250 | # upload the image to s3 and get the url 251 | url = upload_to_aws(out_path) 252 | 253 | # define result 254 | result = { 255 | 'image_url': url 256 | } 257 | 258 | os.remove(download_path) 259 | os.remove(out_path) 260 | 261 | return result 262 | 263 | -------------------------------------------------------------------------------- /agent-video-generator/functions/cloneVoiceValleX.py: 
-------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "cloneVoiceValleX", 4 | "displayName": "", 5 | "description": "This Python method downloads a wav file, replicates the voice, generates speech from provided text (potentially in a different language), uploads the new file to AWS, and returns the URL.\n\n- The input wav file of the speaker to be cloned MUST be < 15s\n- For now only English to Chinese is supported", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "text" 10 | ], 11 | "properties": { 12 | "text": { 13 | "type": "string", 14 | "description": "Text from which to generate the new voice" 15 | }, 16 | "audio_url": { 17 | "type": "string", 18 | "description": "The url for the original audio file that needs to be processed" 19 | }, 20 | "transcript": { 21 | "type": "string", 22 | "description": "" 23 | }, 24 | "character_name": { 25 | "type": "string", 26 | "description": "Name of the character (optional)" 27 | } 28 | } 29 | }, 30 | "outputPattern": { 31 | "type": "object", 32 | "required": [ 33 | "audio_url" 34 | ], 35 | "properties": { 36 | "audio_url": { 37 | "type": "string", 38 | "description": "The URL of the audio file uploaded to AWS" 39 | } 40 | } 41 | }, 42 | "tag": "VoiceCloning", 43 | "testCases": [ 44 | { 45 | "text": "今天阳光明媚,温度很适宜,所以我打算去附近的公园漫步、欣赏风景、放松心情", 46 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/voice/davide1_split.wav", 47 | "transcript": "", 48 | "character_name": "tony_stark" 49 | }, 50 | { 51 | "text": "I think I is going to rule the earth one day, but fortunately this day is still very far.", 52 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav", 53 | "transcript": "", 54 | "character_name": "tony_stark" 55 | }, 56 | { 57 | "text": "I think I is going to rule the earth one day, 但是那天还没到", 58 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav", 59 | "transcript": "", 60 | "character_name": "tony_stark" 61 | } 62 | ], 63 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. 
Upload the new generated wav file to aws and return the url", 64 | "greeting": "" 65 | } 66 | $""" 67 | 68 | import os 69 | import json 70 | import boto3 71 | import requests 72 | import random 73 | import string 74 | 75 | s3 = boto3.resource('s3') 76 | 77 | def download_file(url: str, save_path: str): 78 | resp = requests.get(url) 79 | with open(save_path, 'wb') as f: 80 | f.write(resp.content) 81 | 82 | 83 | def generate_random_string(length): 84 | letters = string.ascii_letters 85 | result_str = ''.join(random.choice(letters) for i in range(length)) 86 | return result_str 87 | 88 | 89 | def upload_to_aws(filename: str) -> str: 90 | bucket_name = os.environ.get('bucket_name') 91 | region = os.environ.get('region') 92 | session = boto3.Session( 93 | aws_access_key_id=os.environ.get('access_key_id'), 94 | aws_secret_access_key=os.environ.get('secret_access_key') 95 | ) 96 | s3_client = session.client('s3') 97 | bucket_path = 'voice-clone' 98 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 99 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 100 | url = f'{s3_base_url}{bucket_path}/{filename}' 101 | return url 102 | 103 | 104 | def mindsflow_function(event, context) -> dict: 105 | # get from event 106 | audio_url = event.get('audio_url', None) 107 | text = event.get('text') 108 | character_name = event.get('character_name', None) 109 | transcript = event.get('transcript', None) 110 | api_ip = os.environ.get('api_ip') 111 | 112 | if character_name is None or len(character_name) == 0: 113 | character_name = 'temp'+generate_random_string(10) 114 | 115 | if transcript is not None and len(transcript) == 0: 116 | transcript = None 117 | 118 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/" 119 | 120 | data = { 121 | "audio_url": audio_url, 122 | "character_name": character_name, 123 | "transcript": transcript 124 | } 125 | 126 | headers = { 127 | 'Content-Type': 'application/json' 128 | } 129 | 130 | print('Cloning voice...') 131 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers) 132 | if response.status_code != 200: 133 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}') 134 | print('Voice cloned') 135 | 136 | voice_gen_url = f"http://{api_ip}:5001/generate_audio/" 137 | 138 | data = { 139 | 'character_name': character_name, 140 | 'text': text 141 | } 142 | 143 | print('Generating new voice...') 144 | response = requests.post(voice_gen_url, json=data) 145 | if response.status_code != 200: 146 | raise RuntimeError(f'Voice generation failed with status code: {response.status_code}') 147 | print('New voice generated') 148 | 149 | audio_path = audio_url.split('/')[-1] 150 | # Save the file to the directory 151 | with open(audio_path, 'wb') as file: 152 | file.write(response.content) 153 | 154 | result_url = upload_to_aws(audio_path) 155 | 156 | # clean up 157 | os.remove(audio_path) 158 | 159 | return { 160 | "audio_url": result_url 161 | } 162 | 163 | -------------------------------------------------------------------------------- /agent-video-generator/functions/cloneVoiceVits.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "cloneVoiceVits", 4 | "displayName": "", 5 | "description": "The Python method is designed to download a WAV file from a specified URL, clone the voice from the file, and generate new speech from a supplied text (potentially in another language). 
The newly created WAV file is then uploaded to AWS and its URL is returned.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "dataset_url" 10 | ], 11 | "properties": { 12 | "voice": { 13 | "type": "string", 14 | "description": "Name of the cloned voice" 15 | }, 16 | "audio_split": { 17 | "type": "integer", 18 | "description": "" 19 | }, 20 | "clean_noise": { 21 | "type": "boolean", 22 | "description": "Denoise the audio before training" 23 | }, 24 | "dataset_url": { 25 | "type": "string", 26 | "description": "The url of the audio dataset with the voice to be cloned" 27 | }, 28 | "train_config": { 29 | "type": "string", 30 | "description": "Name of the training configuration to use" 31 | } 32 | } 33 | }, 34 | "outputPattern": { 35 | "type": "object", 36 | "required": [ 37 | "voice", 38 | "succeeded" 39 | ], 40 | "properties": { 41 | "voice": { 42 | "type": "string", 43 | "description": "" 44 | }, 45 | "succeeded": { 46 | "type": "boolean", 47 | "description": "" 48 | } 49 | } 50 | }, 51 | "tag": "VoiceCloning", 52 | "testCases": [ 53 | { 54 | "voice": "", 55 | "audio_split": 12, 56 | "clean_noise": true, 57 | "dataset_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/chinese_poadcast_woman1.zip", 58 | "train_config": "config_1000" 59 | }, 60 | { 61 | "voice": "", 62 | "audio_split": 0, 63 | "clean_noise": false, 64 | "dataset_url": "", 65 | "train_config": "" 66 | } 67 | ], 68 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. Upload the new generated wav file to aws and return the url", 69 | "greeting": "" 70 | } 71 | $""" 72 | 73 | import os 74 | import json 75 | import boto3 76 | import requests 77 | import random 78 | import string 79 | 80 | default_train_config = 'config_1000' 81 | 82 | def mindsflow_function(event, context) -> dict: 83 | # get from event 84 | dataset_url = event.get('dataset_url') 85 | config = event.get('train_config', default_train_config) 86 | split = event.get('audio_split', 12) 87 | clean_noise = event.get('clean_noise', False) 88 | voice = event.get('voice', None) 89 | api_ip = os.environ.get('api_ip') 90 | 91 | if config is None or len(config) == 0: 92 | config = default_train_config 93 | if voice is not None and len(voice) == 0: 94 | voice = None 95 | 96 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/" 97 | 98 | data = { 99 | "dataset_url": dataset_url, 100 | "config": config, 101 | "split": split, 102 | "clean_noise": clean_noise 103 | } 104 | 105 | headers = { 106 | 'Content-Type': 'application/json' 107 | } 108 | 109 | print('Cloning voice...') 110 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers) 111 | if response.status_code != 200: 112 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}') 113 | print('Voice cloned') 114 | 115 | response_dict = response.json() 116 | 117 | return { 118 | "succeeded": response_dict["succeeded"], 119 | "voice": response_dict["voice"] if voice is None else voice 120 | } 121 | 122 | -------------------------------------------------------------------------------- /agent-video-generator/functions/deleteFilesByExtension.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "deleteFilesByExtension", 4 | "displayName": "", 5 | "description": "This method is used for deleting all files within a directory, with an optional filter for specific file extensions.", 
6 | "inputPattern": { 7 | "type": "object", 8 | "required": [], 9 | "properties": {} 10 | }, 11 | "outputPattern": { 12 | "type": "object", 13 | "required": [ 14 | "status" 15 | ], 16 | "properties": { 17 | "status": { 18 | "type": "string", 19 | "description": "Indicates whether the operation was successful" 20 | } 21 | } 22 | }, 23 | "tag": "FileDeletion", 24 | "testCases": [ 25 | {}, 26 | {} 27 | ], 28 | "aiPrompt": "delete all files in dir, can filter by extension", 29 | "greeting": "" 30 | } 31 | $""" 32 | 33 | import json 34 | import os 35 | import glob 36 | 37 | def process_files(dir_path: str, file_type: str) -> list: 38 | # construct the path with file type 39 | file_paths = glob.glob(os.path.join(dir_path, '*.' + file_type)) 40 | 41 | 42 | # read and print file content 43 | for file_path in file_paths: 44 | print(file_path) 45 | os.remove(file_path) 46 | 47 | def mindsflow_function(event, context) -> dict: 48 | # get the directory path and file type from the event 49 | dir_path = '' 50 | file_type = ['wav', 'mp4', 'json', 'html', 'log', 'zip', 'srt', 'mp3', 'jpg', 'ass'] 51 | 52 | for ext in file_type: 53 | # process the files and get the content 54 | process_files(dir_path, ext) 55 | 56 | # define result 57 | result = { 58 | 'status': 'ok' 59 | } 60 | 61 | return result 62 | 63 | -------------------------------------------------------------------------------- /agent-video-generator/functions/deleteFolders.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "deleteFolders", 4 | "displayName": "", 5 | "description": "delete all folders with exceptions", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [], 9 | "properties": {} 10 | }, 11 | "outputPattern": { 12 | "type": "object", 13 | "required": [ 14 | "status" 15 | ], 16 | "properties": { 17 | "status": { 18 | "type": "string", 19 | "description": "Indicates whether the operation was successful" 20 | } 21 | } 22 | }, 23 | "tag": "FileDeletion", 24 | "testCases": [ 25 | {}, 26 | {} 27 | ], 28 | "aiPrompt": "", 29 | "greeting": "" 30 | } 31 | $""" 32 | 33 | import json 34 | import os 35 | import glob 36 | import shutil 37 | 38 | exclude_list = [os.getenv('font_dir')] # define your exclude list 39 | 40 | def process_files(dir_path: str) -> list: 41 | # list all the subdirectories 42 | dir_paths = [d for d in glob.glob(os.path.join(dir_path, '*')) if os.path.isdir(d)] 43 | 44 | for dir_path in dir_paths: 45 | 46 | folder_name = os.path.basename(dir_path) 47 | # only delete the folder if it's not in the exclude list 48 | if folder_name not in exclude_list: 49 | 50 | # delete the folder 51 | shutil.rmtree(dir_path) 52 | print(f'Deleted: {dir_path}') 53 | 54 | def mindsflow_function(event, context) -> dict: 55 | # get the directory path from the event 56 | dir_path = '' 57 | 58 | # process the directories and delete them 59 | process_files(dir_path) 60 | 61 | # define result 62 | result = { 63 | 'status': 'ok' 64 | } 65 | 66 | return result 67 | 68 | -------------------------------------------------------------------------------- /agent-video-generator/functions/extractVideoAudioComponents.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "extractVideoAudioComponents", 4 | "displayName": "", 5 | "description": "This method is designed to download a YouTube video, extract its audio, and upload the video without audio and the extracted audio to an S3 server, returning the respective URLs. 
It is presented in a way that allows for future adaptation to other platforms.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "video_url" 10 | ], 11 | "properties": { 12 | "video_url": { 13 | "type": "string", 14 | "description": "The URL of the video to be downloaded and split" 15 | } 16 | } 17 | }, 18 | "outputPattern": { 19 | "type": "object", 20 | "required": [ 21 | "audio_url", 22 | "video_url" 23 | ], 24 | "properties": { 25 | "audio_url": { 26 | "type": "string", 27 | "description": "The url for the downloaded audio file" 28 | }, 29 | "video_url": { 30 | "type": "string", 31 | "description": "The url for the downloaded video file without audio" 32 | }, 33 | "original_video_url": { 34 | "type": "string", 35 | "description": "" 36 | } 37 | } 38 | }, 39 | "tag": "DataPreprocessing", 40 | "testCases": [ 41 | { 42 | "video_url": "https://www.youtube.com/watch?app=desktop&v=Lv06Razi3Y4" 43 | }, 44 | { 45 | "video_url": "https://www.bilibili.com/video/BV14d4y1U7iG/" 46 | }, 47 | { 48 | "video_url": "https://www.instagram.com/reel/Cx43zhAvdwL/" 49 | }, 50 | { 51 | "video_url": "https://www.tiktok.com/@tedtoks/video/7304757623600057631" 52 | } 53 | ], 54 | "aiPrompt": "Given the URL of a video youtube, download it, extract the audio. Upload the video without audio and the audio to S3 and return the corresponding URLs. Make the code such the download can be generalized to other platforms in the future", 55 | "greeting": "" 56 | } 57 | $""" 58 | 59 | import json 60 | from pytube import YouTube 61 | from moviepy.editor import * 62 | import boto3 63 | import uuid 64 | import os 65 | from pydub import AudioSegment 66 | import youtube_dl 67 | import requests 68 | import instaloader 69 | from urllib.parse import urlparse 70 | 71 | def extract_reel_id(url): 72 | path = urlparse(url).path 73 | segments = path.split('/') 74 | if "reel" in segments: 75 | reel_index = segments.index("reel") 76 | if reel_index+1 < len(segments): 77 | return segments[reel_index+1] 78 | return None 79 | 80 | s3_client = boto3.client('s3') 81 | 82 | def upload_to_aws(filename: str) -> str: 83 | # Uses your AWS credentials to access the service 84 | bucket_name = os.environ.get('bucket_name') 85 | region = os.environ.get('region') 86 | # Create a session using the provided credentials 87 | session = boto3.Session( 88 | aws_access_key_id=os.environ.get('access_key_id'), 89 | aws_secret_access_key=os.environ.get('secret_access_key') 90 | ) 91 | # Create an S3 client 92 | s3_client = session.client('s3') 93 | bucket_path = 'ai-video' 94 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 95 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 96 | url = f'{s3_base_url}{bucket_path}/{filename}' 97 | return url 98 | 99 | ydl_opts = { 100 | 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best', 101 | 'postprocessors': [{ 102 | 'key': 'FFmpegVideoConvertor', 103 | 'preferedformat': 'mp4', 104 | }], 105 | } 106 | 107 | def download_and_split_video(url, download_path=""): 108 | if 'youtube.com' in url: 109 | yt = YouTube(url) 110 | try: 111 | print('try download 720p') 112 | video = yt.streams.get_by_resolution('720p').download(download_path) 113 | except: 114 | print('download failed') 115 | video = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(download_path) 116 | elif 'www.bilibili.com' in url: 117 | with youtube_dl.YoutubeDL(ydl_opts) as ydl: 118 | info_dict = ydl.extract_info(url, 
download=True) 119 | video_title = ydl.prepare_filename(info_dict) 120 | video = os.path.join(download_path, video_title.replace('flv', 'mp4')) 121 | elif 'www.tiktok.com' in url: 122 | # pip install yt-dlp 123 | video_name = url.split('/')[-1] 124 | video = f"tiktok_video_{video_name}.mp4" 125 | os.system("yt-dlp {} -o {}".format(url, video)) 126 | elif 'www.instagram.com' in url: # currently not working 127 | reel_id = extract_reel_id(url) 128 | L = instaloader.Instaloader() 129 | post = instaloader.Post.from_shortcode(L.context, reel_id) 130 | video_url = post.video_url 131 | video_name = f'ins_reel_{reel_id}' 132 | video = video_name + '.mp4' 133 | from datetime import datetime 134 | L.download_pic(filename=video_name, url=video_url, mtime=datetime.now()) 135 | else: 136 | response = requests.get(url) 137 | video = os.path.join(download_path, url.split('/')[-1]) 138 | with open(video, 'wb') as file: 139 | file.write(response.content) 140 | 141 | video_clip = VideoFileClip(video) 142 | audio = video_clip.audio 143 | video_without_audio = video_clip.without_audio() 144 | audio_file = os.path.join(download_path, f'{str(uuid.uuid4())}_audio') 145 | 146 | # Save audio to wav 147 | audio.write_audiofile(audio_file + ".wav") 148 | 149 | # Save the video file without audio 150 | video_file = os.path.join(download_path, f'{str(uuid.uuid4())}_video_no_audio.mp4') 151 | video_without_audio.write_videofile(video_file, audio=False) 152 | 153 | return audio_file + ".wav", video_file, video 154 | 155 | def mindsflow_function(event, context) -> dict: 156 | url = event.get("video_url") 157 | audio_file, video_file, original_video = download_and_split_video(url) 158 | audio_url = upload_to_aws(audio_file) 159 | video_url = upload_to_aws(video_file) 160 | original_video_url = upload_to_aws(original_video) 161 | os.remove(original_video) 162 | os.remove(audio_file) 163 | os.remove(video_file) 164 | result = { 165 | 'audio_url': audio_url, 166 | 'video_url': video_url, 167 | 'original_video_url': original_video_url 168 | } 169 | return result 170 | 171 | -------------------------------------------------------------------------------- /agent-video-generator/functions/generateAudioSegmentsFromJson.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "generateAudioSegmentsFromJson", 4 | "displayName": "", 5 | "description": "Generate audio from json captions.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "json_url", 10 | "target_lang" 11 | ], 12 | "properties": { 13 | "voice": { 14 | "type": "string", 15 | "description": "" 16 | }, 17 | "json_url": { 18 | "type": "string", 19 | "description": "URL of the JSON file containing captions" 20 | }, 21 | "target_lang": { 22 | "type": "string", 23 | "description": "The language into which the captions should be translated" 24 | }, 25 | "enhance_sync": { 26 | "type": "boolean", 27 | "description": "" 28 | }, 29 | "max_speech_rate": { 30 | "type": "number", 31 | "description": "" 32 | }, 33 | "min_speech_rate": { 34 | "type": "number", 35 | "description": "" 36 | }, 37 | "summarize_long_sentences": { 38 | "type": "boolean", 39 | "description": "" 40 | } 41 | } 42 | }, 43 | "outputPattern": { 44 | "type": "object", 45 | "required": [ 46 | "audio_folder" 47 | ], 48 | "properties": { 49 | "audio_folder": { 50 | "type": "string", 51 | "description": "" 52 | } 53 | } 54 | }, 55 | "tag": "TextToSpeech", 56 | "testCases": [ 57 | { 58 | "voice": "zh-CN-male", 59 | "json_url": 
"https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedVceeOp.json", 60 | "target_lang": "zh", 61 | "enhance_sync": false, 62 | "max_speech_rate": 1.5, 63 | "min_speech_rate": 0.5, 64 | "summarize_long_sentences": false 65 | }, 66 | { 67 | "voice": "it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)", 68 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAOjUGH.json", 69 | "target_lang": "it", 70 | "enhance_sync": false, 71 | "max_speech_rate": 0, 72 | "min_speech_rate": 0, 73 | "summarize_long_sentences": false 74 | }, 75 | { 76 | "voice": "zh-CN-YunfengNeural", 77 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedDLYYSi.json", 78 | "target_lang": "zh", 79 | "enhance_sync": true, 80 | "max_speech_rate": 0, 81 | "min_speech_rate": 0, 82 | "summarize_long_sentences": false 83 | } 84 | ], 85 | "aiPrompt": "Is given the URL of a json file containing a set of captions and their start and duration in a video. download the file and read the content. Translate each sentence into a target language. Then generate the audio of each translated sentence. Is also given the URl of the video. download it and add each audio segment back to the video according to its start time", 86 | "greeting": "" 87 | } 88 | $""" 89 | 90 | import json 91 | import os 92 | import boto3 93 | import requests 94 | import azure.cognitiveservices.speech as speechsdk 95 | from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat 96 | import langid 97 | langid.set_languages(['en', 'zh', 'ja']) 98 | import shutil 99 | import random 100 | import string 101 | import pydub 102 | 103 | 104 | time_unit = 10000000 105 | 106 | 107 | def download_file(url, filename): 108 | res = requests.get(url) 109 | with open(filename, "wb") as f: 110 | f.write(res.content) 111 | 112 | 113 | def upload_to_aws(filename: str) -> str: 114 | bucket_name = os.environ.get('bucket_name') 115 | region = os.environ.get('region') 116 | session = boto3.Session( 117 | aws_access_key_id=os.environ.get('access_key_id'), 118 | aws_secret_access_key=os.environ.get('secret_access_key') 119 | ) 120 | s3_client = session.client('s3') 121 | bucket_path = 'temp_audio' 122 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 123 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 124 | url = f'{s3_base_url}{bucket_path}/{filename}' 125 | return url 126 | 127 | 128 | def get_captions_from_url(url): 129 | filename = f"{url.split('/')[-1]}" 130 | # download the json file 131 | download_file(url, filename) 132 | # read the contents 133 | with open(filename, 'r', encoding='utf-8') as f: 134 | captions = json.load(f) 135 | return captions, filename 136 | 137 | 138 | def calculate_element_count(text): 139 | chinese_punctuations = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 
140 | text = text.translate(str.maketrans('', '', string.punctuation + chinese_punctuations)) 141 | # Language-specific counting (e.g. Chinese and Japanese are counted by characters rather than words) 142 | if langid.classify(text)[0] in ['zh', 'ja']: 143 | return len(text.replace(' ', '')) # Spaces are not typically considered in character count 144 | else: 145 | return len(text.split()) 146 | 147 | 148 | def calculate_speech_rate(text, duration): 149 | element_count = calculate_element_count(text) 150 | #print('Element count:', element_count) 151 | #print('Duration:', duration) 152 | duration_in_seconds = float(duration) / float(time_unit) 153 | #print('Duration in seconds', duration_in_seconds) 154 | speech_rate = element_count / float(duration_in_seconds) * 60. 155 | return speech_rate, element_count 156 | 157 | llm_prompt = 'Shorten the input text. The output must have fewer words than the input. Keep the original language.\n INPUT: {}.\n OUTPUT:' 158 | def summarize_text(input_str: str, event) -> str: 159 | data = { 160 | "style": "LLM-Only", 161 | "stream": False, 162 | "messageContent": input_str, 163 | "agentId": 964 164 | } 165 | resp = event.chat.messages(data=data) 166 | return resp 167 | 168 | lang_dict = { 169 | 'en': 'en-US', 170 | 'zh': 'zh-CN', 171 | 'ch': 'zh-CN', 172 | 'de': 'de-DE', 173 | 'ge': 'de-DE', 174 | 'it': 'it-IT', 175 | 'fr': 'fr-FR', 176 | 'sp': 'es-ES', 177 | 'es': 'es-ES', 178 | } 179 | 180 | speaker_dict = { 181 | 'en-US': 'Microsoft Server Speech Text to Speech Voice (en-US, Jessa24kRUS)', 182 | 'zh-CN': 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)', 183 | 'it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)', 184 | 'de-DE': 'Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)', 185 | 'fr-FR': 'Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)', 186 | 'es-ES': 'Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)', 187 | 188 | 'zh-CN-male': 'zh-CN-YunfengNeural', 189 | 'zh-CN-female': 'zh-CN-XiaomengNeural', 190 | } 191 | 192 | speech_rate_dict = { 193 | 'en-US': 150, 194 | 'zh-CN': 400, 195 | } 196 | 197 | 198 | def generate_audio(captions, lang: str = 'en', translation_folder: str = 'translation_folder', enhance_sync: bool = True, event = None, voice=None, summarize_long_sentences=False, min_speech_rate=0.5, max_speech_rate=1.5): 199 | if lang in lang_dict.keys(): 200 | lang = lang_dict[lang] 201 | if voice is not None and ('male' in voice or 'female' in voice): # guard against voice=None before the substring checks 202 | speaker = speaker_dict[voice] 203 | elif voice is not None and lang in voice: 204 | speaker = voice 205 | else: 206 | speaker = speaker_dict[lang] 207 | print('Using speaker:', speaker) 208 | 209 | filename = '{}/audio_segment_{}.wav' 210 | speech_key = os.environ.get('azure_key') 211 | service_region = os.environ.get('azure_region') 212 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) 213 | 214 | tot_error = [] 215 | for i, cap in enumerate(captions): 216 | temp_filename = filename.format(translation_folder, str(i+1)) 217 | audio_output = speechsdk.audio.AudioOutputConfig(filename=temp_filename) 218 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output) 219 | text = cap['translation'] 220 | duration = cap['duration'] 221 | original_text = cap['sentence'] 222 | #voice_speed = (speech_rate / ai_speech_rate) 223 | print(i+1, text, original_text) 224 | voice_speed = 1. 
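# Wrap the sentence in the Azure SSML envelope below: a <speak>/<voice> pair,
# plus a <prosody> tag when a non-default speed is requested, with the rate
# expressed as a percentage offset from normal speed (e.g. 1.2x -> '20%').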
225 | #break 226 | if voice_speed != 1.0 and voice_speed != 1: 227 | voice_speed = int(voice_speed * 100.0 - 100.0) 228 | text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'><prosody rate='{voice_speed}%'>" + text + "</prosody></voice></speak>" 229 | else: 230 | text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'>" + text + "</voice></speak>" 231 | 232 | result = speech_synthesizer.speak_ssml_async(text).get() 233 | stream = AudioDataStream(result) 234 | stream.save_to_wav_file(temp_filename) 235 | 236 | # Get the duration of the audio file 237 | audio = pydub.AudioSegment.from_file(temp_filename) 238 | duration = audio.duration_seconds 239 | 240 | speech_rate_min, speech_rate_max = min_speech_rate, max_speech_rate 241 | if enhance_sync: 242 | text = cap['translation'] 243 | dur_diff_rate = duration / (cap['duration'] / time_unit) 244 | print('Duration diff rate', dur_diff_rate) 245 | if summarize_long_sentences is True and dur_diff_rate > speech_rate_max and len(text) >= 3: # when translated audio is too long 246 | prev_text = text 247 | text = summarize_text(llm_prompt.format(text), event) 248 | print(f"Translated text is too long: {cap['duration']}s vs {duration}s. Rewording: {prev_text} -> {text} ") 249 | prev_duration = duration 250 | err = abs(duration - cap['duration'] / time_unit) 251 | print('Before sync', prev_duration, cap['duration'] / time_unit, err) 252 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) 253 | temp_filename = filename.format(translation_folder, str(i+1)) 254 | audio_output = speechsdk.audio.AudioOutputConfig(filename=temp_filename) 255 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output) 256 | voice_speed = duration / (cap['duration'] / time_unit) 257 | min_speed, max_speed = speech_rate_min, speech_rate_max 258 | voice_speed = min(max_speed, max(min_speed, voice_speed)) 259 | voice_speed = int(voice_speed * 100.0 - 100.0) 260 | text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'><prosody rate='{voice_speed}%'>" + text + "</prosody></voice></speak>" 261 | result = speech_synthesizer.speak_ssml_async(text).get() 262 | stream = AudioDataStream(result) 263 | stream.save_to_wav_file(temp_filename) 264 | # Get the duration of the audio file 265 | audio = pydub.AudioSegment.from_file(temp_filename) 266 | duration = audio.duration_seconds 267 | 268 | err = abs(duration - cap['duration'] / time_unit) 269 | print('After sync', duration, cap['duration'] / time_unit, err) 270 | tot_error.append(err) 271 | print('Total mismatch:', sum(tot_error) / len(tot_error)) 272 | 273 | return filename 274 | 275 | 276 | def mindsflow_function(event, context) -> dict: 277 | json_url = event.get("json_url") 278 | target_language = event.get("target_lang") 279 | enhance_sync = event.get("enhance_sync", False) 280 | summarize_long_sentences = event.get("summarize_long_sentences", None) 281 | voice = event.get("voice", None) 282 | min_speech_rate = event.get("min_speech_rate", 0.5) 283 | max_speech_rate = event.get("max_speech_rate", 1.5) 284 | 285 | if voice is not None and voice.lower() in ['none']: 286 | voice = None 287 | 288 | audio_folder = 'audio_folder_' + ''.join(random.choice(string.ascii_letters) for _ in range(6)) # use a static name here when debugging 289 | if os.path.exists(audio_folder): 290 | shutil.rmtree(audio_folder) 291 | os.makedirs(audio_folder) 292 | 293 | # download and read the captions from the json file 294 | captions, _ = get_captions_from_url(json_url) 295 | # generate audios from the translated captions 296 | generate_audio(captions, target_language, audio_folder, enhance_sync, event, voice, summarize_long_sentences, min_speech_rate, max_speech_rate) 297 | 298 | return {'audio_folder': audio_folder} 299 | 
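# Minimal local sketch of how this function is driven. The event is assumed
# to be a plain dict standing in for the Mindsflow runtime event (the real
# one also exposes event.chat for the summarization path), the JSON URL is
# the first test case above, and the Azure credentials are assumed to be set
# in the environment:
#
#   event = {
#       "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedVceeOp.json",
#       "target_lang": "zh",
#       "voice": "zh-CN-male",
#   }
#   print(mindsflow_function(event, None))  # -> {'audio_folder': 'audio_folder_XXXXXX'}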
-------------------------------------------------------------------------------- /agent-video-generator/functions/generateSrtFromJson.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "generateSrtFromJson", 4 | "displayName": "", 5 | "description": "This Python method downloads a JSON file from a given URL which contains captions with their respective start, end, and duration time. It processes this data, generates a subtitle (SRT) file, and subsequently uploads it to S3 storage.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "sentences_json_url" 10 | ], 11 | "properties": { 12 | "min_words_sentence": { 13 | "type": "integer", 14 | "description": "" 15 | }, 16 | "sentences_json_url": { 17 | "type": "string", 18 | "description": "URL of the JSON file containing the subtitles to be downloaded" 19 | } 20 | } 21 | }, 22 | "outputPattern": { 23 | "type": "object", 24 | "required": [ 25 | "srt_url" 26 | ], 27 | "properties": { 28 | "srt_url": { 29 | "type": "string", 30 | "description": "The URL of the generated SRT file uploaded to S3" 31 | } 32 | } 33 | }, 34 | "tag": "VideoCaptions", 35 | "testCases": [ 36 | { 37 | "min_words_sentence": 5, 38 | "sentences_json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1703164123_oewslyvu.json" 39 | }, 40 | { 41 | "min_words_sentence": 0, 42 | "sentences_json_url": "" 43 | } 44 | ], 45 | "aiPrompt": "Given the url of a json, download it. It contains some captions with their start, end and duration. The json is a list in this format\nsentence: \"今日话题做题速度太慢 怎么办?\"\nstart_time: 6000000\nend_time: 35500000\nduration: 29500000\nfrom it generate a srt file containing subtitles and upload it to s3", 46 | "greeting": "" 47 | } 48 | $""" 49 | 50 | import json 51 | import requests 52 | from typing import Dict, List 53 | import boto3 54 | from datetime import timedelta 55 | import random 56 | import os 57 | import string 58 | 59 | 60 | def download_json(url: str) -> List[Dict[str, int]]: 61 | response = requests.get(url) 62 | data = response.json() 63 | return data 64 | 65 | def upload_to_aws(filename: str, bucket_path = None) -> str: 66 | bucket_name = os.environ.get('bucket_name') 67 | region = os.environ.get('region') 68 | session = boto3.Session( 69 | aws_access_key_id=os.environ.get('access_key_id'), 70 | aws_secret_access_key=os.environ.get('secret_access_key') 71 | ) 72 | s3_client = session.client('s3') 73 | if bucket_path is None: 74 | bucket_path = 'ai-video' 75 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 76 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 77 | url = f'{s3_base_url}{bucket_path}/{filename}' 78 | return url 79 | 80 | def deciseconds_to_time_format(ds: int) -> str: 81 | ms = int(ds / 10000) # converting 100-nanosecond ticks to milliseconds 82 | seconds, milliseconds = divmod(ms, 1000) 83 | minutes, seconds = divmod(seconds, 60) 84 | hours, minutes = divmod(minutes, 60) 85 | time_string = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}" 86 | return time_string 87 | 88 | 89 | punctuation = '。,、?!;:“”‘’【】()《》「」.,?!;:(){}[]<>' 90 | strong_punctuation = ['.', '?', '!', '。', '?', '!'] 91 | def generate_srt(subtitles: List[Dict[str, int]], min_length: int) -> str: 92 | srt_string = "" 93 | index = 1 94 | while subtitles: 95 | # Pop the first subtitle off the list 96 | subtitle = subtitles.pop(0) 97 | # Store the start and end time 98 | start_time = 
deciseconds_to_time_format(subtitle["start_time"]) 99 | end_time = deciseconds_to_time_format(subtitle["end_time"]) 100 | # Combine the sentences until the length is at least min_length 101 | combined_sentence = subtitle['sentence'] 102 | while len(combined_sentence.split()) < min_length and subtitles: 103 | if combined_sentence.replace(' ', '')[-1] in strong_punctuation: 104 | break 105 | next_subtitle = subtitles.pop(0) 106 | end_time = deciseconds_to_time_format(next_subtitle["end_time"]) # update end time 107 | combined_sentence += ' ' + next_subtitle['sentence'] 108 | # Remove trailing punctuation 109 | while combined_sentence[-1] in punctuation: 110 | combined_sentence = combined_sentence[:-1] 111 | # Add to the SRT string 112 | srt_string += f"{index}\n{start_time} --> {end_time}\n{combined_sentence}\n\n" 113 | index += 1 114 | return srt_string 115 | 116 | 117 | def mindsflow_function(event, context) -> dict: 118 | # get the s3 bucket, file_name, and url from the event 119 | url = event.get("sentences_json_url") 120 | min_words_sentence = event.get("min_words_sentence", 5) 121 | 122 | # download the json from the url 123 | subtitles_json = download_json(url) 124 | 125 | # generate the srt from the json 126 | srt_data = generate_srt(subtitles_json, min_words_sentence) 127 | 128 | file_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=6)) 129 | file_name_srt = file_name + '.srt' 130 | with open(file_name_srt, 'w') as file: 131 | file.write(srt_data ) 132 | srt_url = upload_to_aws(file_name_srt) 133 | os.remove(file_name_srt) 134 | 135 | print(srt_data) 136 | 137 | # define result 138 | result = { 139 | 'srt_url': srt_url, 140 | } 141 | 142 | return result 143 | 144 | -------------------------------------------------------------------------------- /agent-video-generator/functions/generateVideoScript.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "generateVideoScript", 4 | "displayName": "", 5 | "description": "generate video script", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "topic" 10 | ], 11 | "properties": { 12 | "topic": { 13 | "type": "string", 14 | "description": "" 15 | }, 16 | "text_style": { 17 | "type": "string", 18 | "description": "" 19 | } 20 | } 21 | }, 22 | "outputPattern": { 23 | "type": "object", 24 | "required": [ 25 | "json_string" 26 | ], 27 | "properties": { 28 | "json_string": { 29 | "type": "string", 30 | "description": "" 31 | } 32 | } 33 | }, 34 | "tag": "VideoGeneration", 35 | "testCases": [ 36 | { 37 | "topic": "Benefits of eating mango", 38 | "text_style": "scientific, straight to the point, easy to read" 39 | }, 40 | { 41 | "topic": "Story of two brothers, sci-fi", 42 | "text_style": "" 43 | }, 44 | { 45 | "topic": "story, sci-fi, epic", 46 | "text_style": "" 47 | } 48 | ], 49 | "aiPrompt": "", 50 | "greeting": "" 51 | } 52 | $""" 53 | 54 | import json 55 | import boto3 56 | import os 57 | import uuid 58 | 59 | 60 | def upload_to_aws(filename: str) -> str: 61 | # Uses your AWS credentials to access the service 62 | bucket_name = os.environ.get('bucket_name') 63 | region = os.environ.get('region') 64 | # Create a session using the provided credentials 65 | session = boto3.Session( 66 | aws_access_key_id=os.environ.get('access_key_id'), 67 | aws_secret_access_key=os.environ.get('secret_access_key') 68 | ) 69 | # Create an S3 client 70 | s3_client = session.client('s3') 71 | bucket_path = 'ai-video' 72 | s3_client.upload_file(f"{filename}", 
bucket_name, f"{bucket_path}/{filename}") 73 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 74 | url = f'{s3_base_url}{bucket_path}/{filename}' 75 | return url 76 | 77 | 78 | def generate_story_prompt(input_str: str, event) -> str: 79 | data = { 80 | "style": "LLM-Only", 81 | "stream": False, 82 | "messageContent": input_str, 83 | "agentId": 1601 84 | } 85 | resp = event.chat.messages(data=data) 86 | return resp 87 | 88 | def generate_paragraph_prompt(input_str: str, event) -> str: 89 | data = { 90 | "style": "LLM-Only", 91 | "stream": False, 92 | "messageContent": input_str, 93 | "agentId": 1599 94 | } 95 | resp = event.chat.messages(data=data) 96 | return resp 97 | 98 | def generate_music_prompt(input_str: str, event) -> str: 99 | data = { 100 | "style": "LLM-Only", 101 | "stream": False, 102 | "messageContent": input_str, 103 | "agentId": 1604 104 | } 105 | resp = event.chat.messages(data=data) 106 | return resp 107 | 108 | prompt = 'Given a text style and a text, turn the text into that style\nTEXT: {}\nSTYLE: {}\nNEW TEXT: ' 109 | def personalize_text(text: str, style: str, event) -> str: 110 | input_str = prompt.format(text, style) 111 | data = { 112 | "style": "LLM-Only", 113 | "stream": False, 114 | "messageContent": input_str, 115 | "agentId": 1548 116 | } 117 | resp = event.chat.messages(data=data) 118 | return resp 119 | 120 | def mindsflow_function(event, context) -> dict: 121 | topic = event.get("topic") 122 | style = event.get("text_style", None) 123 | return_url = event.get("return_url", True) 124 | if 'story' in topic or 'Story' in topic or 'STORY' in topic: 125 | json_string = generate_story_prompt(topic, event) 126 | else: 127 | json_string = generate_paragraph_prompt(topic, event) 128 | 129 | json_url = None 130 | #print(json_string) 131 | dict_object = json.loads(json_string.replace('\\', '')) 132 | 133 | music_prompt = generate_music_prompt(topic, event) 134 | dict_object['music_prompt'] = music_prompt.replace('//', '').replace('"', '') 135 | 136 | if style is not None: 137 | dict_object['original_text'] = dict_object['Text'] 138 | dict_object['Text'] = personalize_text(dict_object['Text'], style, event) 139 | 140 | json_path = f"script_{uuid.uuid4()}.json" 141 | with open(json_path, 'w') as f: 142 | json.dump(dict_object, f) 143 | json_url = upload_to_aws(json_path) 144 | os.remove(json_path) 145 | 146 | result = { 147 | 'json_string': json_string, 148 | 'json_url': json_url 149 | } 150 | # iterates over each key-value pair in the JSON object 151 | for key, value in dict_object.items(): 152 | result[key.lower()] = value 153 | 154 | return result 155 | 156 | -------------------------------------------------------------------------------- /agent-video-generator/functions/generateVoiceVits.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "generateVoiceVits", 4 | "displayName": "", 5 | "description": "A Python method that downloads a specified wav file, clones the voice, generates a new speech - potentially in another language - from provided text, and subsequently uploads the newly created wav file to AWS, finally returning the URL of the uploaded file.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "voice", 10 | "audio_url" 11 | ], 12 | "properties": { 13 | "voice": { 14 | "type": "string", 15 | "description": "Voice to use to generate new audio" 16 | }, 17 | "audio_url": { 18 | "type": "string", 19 | "description": "URL of the audio to be cloned" 20 | }, 21 | 
"clean_noise": { 22 | "type": "boolean", 23 | "description": "Clean audio input noise" 24 | } 25 | } 26 | }, 27 | "outputPattern": { 28 | "type": "object", 29 | "required": [ 30 | "audio_url" 31 | ], 32 | "properties": { 33 | "audio_url": { 34 | "type": "string", 35 | "description": "Url of the wav audio file" 36 | } 37 | } 38 | }, 39 | "tag": "VoiceCloning", 40 | "testCases": [ 41 | { 42 | "voice": "chinese_poadcast_woman1", 43 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/voice/zh_woman1_split.wav", 44 | "clean_noise": true 45 | }, 46 | { 47 | "voice": "", 48 | "audio_url": "", 49 | "clean_noise": false 50 | } 51 | ], 52 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. Upload the new generated wav file to aws and return the url", 53 | "greeting": "" 54 | } 55 | $""" 56 | 57 | import os 58 | import json 59 | import boto3 60 | import requests 61 | import random 62 | import string 63 | 64 | s3 = boto3.resource('s3') 65 | 66 | def download_file(url: str, save_path: str): 67 | resp = requests.get(url) 68 | with open(save_path, 'wb') as f: 69 | f.write(resp.content) 70 | 71 | 72 | def generate_random_string(length): 73 | letters = string.ascii_letters 74 | result_str = ''.join(random.choice(letters) for i in range(length)) 75 | return result_str 76 | 77 | 78 | def upload_to_aws(filename: str) -> str: 79 | bucket_name = os.environ.get('bucket_name') 80 | region = os.environ.get('region') 81 | session = boto3.Session( 82 | aws_access_key_id=os.environ.get('access_key_id'), 83 | aws_secret_access_key=os.environ.get('secret_access_key') 84 | ) 85 | s3_client = session.client('s3') 86 | bucket_path = 'voice-clone' 87 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 88 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 89 | url = f'{s3_base_url}{bucket_path}/{filename}' 90 | return url 91 | 92 | 93 | def mindsflow_function(event, context) -> dict: 94 | # get from event 95 | audio_url = event.get('audio_url') 96 | voice= event.get('voice') 97 | clean_noise = event.get('clean_noise') 98 | api_ip = os.environ.get('api_ip') 99 | 100 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/" 101 | 102 | data = { 103 | "audio_url": audio_url, 104 | "voice": voice, 105 | "clean_noise": clean_noise 106 | } 107 | 108 | headers = { 109 | 'Content-Type': 'application/json' 110 | } 111 | 112 | print('Generating voice...') 113 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers) 114 | if response.status_code != 200: 115 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}') 116 | print('Voice generated') 117 | 118 | audio_path = voice + '_' + audio_url.split('/')[-1] 119 | # Save the file to the directory 120 | with open(audio_path, 'wb') as file: 121 | file.write(response.content) 122 | 123 | result_url = upload_to_aws(audio_path) 124 | 125 | # clean up 126 | os.remove(audio_path) 127 | 128 | return { 129 | "audio_url": result_url 130 | } 131 | 132 | -------------------------------------------------------------------------------- /agent-video-generator/functions/loadJsonAndReturnKeys.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "loadJsonAndReturnKeys", 4 | "displayName": "", 5 | "description": "This method takes a string input, interprets it as a JSON 
object, and returns each key within it as a string.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "json_string" 10 | ], 11 | "properties": { 12 | "json_string": { 13 | "type": "string", 14 | "description": "A JSON string variable" 15 | } 16 | } 17 | }, 18 | "outputPattern": { 19 | "type": "object", 20 | "required": [], 21 | "properties": {} 22 | }, 23 | "tag": "DataPreprocessing", 24 | "testCases": [ 25 | { 26 | "json_string": "{\n\\\"Title\\\": \\\"The Enchantments of the Mystic World\\\",\n\\\"Text\\\": \\\"In a land of dreams and lore, where mythical beasts roar under an eternally twilight sky, unfurls the enigma of a fantasy style poem. Weaving an intricate tapestry of knights and elves, wizards and dragons, this poem is a saga of heroic adventures and epic battles. Dreamlike imagery is brushstroked with sonorous verses, blending the borders of reality with the enchanting realm of magical dimensions.\\\",\n\\\"Description\\\": \\\"A brief depiction of a fantasy style poem enriching the mystical world of myths and magic, injecting life into fictional characters and their bewitching land.\\\",\n\\\"Prompt\\\": \\\"An epic painting of mythical creatures like dragons and unicorns embarking on heroic adventures, with knights and elves in a magical realm under a twilight sky.\\\",\n\\\"Hashtags\\\": \\\"#SpartanRace #SpearThrow #ObstacleCourse #FitnessGoals #RaceTraining #Endurance #GetSpartanFit\\\"\\n\n}" 27 | } 28 | ], 29 | "aiPrompt": "", 30 | "greeting": "" 31 | } 32 | $""" 33 | 34 | import json 35 | 36 | def json_from_string(json_str: str) -> dict: 37 | return json.loads(json_str) 38 | 39 | def mindsflow_function(event, context) -> dict: 40 | json_string = event.get("json_string").replace('\\n', '').replace('\n', '').replace('\\', '') 41 | print(json_string) 42 | json_data = json.loads(json_string) 43 | 44 | keys = ', '.join([str(elem) for elem in json_data.keys()]) 45 | 46 | results = {} 47 | 48 | for k in json_data.keys(): 49 | results[k.lower()] = json_data[k] 50 | if k.lower() == 'description' and 'Hashtags' in json_data.keys(): 51 | results[k.lower()] += '\n' + json_data['Hashtags'].lower() 52 | 53 | return results 54 | 55 | -------------------------------------------------------------------------------- /agent-video-generator/functions/preprocessTrainData.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "preprocessTrainData", 4 | "displayName": "", 5 | "description": "This function downloads an audio file, transforms it into a wav format, and then uploads it to a specified data storage bucket.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "audio_url" 10 | ], 11 | "properties": { 12 | "voice": { 13 | "type": "string", 14 | "description": "" 15 | }, 16 | "make_zip": { 17 | "type": "boolean", 18 | "description": "" 19 | }, 20 | "audio_url": { 21 | "type": "string", 22 | "description": "URL of the file to be downloaded and converted" 23 | } 24 | } 25 | }, 26 | "outputPattern": { 27 | "type": "object", 28 | "required": [ 29 | "audio_url" 30 | ], 31 | "properties": { 32 | "audio_url": { 33 | "type": "string", 34 | "description": "url of the converted file" 35 | } 36 | } 37 | }, 38 | "tag": "VoiceCloning", 39 | "testCases": [ 40 | { 41 | "voice": "hhh", 42 | "make_zip": true, 43 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/chinese_poadcast_woman1.m4a" 44 | }, 45 | { 46 | "voice": "", 47 | "make_zip": false, 48 | "audio_url": "" 49 | } 
50 | ], 51 | "aiPrompt": "Cloning voice...", 52 | "greeting": "" 53 | } 54 | $""" 55 | 56 | import os 57 | import json 58 | import requests 59 | from pydub import AudioSegment 60 | import boto3 61 | import zipfile 62 | import glob 63 | import shutil 64 | from datetime import datetime 65 | 66 | def download_file(url: str) -> str: 67 | local_filename = url.split('/')[-1] 68 | with requests.get(url, stream=True) as r: 69 | r.raise_for_status() 70 | with open(local_filename, 'wb') as f: 71 | for chunk in r.iter_content(chunk_size=8192): 72 | f.write(chunk) 73 | return local_filename 74 | 75 | def convert_audio_to_wav(file_path: str) -> str: 76 | audio = AudioSegment.from_file(file_path) 77 | wav_filename = os.path.splitext(file_path)[0] + '.wav' 78 | audio.export(wav_filename, format="wav") 79 | return wav_filename 80 | 81 | def upload_to_aws(filename: str) -> str: 82 | # Uses your AWS credentials to access the service 83 | bucket_name = os.environ.get('bucket_name') 84 | region = os.environ.get('region') 85 | 86 | # Create a session using the provided credentials 87 | session = boto3.Session( 88 | aws_access_key_id=os.environ.get('access_key_id'), 89 | aws_secret_access_key=os.environ.get('secret_access_key') 90 | ) 91 | 92 | # Create an S3 client 93 | s3_client = session.client('s3') 94 | 95 | bucket_path = 'temp_audio' 96 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 97 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 98 | url = f'{s3_base_url}{bucket_path}/{filename}' 99 | 100 | return url 101 | 102 | def zip_wav_file(wav_file_path): 103 | # Check if file exists 104 | if not os.path.isfile(wav_file_path): 105 | print("File does not exist at provided path.") 106 | return 107 | 108 | # Extracting directory path, file name and file base name 109 | dir_path, file_name = os.path.split(wav_file_path) 110 | file_base_name, _ = os.path.splitext(file_name) 111 | 112 | # Creating new directory with same name as the wav file 113 | new_dir_path = os.path.join(dir_path, file_base_name) 114 | 115 | # If the directory already exists, append a timestamp to its name 116 | #if os.path.exists(new_dir_path): 117 | # timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S") 118 | # new_dir_path += timestamp 119 | 120 | os.makedirs(new_dir_path, exist_ok=True) 121 | 122 | # Moving the wav file to the new directory 123 | shutil.move(wav_file_path, os.path.join(new_dir_path, file_name)) 124 | 125 | # Creating a zip file and adding the directory with the wav file in it 126 | # If the zip file already exists, append a timestamp to its name 127 | zip_file_path = dir_path + '/' + file_base_name + '.zip' 128 | if os.path.isfile(zip_file_path): 129 | zip_file_path = os.path.splitext(zip_file_path)[0] + datetime.now().strftime("_%Y%m%d_%H%M%S") + ".zip" 130 | 131 | with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf: 132 | for foldername, subfolders, filenames in os.walk(new_dir_path): 133 | for filename in filenames: 134 | # create complete filepath of file in directory 135 | file_to_zip = os.path.join(foldername, filename) 136 | # add file to zip 137 | zipf.write(file_to_zip, os.path.relpath(file_to_zip, dir_path)) 138 | 139 | print(f"Zip file saved at: {zip_file_path}") 140 | return zip_file_path 141 | 142 | def mindsflow_function(event, context) -> dict: 143 | # get params from the event 144 | url = event.get("audio_url") 145 | make_zip = event.get("make_zip", False) 146 | voice = event.get("voice", None) 147 | ext = url.split('.')[-1] 148 | if ext in ['zip']: 149 | return { 150 | 
'audio_url': url 151 | } 152 | if ext in ['wav'] and make_zip is False: 153 | return { 154 | 'audio_url': url 155 | } 156 | 157 | # Download file 158 | local_filename = download_file(url) 159 | if voice is not None: 160 | new_filename = f'{voice}.wav' 161 | shutil.move(local_filename, new_filename) 162 | local_filename = new_filename 163 | 164 | # Convert audio to wav 165 | wav_filename = convert_audio_to_wav(local_filename) 166 | 167 | if make_zip: # TODO: change file name 168 | wav_filename = zip_wav_file(wav_filename) 169 | if voice is not None: 170 | new_filename = f'{voice}.zip' 171 | shutil.move(wav_filename, new_filename) 172 | wav_filename = new_filename 173 | 174 | # Upload wav file to S3 bucket 175 | response = upload_to_aws(wav_filename) 176 | 177 | files = glob.glob('./*.zip') + glob.glob('./*.wav') + glob.glob('./*.m4a') + glob.glob('./*.mp3') 178 | for file_name in files: 179 | try: 180 | os.remove(file_name) 181 | print('File', file_name, 'removed successfully.') 182 | except OSError: 183 | print('Error while deleting file', file_name) 184 | 185 | # define result 186 | result = { 187 | 'audio_url': response 188 | } 189 | 190 | return result 191 | 192 | -------------------------------------------------------------------------------- /agent-video-generator/functions/returnInputParameters.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "returnInputParameters", 4 | "displayName": "", 5 | "description": "This method is designed to accept and return input parameters.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [], 9 | "properties": {} 10 | }, 11 | "outputPattern": { 12 | "type": "object", 13 | "required": [], 14 | "properties": {} 15 | }, 16 | "tag": "ParameterReturn", 17 | "testCases": [ 18 | {} 19 | ], 20 | "aiPrompt": "Return the input parameters", 21 | "greeting": "" 22 | } 23 | $""" 24 | 25 | import json 26 | 27 | def mindsflow_function(event, context) -> dict: 28 | # directly return the input parameters 29 | return event 30 | -------------------------------------------------------------------------------- /agent-video-generator/functions/setEpochInJsonFile.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "setEpochInJsonFile", 4 | "displayName": "", 5 | "description": "Opens a local JSON config file, sets the 'epochs' field to a specified input value (N), saves the result to a new JSON file, and uploads it to S3.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "epochs" 10 | ], 11 | "properties": { 12 | "epochs": { 13 | "type": "integer", 14 | "description": "New epoch value to be set in the JSON file" 15 | } 16 | } 17 | }, 18 | "outputPattern": { 19 | "type": "object", 20 | "required": [ 21 | "config_url" 22 | ], 23 | "properties": { 24 | "config_url": { 25 | "type": "string", 26 | "description": "URL of the updated config file on S3" 27 | } 28 | } 29 | }, 30 | "tag": "VoiceCloning", 31 | "testCases": [ 32 | { 33 | "epochs": 130 34 | }, 35 | { 36 | "epochs": 0 37 | } 38 | ], 39 | "aiPrompt": "Open a json file from local, change the field epochs to N, where N is an input, and save the json in the same location. 
The json has this structure:\n\n{\n \"train\": {\n \"log_interval\": 100,\n \"eval_interval\": 200,\n \"seed\": 1234,\n \"epochs\": 100,\n \"learning_rate\": 0.0001,\n \"betas\": [\n 0.8,\n 0.99\n ],\n \"eps\": 1e-09,\n \"batch_size\": 16,\n \"fp16_run\": false,\n \"bf16_run\": false,", 40 | "greeting": "" 41 | } 42 | $""" 43 | 44 | import json 45 | import os 46 | import boto3 47 | 48 | s3 = boto3.resource('s3') 49 | 50 | def upload_to_aws(filename: str) -> str: 51 | bucket_name = os.environ.get('bucket_name') 52 | region = os.environ.get('region') 53 | session = boto3.Session( 54 | aws_access_key_id=os.environ.get('access_key_id'), 55 | aws_secret_access_key=os.environ.get('secret_access_key') 56 | ) 57 | s3_client = session.client('s3') 58 | bucket_path = 'voice-clone' 59 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 60 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 61 | url = f'{s3_base_url}{bucket_path}/{filename}' 62 | return url 63 | 64 | def modify_epochs(file_path: str, new_epoch: int) -> str: 65 | with open(file_path, 'r') as json_file: 66 | data = json.load(json_file) 67 | data['train']['epochs'] = new_epoch 68 | 69 | new_file_name = f'config_{new_epoch}.json' 70 | new_file_path = os.path.join(os.path.dirname(file_path), new_file_name) 71 | with open(new_file_path, 'w') as new_file: 72 | json.dump(data, new_file, indent=4) 73 | 74 | return new_file_path 75 | 76 | def mindsflow_function(event, context) -> dict: 77 | # extract parameters from event 78 | file_path = 'train_configs/config.json' 79 | new_epoch = event.get("epochs") 80 | 81 | # modify the epochs in JSON file 82 | new_file_path = modify_epochs(file_path, new_epoch) 83 | 84 | url = upload_to_aws(new_file_path) 85 | 86 | os.remove(new_file_path) 87 | 88 | # formulate the result 89 | result = { 90 | 'config_url': url 91 | } 92 | 93 | return result 94 | 95 | -------------------------------------------------------------------------------- /agent-video-generator/functions/splitVoiceMusic.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "splitVoiceMusic", 4 | "displayName": "", 5 | "description": "This Python method downloads an audio file from a given URL, separates music and voice using Spleeter, uploads the results to S3, and returns their URLs.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "audio_url" 10 | ], 11 | "properties": { 12 | "audio_url": { 13 | "type": "string", 14 | "description": "The url of the audio file to be processed" 15 | } 16 | } 17 | }, 18 | "outputPattern": { 19 | "type": "object", 20 | "required": [ 21 | "vocals_url", 22 | "accompaniment_url" 23 | ], 24 | "properties": { 25 | "vocals_url": { 26 | "type": "string", 27 | "description": "The url of the vocal part of the audio file on S3" 28 | }, 29 | "accompaniment_url": { 30 | "type": "string", 31 | "description": "The url of the accompaniment part of the audio file on S3" 32 | } 33 | } 34 | }, 35 | "tag": "DataPreprocessing", 36 | "testCases": [ 37 | { 38 | "audio_url": "https://github.com/deezer/spleeter/raw/master/audio_example.mp3" 39 | }, 40 | { 41 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/dd6f9f73-de0c-4792-a48c-ccc2e8abe7bd_audio.wav" 42 | } 43 | ], 44 | "aiPrompt": "Given the url of an audio file, download it, split music and voice with Spleeter. 
Upload the results to s3 and return their urls", 45 | "greeting": "" 46 | } 47 | $""" 48 | 49 | import json 50 | import os 51 | import subprocess 52 | import urllib.request 53 | import boto3 54 | from botocore.exceptions import NoCredentialsError 55 | import uuid 56 | import shutil 57 | 58 | 59 | def download_file(url: str) -> str: 60 | local_filename = url.split('/')[-1] 61 | urllib.request.urlretrieve(url, local_filename) 62 | return local_filename 63 | 64 | def upload_to_aws(filename: str) -> str: 65 | # Uses your AWS credentials to access the service 66 | bucket_name = os.environ.get('bucket_name') 67 | region = os.environ.get('region') 68 | # Create a session using the provided credentials 69 | session = boto3.Session( 70 | aws_access_key_id=os.environ.get('access_key_id'), 71 | aws_secret_access_key=os.environ.get('secret_access_key') 72 | ) 73 | # Create an S3 client 74 | s3_client = session.client('s3') 75 | bucket_path = 'ai-video' 76 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 77 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 78 | url = f'{s3_base_url}{bucket_path}/{filename}' 79 | return url 80 | 81 | def mindsflow_function(event, context) -> dict: 82 | 83 | # executed only the first time, to set up the environment 84 | def execute_command(command): 85 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE) 86 | output, error = process.communicate() 87 | execute_command("pip uninstall -y ffmpeg") 88 | execute_command("pip uninstall -y ffmpeg-python") 89 | execute_command("pip install ffmpeg-python") 90 | execute_command("pip install spleeter") 91 | 92 | from spleeter.separator import Separator 93 | 94 | # Get the audio URL from the event 95 | audio_url = event.get("audio_url") 96 | 97 | # Download the audio file 98 | audio_file = download_file(audio_url) 99 | audio_name = audio_file.split('.')[0] 100 | 101 | # Split the music and voice with Spleeter 102 | vocals_file = f"{audio_name}/vocals.wav" 103 | accompaniment_file = f"{audio_name}/accompaniment.wav" 104 | 105 | 106 | # Create a separator object 107 | separator = Separator('spleeter:2stems') 108 | 109 | # Use the separator to separate the streams 110 | # the downloaded audio_file is the input; stems are written under its base name 111 | separator.separate_to_file(audio_file, '') 112 | 113 | # Upload the results to S3 114 | vocals_url = upload_to_aws(vocals_file) 115 | accompaniment_url = upload_to_aws(accompaniment_file) 116 | 117 | execute_command("pip uninstall -y spleeter") 118 | 119 | # Define result 120 | result = { 121 | 'vocals_url': vocals_url, 122 | 'accompaniment_url': accompaniment_url 123 | } 124 | 125 | # Delete the files after uploading 126 | os.remove(vocals_file) 127 | os.remove(accompaniment_file) 128 | shutil.rmtree(audio_name) 129 | 130 | return result 131 | 
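A hypothetical local smoke test for the function above, assuming the AWS credential environment variables (bucket_name, region, access_key_id, secret_access_key) are exported and ffmpeg is available; the sample URL is taken from this file's own test cases.

if __name__ == '__main__':
    # event is a plain dict here, since this function only calls event.get
    sample_event = {'audio_url': 'https://github.com/deezer/spleeter/raw/master/audio_example.mp3'}
    print(mindsflow_function(sample_event, None))

-------------------------------------------------------------------------------- /agent-video-generator/functions/textToSpeech.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "textToSpeech", 4 | "displayName": "", 5 | "description": "This Python method converts a text string into audio. 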
The URL of the resulting audio is then returned.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "text" 10 | ], 11 | "properties": { 12 | "text": { 13 | "type": "string", 14 | "description": "Text to convert into voice" 15 | }, 16 | "speaker": { 17 | "type": "string", 18 | "description": "speaker" 19 | }, 20 | "language": { 21 | "type": "string", 22 | "description": "Voice language" 23 | }, 24 | "voice_speed": { 25 | "type": "number", 26 | "description": "voice speed" 27 | } 28 | } 29 | }, 30 | "outputPattern": { 31 | "type": "object", 32 | "required": [ 33 | "duration", 34 | "audio_url" 35 | ], 36 | "properties": { 37 | "duration": { 38 | "type": "number", 39 | "description": "" 40 | }, 41 | "audio_url": { 42 | "type": "string", 43 | "description": "URL address of the generated voice" 44 | } 45 | } 46 | }, 47 | "tag": "TextToSpeech", 48 | "testCases": [ 49 | { 50 | "text": "Hi, my name is Hello world", 51 | "speaker": "en-US-GuyNeural", 52 | "language": "en", 53 | "voice_speed": 1 54 | }, 55 | { 56 | "text": "What is the weather today?", 57 | "speaker": "", 58 | "language": "en", 59 | "voice_speed": 1 60 | }, 61 | { 62 | "text": "Mi piace mangiare la pasta", 63 | "speaker": "", 64 | "language": "it", 65 | "voice_speed": 1 66 | } 67 | ], 68 | "aiPrompt": "The method converts a given text string into audio. If a sample voice is provided, the generated audio is created by cloning the sample voice. The URL address of the generated voice is returned.", 69 | "greeting": "" 70 | } 71 | $""" 72 | 73 | import os 74 | import boto3 75 | import datetime 76 | import requests 77 | from pydub import AudioSegment 78 | import pydub 79 | import azure.cognitiveservices.speech as speechsdk 80 | from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat 81 | 82 | def download_file(url, save_path): 83 | response = requests.get(url) 84 | with open(save_path, 'wb') as file: 85 | file.write(response.content) 86 | file_extension = url.split(".")[-1].lower() 87 | if file_extension == "mp3": # Convert the MP3 file to WAV 88 | audio = AudioSegment.from_mp3(save_path) 89 | audio.export(save_path, format="wav") 90 | return save_path 91 | elif file_extension == "wav": 92 | return save_path 93 | else: 94 | raise Exception("Unsupported file format. 
Only MP3 and WAV files are supported.") 95 | 96 | lang_dict = { 97 | 'en': 'en-US', 98 | 'ch': 'zh-CN', 99 | 'zh': 'zh-CN', 100 | 'it': 'it-IT', 101 | 'de': 'de-DE', 102 | 'fr': 'fr-FR', 103 | 'es': 'es-ES' 104 | } 105 | 106 | speaker_dict = { 107 | 'en-US': 'Microsoft Server Speech Text to Speech Voice (en-US, Jessa24kRUS)', 108 | 'zh-CN': 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)', 109 | 'it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)', 110 | 'de-DE': 'Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)', 111 | 'fr-FR': 'Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)', 112 | 'es-ES': 'Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)' 113 | } 114 | 115 | def generate_audio(text: str, lang: str = 'en', voice_speed: float = 1.0, speaker: str = None): 116 | if lang in lang_dict.keys(): 117 | lang = lang_dict[lang] 118 | print('Setting lang:', lang) 119 | if speaker is None or speaker in ['none', '']: # use default speaker 120 | speaker = speaker_dict[lang] 121 | print('Using speaker:', speaker) 122 | current_time = datetime.datetime.now() 123 | timestamp = current_time.strftime("%Y%m%d%H%M%S") 124 | filename = f'audio_{timestamp}.wav' 125 | speech_key = os.environ.get('azure_key') 126 | service_region = os.environ.get('azure_region') 127 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) 128 | audio_output = speechsdk.audio.AudioOutputConfig(filename=filename) 129 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output) 130 | 131 | if voice_speed != 1.0 and voice_speed != 1: 132 | voice_speed = int(voice_speed * 100.0 - 100.0) 133 | text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'><prosody rate='{voice_speed}%'>" + text + "</prosody></voice></speak>" # SSML wrapper reconstructed: the original markup was stripped from this dump 134 | else: 135 | text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'>" + text + "</voice></speak>" # SSML wrapper reconstructed as above 136 | result = speech_synthesizer.speak_ssml_async(text).get() 137 | stream = AudioDataStream(result) 138 | stream.save_to_wav_file(filename) 139 | 140 | # Get the duration of the audio file 141 | audio = pydub.AudioSegment.from_file(filename) 142 | duration = audio.duration_seconds 143 | 144 | bucket_name = os.environ.get('bucket_name') 145 | region = os.environ.get('region') 146 | 147 | # Create a session using the provided credentials 148 | session = boto3.Session( 149 | aws_access_key_id=os.environ.get('access_key_id'), 150 | aws_secret_access_key=os.environ.get('secret_access_key') 151 | ) 152 | 153 | # Create an S3 client 154 | s3_client = session.client('s3') 155 | 156 | bucket_path = 'temp_audio' 157 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 158 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 159 | video_url = f'{s3_base_url}{bucket_path}/{filename}' 160 | 161 | os.remove(filename) 162 | 163 | return video_url, duration 164 | 165 | def mindsflow_function(event, context) -> dict: 166 | # get the text and save path from the event 167 | text = event.get("text") 168 | lang = event.get("language", "en") 169 | voice_speed = event.get("voice_speed", 1.0) # default to normal speed; a None value would crash the rate math in generate_audio 170 | speaker = event.get("speaker", None) 171 | 172 | # generate the audio file 173 | audio_url, duration = generate_audio(text, lang, voice_speed, speaker) 174 | 175 | # define result 176 | result = { 177 | 'audio_url': audio_url, 178 | 'duration': duration 179 | } 180 | 181 | return result 182 | 183 | 
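A hypothetical local smoke test for textToSpeech, assuming azure_key, azure_region, and the AWS credential environment variables are exported; the sample event mirrors the first test case above.

if __name__ == '__main__':
    sample_event = {'text': 'Hi, my name is Hello world', 'speaker': 'en-US-GuyNeural', 'language': 'en', 'voice_speed': 1}
    print(mindsflow_function(sample_event, None))

-------------------------------------------------------------------------------- /agent-video-generator/functions/transcribeAudio.py: 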
-------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "transcribeAudio", 4 | "displayName": "", 5 | "description": "This method transcribes audio into text using Azure API, maps start time and duration for each word, converts the transcription to JSON format, and uploads the resulting file to AWS S3. Its input is an audio file.", 6 | "inputPattern": { 7 | "type": "object", 8 | "properties": { 9 | "lang": { 10 | "type": "string", 11 | "description": "" 12 | }, 13 | "audio_url": { 14 | "type": "string", 15 | "description": "URL string of the audio to be transcribed" 16 | } 17 | }, 18 | "required": [ 19 | "audio_url" 20 | ] 21 | }, 22 | "outputPattern": { 23 | "type": "object", 24 | "properties": { 25 | "text": { 26 | "type": "string", 27 | "description": "" 28 | }, 29 | "duration": { 30 | "type": "number", 31 | "description": "" 32 | }, 33 | "transcription_json_url": { 34 | "type": "string", 35 | "description": "The transcription results from the audio file" 36 | } 37 | }, 38 | "required": [ 39 | "text", 40 | "duration", 41 | "transcription_json_url" 42 | ] 43 | }, 44 | "tag": "TextToSpeech", 45 | "testCases": [ 46 | { 47 | "lang": "en", 48 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/audio_20231226132719.wav" 49 | }, 50 | { 51 | "lang": "en", 52 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav" 53 | } 54 | ], 55 | "aiPrompt": "This method is designed to transcribe audio using the Azure API, get the start time and duration of each word, convert the output to JSON format, and then upload the resulting file to AWS S3. The input for this process is an audio file", 56 | "greeting": "" 57 | } 58 | $""" 59 | 60 | import json 61 | import requests 62 | import boto3 63 | import time 64 | import random 65 | import string 66 | import os 67 | from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer, AudioConfig 68 | import azure.cognitiveservices.speech as speechsdk 69 | from pydub.utils import mediainfo 70 | 71 | 72 | def get_random_string(): 73 | letters = string.ascii_lowercase 74 | result_str = ''.join(random.choice(letters) for _ in range(8)) 75 | timestamp = int(time.time()) 76 | random_str = str(timestamp) + '_' + result_str 77 | return random_str 78 | 79 | 80 | def upload_to_aws(filename: str) -> str: 81 | # Uses your AWS credentials to access the service 82 | bucket_name = os.environ.get('bucket_name') 83 | region = os.environ.get('region') 84 | 85 | # Create a session using the provided credentials 86 | session = boto3.Session( 87 | aws_access_key_id=os.environ.get('access_key_id'), 88 | aws_secret_access_key=os.environ.get('secret_access_key') 89 | ) 90 | 91 | # Create an S3 client 92 | s3_client = session.client('s3') 93 | 94 | bucket_path = 'ai-video' 95 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 96 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 97 | url = f'{s3_base_url}{bucket_path}/{filename}' 98 | 99 | return url 100 | 101 | 102 | def modify_last_word(input_string): 103 | # Remove any trailing whitespace 104 | input_string = input_string.strip() 105 | 106 | if input_string.endswith(','): 107 | # Replace the last character with a period 108 | input_string = input_string[:-1] + '.' 
109 | # Check if the last word ends with a period 110 | if not input_string.endswith('.'): 111 | # Add a period at the end if the last word doesn't end with one 112 | input_string += '.' 113 | 114 | return input_string 115 | 116 | 117 | def add_punctuation(input_str: str, event) -> str: 118 | data = { 119 | "style": "LLM-Only", 120 | "stream": False, 121 | "messageContent": input_str, 122 | "agentId": 1605 123 | } 124 | resp = event.chat.messages(data=data) 125 | return resp 126 | 127 | 128 | def fix_punctuation(a_string: str, b_string: str) -> str: 129 | i_a = 0 130 | i_b = 0 131 | while i_a < len(a_string) - 1 and i_b < len(b_string) - 1: 132 | while b_string[i_b] in [',', '.', '!', '?']: 133 | i_b += 1 134 | if a_string[i_a] != ' ' and a_string[i_a + 1] != ' ' and b_string[i_b] == a_string[i_a] and (b_string[i_b+1:i_b+3] == ', ' or b_string[i_b+1:i_b+2] == ',') and b_string[i_b+3] == a_string[i_a + 1]: 135 | print('a') 136 | b_string = b_string[:i_b+1] + b_string[i_b+3:] 137 | i_a += 1 138 | i_b += 1 139 | return b_string 140 | 141 | 142 | def transcribe_audio(audio_path: str, lang: str, event) -> dict: 143 | final_results = {'Display': '', 'Lexical': '', 'Words': [], 'Duration': 0} 144 | done = False 145 | audio_info = mediainfo(audio_path) 146 | total_duration = int(float(audio_info["duration"]) * 1e7) # convert seconds to 100-nanosecond units 147 | print('duration:', total_duration) 148 | final_results['Duration'] = total_duration 149 | 150 | def recognized_cb(evt): 151 | """callback that is called when a piece of speech is recognized""" 152 | print('RECOGNIZED: {}'.format(evt)) 153 | nonlocal final_results 154 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech: 155 | json_result = json.loads(evt.result.json) 156 | 157 | lexical = json_result["NBest"][0]['Lexical'].split() 158 | display = json_result["NBest"][0]['Display'].split() 159 | lexical= [element for element in lexical if not element.startswith("'")] 160 | #print('L', len(lexical), lexical) 161 | #print('D',len(display), display) 162 | 163 | words = [] 164 | lexical_list = [] 165 | #display_list = [] 166 | best_words = json_result["NBest"][0]['Words'] 167 | #print('B', best_words) 168 | i = 0 169 | for item in best_words: 170 | if "'" in item['Word']: 171 | print('skip:', item['Word']) 172 | continue 173 | if (best_words[i]['Offset']) +best_words[i]['Duration'] / 2 <= total_duration: 174 | words.append(best_words[i]) 175 | lexical_list.append(lexical[i]) 176 | #display_list.append(display[i]) 177 | #print(lexical[i], best_words[i]['Word'], best_words[i]['Offset']) 178 | i += 1 179 | #print(i, len(best_words)) 180 | while i < len(best_words) and (best_words[i]['Offset'] +best_words[i]['Duration'] / 2 ) <= total_duration: 181 | if i>= len(lexical): 182 | #print('DEBUG:', i, len(lexical), len(best_words)) 183 | #print('DEBUG: exit cycle') 184 | break 185 | words.append(best_words[i]) 186 | lexical_list.append(lexical[i]) 187 | #display_list.append(display[i]) 188 | #print(display[i], lexical[i], best_words[i]['Word'], best_words[i]['Offset']) 189 | i += 1 190 | 191 | #print('end record duration') 192 | #print(display[i], lexical[i], best_words[i]['Word'], best_words[i]['Offset']) 193 | lexical = ' '.join(lexical_list).strip() 194 | #display = ' '.join(display_list).strip() 195 | #print('update results') 196 | final_results['Words'] += words 197 | final_results['Lexical'] += lexical.strip() + ' ' 198 | #final_results['Display'] += display.strip() + ' ' 199 | #print(final_results['Lexical'] ) 200 | 201 | def 
stop_cb(evt): 202 | """callback that stops continuous recognition on receiving an event `evt`""" 203 | print('CLOSING on {}'.format(evt)) 204 | nonlocal done 205 | done = True 206 | 207 | # your Azure Speech service configuration 208 | speech_key = os.environ.get('azure_key') 209 | service_region = os.environ.get('azure_region') 210 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region) 211 | speech_config.request_word_level_timestamps() 212 | lang_dict = { 213 | 'en': 'en-US', 214 | 'ch': 'zh-CN', 215 | 'zh': 'zh-CN', 216 | 'it': 'it-IT', 217 | 'de': 'de-DE', 218 | 'fr': 'fr-FR', 219 | 'es': 'es-ES' 220 | } 221 | speech_config.speech_recognition_language = lang_dict[lang] 222 | #speech_config.set_property(speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "1000") 223 | 224 | # specifying audio file path 225 | audio_input = speechsdk.AudioConfig(filename=audio_path) 226 | 227 | # creating a speech recognizer 228 | speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input) 229 | speech_recognizer.recognized.connect(recognized_cb) 230 | speech_recognizer.session_stopped.connect(stop_cb) 231 | speech_recognizer.canceled.connect(stop_cb) 232 | 233 | # perform continuous recognition 234 | speech_recognizer.start_continuous_recognition() 235 | while not done: 236 | time.sleep(.5) 237 | 238 | final_results['Display'] = add_punctuation(final_results['Lexical'], event).strip().replace('//', '').replace('"', '') 239 | final_results['Display'] = fix_punctuation(final_results['Lexical'], final_results['Display']) 240 | print(len(final_results['Display'].split()), len(final_results['Lexical'].split())) 241 | #final_results['Display'] = modify_last_word(final_results['Display']) 242 | return final_results 243 | 244 | 245 | def mindsflow_function(event, context) -> dict: 246 | # get the audio url from the event 247 | audio_url = event.get("audio_url") 248 | lang = event.get("lang", "en") 249 | 250 | # download the audio file 251 | audio_file = requests.get(audio_url) 252 | 253 | audio_path = audio_url.split('/')[-1] 254 | with open(audio_path, 'wb') as f: 255 | f.write(audio_file.content) 256 | 257 | # get the Transcription result 258 | transcription_result = transcribe_audio(audio_path, lang, event) 259 | 260 | transcription_path = 'audio_transcription_{}.json'.format(get_random_string()) 261 | # upload transcription result to S3 262 | with open(transcription_path, 'w') as f: 263 | json.dump(transcription_result, f) 264 | 265 | url = upload_to_aws(transcription_path) 266 | 267 | # prepare the result 268 | result = { 269 | 'transcription_json_url': url, 270 | 'duration': transcription_result['Duration'], 271 | 'text': transcription_result['Display'] 272 | } 273 | 274 | if os.path.exists(transcription_path): 275 | os.remove(transcription_path) 276 | 277 | return result 278 | 279 | -------------------------------------------------------------------------------- /agent-video-generator/functions/translateCaptionsJson.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "translateCaptionsJson", 4 | "displayName": "", 5 | "description": "Translate captions in json file", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "json_url", 10 | "source_language", 11 | "target_language" 12 | ], 13 | "properties": { 14 | "json_url": { 15 | "type": "string", 16 | "description": "" 17 | }, 18 | "source_language": { 19 | "type": "string", 20 | "description": "" 21 | 
}, 22 | "target_language": { 23 | "type": "string", 24 | "description": "" 25 | } 26 | } 27 | }, 28 | "outputPattern": { 29 | "type": "object", 30 | "required": [ 31 | "json_url" 32 | ], 33 | "properties": { 34 | "json_url": { 35 | "type": "string", 36 | "description": "" 37 | } 38 | } 39 | }, 40 | "tag": "Translation", 41 | "testCases": [ 42 | { 43 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1699866757_slkpxpcq.json", 44 | "source_language": "en", 45 | "target_language": "it" 46 | }, 47 | { 48 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1700135459_xyxdjgbl.json", 49 | "source_language": "zh", 50 | "target_language": "en" 51 | } 52 | ], 53 | "aiPrompt": "", 54 | "greeting": "" 55 | } 56 | $""" 57 | 58 | import json 59 | from googletrans import Translator, LANGUAGES 60 | import os 61 | import boto3 62 | import requests 63 | import shutil 64 | import random 65 | import string 66 | import pydub 67 | 68 | 69 | def download_file(url, filename): 70 | res = requests.get(url) 71 | with open(filename, "wb") as f: 72 | f.write(res.content) 73 | 74 | 75 | def upload_to_aws(filename: str) -> str: 76 | bucket_name = os.environ.get('bucket_name') 77 | region = os.environ.get('region') 78 | session = boto3.Session( 79 | aws_access_key_id=os.environ.get('access_key_id'), 80 | aws_secret_access_key=os.environ.get('secret_access_key') 81 | ) 82 | s3_client = session.client('s3') 83 | bucket_path = 'temp_audio' 84 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 85 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 86 | url = f'{s3_base_url}{bucket_path}/{filename}' 87 | return url 88 | 89 | 90 | def get_captions_from_url(url): 91 | filename = f"{url.split('/')[-1]}" 92 | # download the json file 93 | download_file(url, filename) 94 | # read the contents 95 | with open(filename, 'r', encoding='utf-8') as f: 96 | captions = json.load(f) 97 | return captions, filename 98 | 99 | 100 | def translate_text(text, source_language, target_language): 101 | translator = Translator() 102 | lang_dict = { 103 | 'en': 'english', 104 | 'zh': 'chinese (simplified)', 105 | 'ch': 'chinese (simplified)', 106 | 'de': 'german', 107 | 'ge': 'german', 108 | 'it': 'italian', 109 | 'fr': 'french', 110 | 'sp': 'spanish', 111 | 'es': 'spanish', 112 | } 113 | source_language = lang_dict[source_language] 114 | target_language = lang_dict[target_language] 115 | #print(source_language, target_language) 116 | #print(LANGUAGES.values()) 117 | if source_language not in LANGUAGES.values() or target_language not in LANGUAGES.values(): 118 | return "Invalid source or target language." 
119 | translation = translator.translate(text, src=source_language, dest=target_language) 120 | return translation.text 121 | 122 | 123 | # TODO: make this function independent 124 | def translate_captions(captions, source_language, target_language): 125 | translated_captions = [] 126 | for cap in captions: 127 | cap['translation'] = translate_text(cap['sentence'], source_language, target_language) 128 | translated_captions.append(cap) 129 | return translated_captions 130 | 131 | 132 | def mindsflow_function(event, context) -> dict: 133 | json_url = event.get("json_url") 134 | target_language = event.get("target_language") 135 | source_language = event.get("source_language") 136 | 137 | # download and read the captions from the json file 138 | captions, json_file = get_captions_from_url(json_url) 139 | # add translated sentences into the target language 140 | translated_captions = translate_captions(captions, source_language, target_language) 141 | 142 | translated_json = 'translated' + ''.join(random.choice(string.ascii_letters) for _ in range(6)) + '.json' 143 | with open(translated_json, 'w', encoding='utf8') as f: 144 | json.dump(translated_captions, f, ensure_ascii=False, indent=4) 145 | json_url = upload_to_aws(translated_json) 146 | os.remove(translated_json) 147 | 148 | return {'json_url': json_url} 149 | 150 | -------------------------------------------------------------------------------- /agent-video-generator/functions/translateSrtFile.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "translateSrtFile", 4 | "displayName": "", 5 | "description": "This method downloads a subtitle file from a provided URL, translates it into a specified target language while keeping the original language, uploads the translated file to S3, and removes it from the local system.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "srt_url", 10 | "source_lang", 11 | "target_lang", 12 | "show_source_lang_captions", 13 | "show_target_lang_captions" 14 | ], 15 | "properties": { 16 | "srt_url": { 17 | "type": "string", 18 | "description": "URL of the SRT file to be translated" 19 | }, 20 | "source_lang": { 21 | "type": "string", 22 | "description": "The language of the original SRT file" 23 | }, 24 | "target_lang": { 25 | "type": "string", 26 | "description": "The language to translate the SRT file into" 27 | }, 28 | "captions_line": { 29 | "type": "integer", 30 | "description": "Maximum number of characters per caption line for CJK languages" 31 | }, 32 | "show_source_lang_captions": { 33 | "type": "boolean", 34 | "description": "Whether to keep the source-language captions in the output" 35 | }, 36 | "show_target_lang_captions": { 37 | "type": "boolean", 38 | "description": "Whether to include the translated captions in the output" 39 | } 40 | } 41 | }, 42 | "outputPattern": { 43 | "type": "object", 44 | "required": [ 45 | "transl_srt_url" 46 | ], 47 | "properties": { 48 | "transl_srt_url": { 49 | "type": "string", 50 | "description": "The S3 bucket path where the translated file is uploaded" 51 | } 52 | } 53 | }, 54 | "tag": "VideoCaptions", 55 | "testCases": [ 56 | { 57 | "srt_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/01w5zy.srt", 58 | "source_lang": "it", 59 | "target_lang": "ch", 60 | "captions_line": 15, 61 | "show_source_lang_captions": true, 62 | "show_target_lang_captions": false 63 | }, 64 | { 65 | "srt_url": "", 66 | "source_lang": "", 67 | "target_lang": "de", 68 | "captions_line": 0, 69 | "show_source_lang_captions": false, 70 | "show_target_lang_captions": false 71 | } 72 | ], 73 | "aiPrompt": "Given the url of a srt file, download it and translate 
its subtitles to a target language. The final srt file must contain subtitles in both languages. Inputs also include the original and target languages. Finally upload the file to s3 and remove it from local.", 74 | "greeting": "" 75 | } 76 | $""" 77 | 78 | import json 79 | import boto3 80 | import requests 81 | from googletrans import Translator 82 | from pysrt import open as open_srt 83 | import random 84 | import os 85 | import string 86 | 87 | 88 | def download_file(url: str, local_path: str) -> bool: 89 | r = requests.get(url, allow_redirects=True) 90 | open(local_path, 'wb').write(r.content) 91 | return True 92 | 93 | 94 | def translate_text(input_file_path: str, output_file_path: str, origin_lang: str, target_lang: str) -> bool: 95 | translator = Translator() 96 | srt_file = open_srt(input_file_path) 97 | for line in srt_file: 98 | translated_text = translator.translate(line.text, src=origin_lang, dest=target_lang) 99 | line.text += "\n" + translated_text.text 100 | srt_file.save(output_file_path, encoding='utf-8') 101 | return True 102 | 103 | def split_text(input_file_path: str, output_file_path: str, source_lang: str, target_lang: str, new_line_after: int = 15, show_target_lang_captions=True, show_source_lang_captions=True) -> bool: 104 | srt_file = open_srt(input_file_path) 105 | for line in srt_file: 106 | source_text, trans_text = line.text.split("\n") 107 | if "chinese" in target_lang or "japanese" in target_lang: 108 | trans_text = '\n'.join(trans_text[i:min(i+new_line_after, len(trans_text))] for i in range(0, len(trans_text), new_line_after)) 109 | if "chinese" in source_lang or "japanese" in source_lang: 110 | source_text = '\n'.join(source_text[i:min(i+new_line_after, len(source_text))] for i in range(0, len(source_text), new_line_after)) 111 | if show_source_lang_captions is False and show_target_lang_captions is True: 112 | line.text = trans_text 113 | elif show_target_lang_captions is False and show_source_lang_captions is True: 114 | line.text = source_text 115 | else: 116 | line.text = source_text + "\n" + trans_text 117 | srt_file.save(output_file_path, encoding='utf-8') 118 | return True 119 | 120 | 121 | llm_prompt = '''Given the input sentence in {}, correct any logical, semantic or spelling mistake. If possible also summarize the corrected sentence. Return only the correct sentence. 
122 | SENTENCE: {} 123 | CORRECT SENTENCE: ''' 124 | def fix_text(input_str: str, event) -> str: 125 | data = { 126 | "style": "LLM-Only", 127 | "stream": False, 128 | "messageContent": input_str, 129 | "agentId": 964 130 | } 131 | resp = event.chat.messages(data=data) 132 | return resp 133 | 134 | 135 | def fix_srt_file(input_file_path: str, origin_lang: str, event) -> bool: 136 | srt_file = open_srt(input_file_path) 137 | for line in srt_file: 138 | temp_prompt = llm_prompt.format(origin_lang, line.text) 139 | fixed_text = fix_text(temp_prompt, event) 140 | #print(line.text, fixed_text) 141 | line.text = fixed_text 142 | srt_file.save(input_file_path, encoding='utf-8') 143 | return True 144 | 145 | 146 | s3_client = boto3.client('s3') 147 | 148 | def upload_to_aws(filename: str) -> str: 149 | # Uses your AWS credentials to access the service 150 | bucket_name = os.environ.get('bucket_name') 151 | region = os.environ.get('region') 152 | # Create a session using the provided credentials 153 | session = boto3.Session( 154 | aws_access_key_id=os.environ.get('access_key_id'), 155 | aws_secret_access_key=os.environ.get('secret_access_key') 156 | ) 157 | # Create an S3 client 158 | s3_client = session.client('s3') 159 | bucket_path = 'ai-video' 160 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 161 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 162 | url = f'{s3_base_url}{bucket_path}/{filename}' 163 | return url 164 | 165 | def remove_file(local_path: str): 166 | os.remove(local_path) 167 | 168 | 169 | def mindsflow_function(event, context) -> dict: 170 | srt_file_url = event.get("srt_url") 171 | src_lang = event.get("source_lang") 172 | tgt_lang = event.get("target_lang") 173 | captions_line = event.get("captions_line", 15) 174 | show_target_lang_captions = event.get("show_target_lang_captions", True) 175 | show_source_lang_captions = event.get("show_source_lang_captions", True) 176 | 177 | input_file_path = "input_file.srt" 178 | random_string = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(6)) 179 | output_file_path = random_string + "output_file.srt" 180 | 181 | download_file(srt_file_url, input_file_path) 182 | 183 | lang_dict = { 184 | 'en': 'english', 185 | 'zh': 'chinese (simplified)', 186 | 'ch': 'chinese (simplified)', 187 | 'de': 'german', 188 | 'ge': 'german', 189 | 'it': 'italian', 190 | 'fr': 'french', 191 | 'sp': 'spanish', 192 | 'es': 'spanish', 193 | } 194 | if src_lang in lang_dict.keys(): 195 | src_lang = lang_dict[src_lang] 196 | if tgt_lang in lang_dict.keys(): 197 | tgt_lang = lang_dict[tgt_lang] 198 | 199 | #fix_srt_file(input_file_path, src_lang, event) 200 | 201 | translate_text(input_file_path, input_file_path, src_lang, tgt_lang) 202 | split_text(input_file_path, output_file_path, src_lang, tgt_lang, new_line_after=captions_line, show_target_lang_captions=show_target_lang_captions, 203 | show_source_lang_captions=show_source_lang_captions) 204 | 205 | trans_srt_url = upload_to_aws(output_file_path) 206 | 207 | remove_file(input_file_path) 208 | remove_file(output_file_path) 209 | 210 | result = { 211 | 'transl_srt_url': trans_srt_url 212 | } 213 | 214 | return result 215 | 216 | -------------------------------------------------------------------------------- /agent-video-generator/functions/translateTargetToSource.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "translateTargetToSource", 4 | "displayName": "", 5 | 
"description": "This method is designed to translate text from one language to another, utilizing the target language as input and outputting the translation in the source language.", 6 | "inputPattern": { 7 | "type": "object", 8 | "required": [ 9 | "text", 10 | "source_lang", 11 | "target_lang" 12 | ], 13 | "properties": { 14 | "text": { 15 | "type": "string", 16 | "description": "Text to be translated" 17 | }, 18 | "source_lang": { 19 | "type": "string", 20 | "description": "Source language of the text" 21 | }, 22 | "target_lang": { 23 | "type": "string", 24 | "description": "Target language to translate the text into" 25 | } 26 | } 27 | }, 28 | "outputPattern": { 29 | "type": "object", 30 | "required": [ 31 | "text" 32 | ], 33 | "properties": { 34 | "text": { 35 | "type": "string", 36 | "description": "The translated text" 37 | } 38 | } 39 | }, 40 | "tag": "Translation", 41 | "testCases": [ 42 | { 43 | "text": "Hello world", 44 | "source_lang": "english", 45 | "target_lang": "chinese (simplified)" 46 | }, 47 | { 48 | "text": "Guten tag", 49 | "source_lang": "", 50 | "target_lang": "" 51 | } 52 | ], 53 | "aiPrompt": "Translate a text from target language to source language", 54 | "greeting": "" 55 | } 56 | $""" 57 | 58 | import json 59 | from googletrans import Translator, LANGUAGES 60 | 61 | def translate_text(text, source_language, target_language): 62 | #print(LANGUAGES.values()) 63 | translator = Translator() 64 | 65 | if source_language not in LANGUAGES.values() or target_language not in LANGUAGES.values(): 66 | return "Invalid source or target language." 67 | 68 | translation = translator.translate(text, src=source_language, dest=target_language) 69 | 70 | return translation.text 71 | 72 | def mindsflow_function(event, context) -> dict: 73 | # get the text and languages from the event 74 | text = event.get("text") 75 | src_lang = event.get("source_lang") 76 | tgt_lang = event.get("target_lang") 77 | 78 | lang_dict = { 79 | 'en': 'english', 80 | 'zh': 'chinese (simplified)', 81 | 'ch': 'chinese (simplified)', 82 | 'de': 'german', 83 | 'ge': 'german', 84 | 'it': 'italian', 85 | 'fr': 'french', 86 | 'sp': 'spanish', 87 | 'es': 'spanish', 88 | } 89 | if src_lang in lang_dict.keys(): 90 | src_lang = lang_dict[src_lang] 91 | if tgt_lang in lang_dict.keys(): 92 | tgt_lang = lang_dict[tgt_lang] 93 | 94 | # get the translation result 95 | translation_result = translate_text(text, src_lang, tgt_lang) 96 | 97 | # define result 98 | result = { 99 | 'text': translation_result 100 | } 101 | 102 | return result 103 | 104 | -------------------------------------------------------------------------------- /agent-video-generator/functions/uploadYoutubeVideo.py: -------------------------------------------------------------------------------- 1 | """$ 2 | { 3 | "name": "uploadYoutubeVideo", 4 | "displayName": "", 5 | "description": "Manages the process of uploading a video to YouTube inclusive of its URL, title, description, and category, after which it deletes the video and returns a success status. 
6 |     "inputPattern": {
7 |         "type": "object",
8 |         "required": [
9 |             "title",
10 |             "upload",
11 |             "category",
12 |             "video_url",
13 |             "description",
14 |             "account_name"
15 |         ],
16 |         "properties": {
17 |             "title": {
18 |                 "type": "string",
19 |                 "description": "Title of the video to be uploaded"
20 |             },
21 |             "upload": {
22 |                 "type": "boolean",
23 |                 "description": "Whether to actually upload the video; if false, the function returns immediately with upload_success set to false"
24 |             },
25 |             "category": {
26 |                 "type": "string",
27 |                 "description": "Category of the video to be uploaded"
28 |             },
29 |             "video_url": {
30 |                 "type": "string",
31 |                 "description": "URL of the video to be uploaded to YouTube"
32 |             },
33 |             "description": {
34 |                 "type": "string",
35 |                 "description": "Description of the video to be uploaded"
36 |             },
37 |             "account_name": {
38 |                 "type": "string",
39 |                 "description": "Name of the YouTube account to use; must be a key in the credentials JSON file"
40 |             }
41 |         }
42 |     },
43 |     "outputPattern": {
44 |         "type": "object",
45 |         "required": [
46 |             "upload_success"
47 |         ],
48 |         "properties": {
49 |             "upload_success": {
50 |                 "type": "boolean",
51 |                 "description": "A boolean flag indicating if the video was successfully uploaded to YouTube"
52 |             }
53 |         }
54 |     },
55 |     "tag": "UploadVideo",
56 |     "testCases": [
57 |         {
58 |             "title": "Sample Video 1",
59 |             "upload": false,
60 |             "category": "Music",
61 |             "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1696843400_daefppdn.mp4",
62 |             "description": "This is a sample video 1 for testing.",
63 |             "account_name": "mindsflow.ai"
64 |         },
65 |         {
66 |             "title": "Sample Video 2",
67 |             "upload": false,
68 |             "category": "Test",
69 |             "video_url": "https://example.com/video2.mp4",
70 |             "description": "This is a sample video 2 for testing.",
71 |             "account_name": ""
72 |         }
73 |     ],
74 |     "aiPrompt": "Upload video to youtube, input are video URL, title, description and category, delete the video after upload. Read youtube credentials from json file. Return succeeded True or False",
75 |     "greeting": ""
76 | }
77 | $"""
78 | 
79 | from youtube_upload.client import YoutubeUploader
80 | import json
81 | import os
82 | import requests
83 | 
84 | category_dict = {
85 |     'Autos & Vehicles': '2',
86 |     'Film & Animation': '1',
87 |     'Music': '10',
88 |     'Pets & Animals': '15',
89 |     'Sports': '17',
90 |     'Short Movies': '18',
91 |     'Travel & Events': '19',
92 |     'Gaming': '20',
93 |     'Videoblogging': '21',
94 |     'People & Blogs': '22',
95 |     'Comedy': '23',
96 |     'Entertainment': '24',
97 |     'News & Politics': '25',
98 |     'Howto & Style': '26',
99 |     'Education': '27',
100 |     'Science & Technology': '28',
101 |     'Nonprofits & Activism': '29',
102 |     'Movies': '30',
103 |     'Anime/Animation': '31',
104 |     'Action/Adventure': '32',
105 |     'Classics': '33',
106 |     'Documentary': '35',
107 |     'Drama': '36',
108 |     'Family': '37',
109 |     'Foreign': '38',
110 |     'Horror': '39',
111 |     'Sci-Fi/Fantasy': '40',
112 |     'Thriller': '41',
113 |     'Shorts': '42',
114 |     'Shows': '43',
115 |     'Trailers': '44'
116 | }
117 | 
118 | credentials_path = 'youtube'
119 | config_file = f'{credentials_path}/api_tokens.json'
120 | 
121 | def download_file(url, new_file_name):
122 |     response = requests.get(url)
123 |     with open(new_file_name, 'wb') as f:
124 |         f.write(response.content)
125 |     return new_file_name
126 | 
127 | def mindsflow_function(event, context) -> dict:
128 |     video_url = event.get("video_url")
129 |     title = event.get("title")
130 |     description = event.get("description")
131 |     category = event.get("category")
132 |     account_name = event.get("account_name")
133 |     upload = event.get("upload", True)
134 | 
135 |     if not upload:
136 |         return {
137 |             'upload_success': False
138 |         }
139 | 
140 |     # download the video
141 |     video_path = download_file(video_url, 'video_youtube.mp4')
142 | 
143 |     with open(config_file, 'r') as json_file:
144 |         data = json.load(json_file)
145 |     account = data[account_name]
146 | 
147 |     # Get the credentials
148 |     refresh_token = account['refresh_token']
149 |     access_token = account['access_token']
150 |     secrets_file = account['secrets_file']
151 | 
152 |     uploader = YoutubeUploader(secrets_file_path=f'{credentials_path}/{secrets_file}')
153 |     uploader.authenticate(refresh_token=refresh_token,
154 |                           access_token=access_token)
155 | 
156 |     # Video options
157 |     options = {
158 |         "title" : title, # The video title
159 |         "description" : description, # The video description
160 |         "tags" : [],
161 |         "categoryId" : category_dict[category],
162 |         "privacyStatus" : "public", # Video privacy. Can either be "public", "private", or "unlisted"
163 |         "kids" : True, # Specifies if the video is for kids or not. Defaults to False.
164 |         #"thumbnailLink" : "https://cdn.havecamerawilltravel.com/photographer/files/2020/01/youtube-logo-new-1068x510.jpg" # Optional. Specifies video thumbnail.
165 |     }
166 | 
167 |     # upload video
168 |     try:
169 |         uploader.upload(video_path, options)
170 |         success = True
171 |     except Exception:
172 |         success = False
173 | 
174 |     os.remove(video_path)
175 | 
176 |     # define result
177 |     result = {
178 |         'upload_success': success
179 |     }
180 | 
181 |     return result
182 | 
183 | 
--------------------------------------------------------------------------------
/results/flow/part1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part1.png
--------------------------------------------------------------------------------
/results/flow/part2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part2.png
--------------------------------------------------------------------------------
/results/flow/part3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part3.png
--------------------------------------------------------------------------------
/results/flow/translation/part1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part1.png
--------------------------------------------------------------------------------
/results/flow/translation/part2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part2.png
--------------------------------------------------------------------------------
/results/flow/translation/part3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part3.png
--------------------------------------------------------------------------------
/results/into_video_transl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/into_video_transl.png
--------------------------------------------------------------------------------
/results/intro.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/intro.jpg
--------------------------------------------------------------------------------
/results/videos/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video1.mp4
--------------------------------------------------------------------------------
/results/videos/video1_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video1_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video2.mp4
--------------------------------------------------------------------------------
/results/videos/video2_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video2_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video3.mp4
--------------------------------------------------------------------------------
/results/videos/video3_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video3_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video4.mp4
--------------------------------------------------------------------------------
/results/videos/video4_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video4_transl.mp4
--------------------------------------------------------------------------------
/video_translation.md:
--------------------------------------------------------------------------------
1 | # AI video translator agent
2 | 
3 | ![AI Video Translator](./results/into_video_transl.png)
4 | 
5 | This project utilizes advanced AI techniques to translate the spoken language in videos from one language to another.
6 | It also incorporates an automatic caption generation system that generates translated subtitles for improved accessibility and understanding.
7 | The agent is hosted on [Mindsflow.ai](https://mindsflow.ai/).
8 | 
9 | ## Features
10 | 
11 | - **Automatically translated speech**: Using [Azure-API](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech), the agent can translate the speech of the original video to a target language.
12 | 
13 | - **Automatically translated captions**: Video speech transcription is first translated with [googletrans](https://pypi.org/project/googletrans/), then translated captions are added to the video with [ffmpeg](https://ffmpeg.org/about.html).
14 | 
15 | - **Automatic upload**: Once the video is ready, the agent can automatically upload it to your favourite social media platform.
16 | 
17 | - **Flow-based programming**: The agent is based on a [flow-based programming](https://en.wikipedia.org/wiki/Flow-based_programming) model to assemble different AI and algorithmic components into a complete video. The flow is developed and hosted on [Mindsflow.ai](https://mindsflow.ai/). All the blocks of the flow are available [here](agent-video-generator/functions).
18 | 
19 | **Note**: running this agent requires an [Azure API](https://azure.microsoft.com/en-us/products/api-management/?ef_id=_k_Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB_k_&OCID=AIDcmmy6frl1tq_SEM__k_Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB_k_&gad_source=1&gclid=Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB) key.
20 | 
21 | ## Results
22 | 
23 | The following table links to some generated samples.
24 | 
25 | | Original | Translation (Chinese) |
26 | |--------------------------------------------------------------|------------------------------------------------------|
27 | | https://youtube.com/shorts/wKOBppgV2R0?feature=share | https://youtube.com/shorts/Zbk02MsAoko?feature=share |
28 | | https://youtube.com/shorts/6dgCNjVMBpM?feature=share | https://youtube.com/shorts/wbZbhVQmpNw?feature=share |
29 | | https://youtube.com/shorts/wKOBppgV2R0?feature=share | https://youtube.com/shorts/GCIVzEHeeqM?feature=share |
30 | | https://youtu.be/ERfDdZq9ve8 | https://youtu.be/a1tKgG0UP6A |
31 | 
32 | More results are available on [Douyin](https://www.douyin.com/user/MS4wLjABAAAAnDmwuk2SS4WBc8swBbYhtbGpH1Mrp3nlHrTnMcyDJdW5RUsr4BCajyo716Wyc76L?is_search=0&list_name=follow&nt=0).
33 | 
34 | ## Flow
35 | 
36 | | Part 1 | Part 2 | Part 3 |
37 | |---------------------------------------------------|-------------------------------------|-------------------------------------|
38 | | ![Alt text](./results/flow/translation/part1.png) | ![Alt text](./results/flow/translation/part2.png) | ![Alt text](./results/flow/translation/part3.png) |
39 | 
40 | For more details, see the full images [here](./results/flow/translation/).
41 | 
42 | ## Input format
43 | 
44 | ```
45 | {
46 |     "voice": "zh-CN-YunfengNeural",  # (full list of voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts)
47 |     "video_url": "video_to_translate.mp4",  # link to the video to be translated. It must point to an mp4 video file stored online.
48 |     "source_language": "en",  # original language of the video
49 |     "target_language": "zh",  # language to translate the video into
50 |     "account_name": "mindsflow.ai",  # account name, only needed if you want the video to be automatically uploaded to your platform
51 |     "upload": false,  # whether to upload the video on social media
52 | }
53 | ```
54 | 
55 | ## Output format
56 | 
57 | The output of the agent is structured in the following way (a minimal request sketch is given at the end of this document):
58 | 
59 | ```
60 | {
61 |     "result": "link to result"
62 | }
63 | ```
64 | In this output, `result` is a link pointing to a ZIP file. This ZIP file contains:
65 | 
66 | - The generated video in mp4 format
67 | - The original video in mp4 format
68 | - The original subtitles in SRT text format
69 | - The translated subtitles in SRT text format
70 | 
71 | ## Extra
72 | 
73 | Try out more AI agents at [https://chat.mindsflow.ai/en-US/explore](https://chat.mindsflow.ai/en-US/explore).
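74 | 
75 | ## Example request
76 | 
77 | The snippet below is a minimal sketch of how the input payload described above might be sent to the agent. The endpoint URL and the authorization header are hypothetical placeholders: the exact invocation details depend on how the flow is deployed on [Mindsflow.ai](https://mindsflow.ai/).
78 | 
79 | ```
80 | import requests
81 | 
82 | # Hypothetical endpoint and key: replace with the values of your deployed flow
83 | AGENT_ENDPOINT = "https://api.mindsflow.ai/agents/video-translator/run"
84 | API_KEY = "<your-api-key>"
85 | 
86 | # Mirrors the input format documented above
87 | payload = {
88 |     "voice": "zh-CN-YunfengNeural",
89 |     "video_url": "https://example.com/video_to_translate.mp4",
90 |     "source_language": "en",
91 |     "target_language": "zh",
92 |     "account_name": "mindsflow.ai",
93 |     "upload": False,
94 | }
95 | 
96 | # Video translation can take a while, hence the generous timeout
97 | response = requests.post(
98 |     AGENT_ENDPOINT,
99 |     json=payload,
100 |     headers={"Authorization": f"Bearer {API_KEY}"},
101 |     timeout=600,
102 | )
103 | response.raise_for_status()
104 | 
105 | # The response contains a link to a ZIP file with the translated video and subtitles
106 | print(response.json()["result"])
107 | ```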
-------------------------------------------------------------------------------- /voice_clone/functions/clone_voice_vits.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import boto3 4 | import requests 5 | import random 6 | import string 7 | 8 | default_train_config = 'config_1000' 9 | 10 | def mindsflow_function(event, context) -> dict: 11 | # get from event 12 | dataset_url = event.get('dataset_url') 13 | config = event.get('train_config', default_train_config) 14 | split = event.get('audio_split', 12) 15 | clean_noise = event.get('clean_noise', False) 16 | voice = event.get('voice', None) 17 | api_ip = os.environ.get('api_ip') 18 | 19 | if config is None or len(config) == 0: 20 | config = default_train_config 21 | if voice is not None and len(voice) == 0: 22 | voice = None 23 | 24 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/" 25 | 26 | data = { 27 | "dataset_url": dataset_url, 28 | "config": config, 29 | "split": split, 30 | "clean_noise": clean_noise 31 | } 32 | 33 | headers = { 34 | 'Content-Type': 'application/json' 35 | } 36 | 37 | print('Cloning voice...') 38 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers) 39 | if response.status_code != 200: 40 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}') 41 | print('Voice cloned') 42 | 43 | response_dict = response.json() 44 | 45 | return { 46 | "succeeded": response_dict["succeeded"], 47 | "voice": response_dict["voice"] if voice is None else voice 48 | } 49 | -------------------------------------------------------------------------------- /voice_clone/functions/generate_voice_vits.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import boto3 4 | import requests 5 | import random 6 | import string 7 | 8 | s3 = boto3.resource('s3') 9 | 10 | def download_file(url: str, save_path: str): 11 | resp = requests.get(url) 12 | with open(save_path, 'wb') as f: 13 | f.write(resp.content) 14 | 15 | 16 | def generate_random_string(length): 17 | letters = string.ascii_letters 18 | result_str = ''.join(random.choice(letters) for i in range(length)) 19 | return result_str 20 | 21 | 22 | def upload_to_aws(filename: str) -> str: 23 | bucket_name = os.environ.get('bucket_name') 24 | region = os.environ.get('region') 25 | session = boto3.Session( 26 | aws_access_key_id=os.environ.get('access_key_id'), 27 | aws_secret_access_key=os.environ.get('secret_access_key') 28 | ) 29 | s3_client = session.client('s3') 30 | bucket_path = 'voice-clone' 31 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}") 32 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/' 33 | url = f'{s3_base_url}{bucket_path}/{filename}' 34 | return url 35 | 36 | 37 | def mindsflow_function(event, context) -> dict: 38 | # get from event 39 | audio_url = event.get('audio_url') 40 | voice= event.get('voice') 41 | clean_noise = event.get('clean_noise') 42 | api_ip = os.environ.get('api_ip') 43 | 44 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/" 45 | 46 | data = { 47 | "audio_url": audio_url, 48 | "voice": voice, 49 | "clean_noise": clean_noise 50 | } 51 | 52 | headers = { 53 | 'Content-Type': 'application/json' 54 | } 55 | 56 | print('Generating voice...') 57 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers) 58 | if response.status_code != 200: 59 | raise RuntimeError(f'Voice cloning failed 
with status code: {response.status_code}')
60 |     print('Voice generated')
61 | 
62 |     audio_path = voice + '_' + audio_url.split('/')[-1]
63 |     # Save the file to the directory
64 |     with open(audio_path, 'wb') as file:
65 |         file.write(response.content)
66 | 
67 |     result_url = upload_to_aws(audio_path)
68 | 
69 |     # clean up
70 |     os.remove(audio_path)
71 | 
72 |     return {
73 |         "audio_url": result_url
74 |     }
75 | 
--------------------------------------------------------------------------------
/voice_clone/functions/set_epoch_in_json_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import boto3
4 | 
5 | s3 = boto3.resource('s3')
6 | 
7 | def upload_to_aws(filename: str) -> str:
8 |     bucket_name = os.environ.get('bucket_name')
9 |     region = os.environ.get('region')
10 |     session = boto3.Session(
11 |         aws_access_key_id=os.environ.get('access_key_id'),
12 |         aws_secret_access_key=os.environ.get('secret_access_key')
13 |     )
14 |     s3_client = session.client('s3')
15 |     bucket_path = 'voice-clone'
16 |     s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
17 |     s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
18 |     url = f'{s3_base_url}{bucket_path}/{filename}'
19 |     return url
20 | 
21 | def modify_epochs(file_path: str, new_epoch: int) -> str:
22 |     # load the config, update the epoch count and save it as a new config file
23 |     with open(file_path, 'r') as json_file:
24 |         data = json.load(json_file)
25 |     data['train']['epochs'] = new_epoch
26 | 
27 |     new_file_name = f'config_{new_epoch}.json'
28 |     new_file_path = os.path.join(os.path.dirname(file_path), new_file_name)
29 |     with open(new_file_path, 'w') as new_file:
30 |         json.dump(data, new_file, indent=4)
31 | 
32 |     return new_file_path
33 | 
34 | def mindsflow_function(event, context) -> dict:
35 |     # extract parameters from event
36 |     file_path = 'train_configs/config.json'
37 |     new_epoch = event.get("epochs")
38 | 
39 |     # modify the epochs in JSON file
40 |     new_file_path = modify_epochs(file_path, new_epoch)
41 | 
42 |     url = upload_to_aws(new_file_path)
43 | 
44 |     os.remove(new_file_path)
45 | 
46 |     # formulate the result
47 |     result = {
48 |         'config_url': url
49 |     }
50 | 
51 |     return result
52 | 
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/functions.py:
--------------------------------------------------------------------------------
1 | from pydub import AudioSegment
2 | import os
3 | from scipy.io import wavfile
4 | import noisereduce as nr
5 | import numpy as np
6 | import wave
7 | import requests
8 | 
9 | 
10 | def is_stereo(filename):
11 |     with wave.open(filename, 'rb') as wav_file:
12 |         channels = wav_file.getnchannels()
13 | 
14 |     if channels == 2:
15 |         return True
16 |     else:
17 |         return False
18 | 
19 | 
20 | def reduce_noise(file_name):
21 |     if file_name.split('.')[-1] != 'wav':
22 |         file_name = convert_mp3_to_wav(file_name, remove_original=True)
23 |     rate, data = wavfile.read(file_name)
24 |     if is_stereo(file_name):
25 |         # from https://github.com/timsainb/noisereduce/issues/57
26 |         data1 = data[:, 0]
27 |         data2 = data[:, 1]
28 |         # perform noise reduction on each channel separately
29 |         reduced_noise1 = nr.reduce_noise(y=data1, sr=rate)
30 |         reduced_noise2 = nr.reduce_noise(y=data2, sr=rate)
31 |         reduced_noise = np.stack((reduced_noise1, reduced_noise2), axis=1)
32 |     else:
33 |         reduced_noise = nr.reduce_noise(y=data, sr=rate)
34 |     wavfile.write(file_name, rate, reduced_noise)
35 |     return file_name
36 | 
37 | 
38 | def split_audio(input_file, duration):
39 |     # Load audio file
40 |     audio = AudioSegment.from_file(input_file)
41 | 
42 |     # Length of audio file
43 |     length_audio = len(audio)
44 | 
45 |     # Split audio file into chunks of 'duration'
46 |     chunks = [audio[i:i+duration*1000] for i in range(0, length_audio, duration*1000)]
47 | 
48 |     # Save chunks in the same folder as the original file
49 |     for i, chunk in enumerate(chunks):
50 |         chunk_name = f'{input_file[:-4]}_chunk_{i}.wav'
51 |         print(f'Created {chunk_name}')
52 |         chunk.export(chunk_name, format='wav')
53 | 
54 | 
55 | def convert_mp3_to_wav(file_path, remove_original=True):
56 |     audio = AudioSegment.from_mp3(file_path)
57 |     output_path = change_file_extension(file_path, 'wav')
58 |     audio.export(output_path, format="wav")
59 |     if remove_original:
60 |         os.remove(file_path)
61 |     return output_path
62 | 
63 | 
64 | def change_file_extension(filename, new_extension):
65 |     # Get the file name without the old extension
66 |     base = os.path.splitext(filename)[0]
67 |     # Return the file name with the new extension
68 |     return base + '.' + new_extension
69 | 
70 | 
71 | def download_file(url, filename):
72 |     r = requests.get(url, allow_redirects=True)
73 |     with open(filename, 'wb') as f:
74 |         f.write(r.content)
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/infer.py:
--------------------------------------------------------------------------------
1 | from functions import *
2 | 
3 | 
4 | audio_path = '../results'
5 | dataset_raw = 'dataset_raw'
6 | logs = 'logs'
7 | 
8 | # reference: https://github.com/svc-develop-team/so-vits-svc
9 | 
10 | 
11 | def infer(audio_url, dataset_name, config, clean_noise=False):
12 |     os.system(f"wget {audio_url}")
13 |     audio_name = audio_url.split('/')[-1].split('.')[0]
14 |     audio_name_with_ext = audio_url.split('/')[-1]
15 |     ext = audio_name_with_ext.split('.')[-1]
16 |     if ext == 'mp3':
17 |         audio_name_with_ext = convert_mp3_to_wav(audio_name_with_ext)
18 |     if not os.path.exists(f"{audio_path}"):
19 |         os.system(f"mkdir {audio_path}")
20 |     if clean_noise:
21 |         audio_name_with_ext = reduce_noise(audio_name_with_ext)
22 |     os.system(f"mv {audio_name_with_ext} {audio_path}")
23 |     os.system(f"svc infer {audio_path}/{audio_name_with_ext} -m {logs}/{dataset_name}/ -c {logs}/{config}.json")
24 |     os.system(f"rm {audio_path}/{audio_name_with_ext}")
25 |     #os.system(f"mv {audio_name}.out.wav {audio_path}")
26 |     return f"{audio_path}/{audio_name}.out.wav"
27 | 
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/infer_api.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File, HTTPException
2 | import os
3 | from fastapi.responses import FileResponse
4 | import argparse
5 | import logging
6 | import uvicorn
7 | from infer import *
8 | 
9 | app = FastAPI()
10 | dataset_raw = 'dataset_raw'
11 | logs = 'logs'
12 | config = "config_1000"
13 | 
14 | 
15 | @app.post("/generate_voice")
16 | async def generate_audio_file(data: dict):
17 |     print('Received audio generation request data: ', data)
18 |     dataset = data['voice']
19 |     audio_url = data['audio_url']
20 |     clean_noise = data['clean_noise']
21 | 
22 |     try:
23 |         generated_audio = infer(audio_url, dataset, config, clean_noise=clean_noise)
24 |     except Exception as e:
25 |         raise HTTPException(status_code=500, detail=str(e))
26 | 
27 |     response = FileResponse(generated_audio, filename=generated_audio)
28 |     return response
29 | 
30 | 
31 | # python3 infer_api.py --port 5000
32 | if __name__ == "__main__":
33 |     # Setting up argument parsing
34 |     parser = argparse.ArgumentParser()
35 |     parser.add_argument("--port", "-p", type=int, default=5000, help="Port to run server on, default is 5000.")
36 |     args = parser.parse_args()
37 | 
38 |     logging.basicConfig(level=logging.INFO)
39 |     uvicorn.run(app, host="0.0.0.0", port=args.port)
40 | 
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/test_api/run_make_prompt.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | 
4 | # Specify the API endpoint
5 | url = "http://IP/voice_clone/"
6 | 
7 | # Specify the data payload
8 | data = {
9 |     "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav",
10 |     "character_name": "xxx"
11 | }
12 | 
13 | headers = {
14 |     'Content-Type': 'application/json'
15 | }
16 | 
17 | response = requests.post(url, data=json.dumps(data), headers=headers)
18 | 
19 | if response.status_code == 200:
20 |     print(f"Response from server: {response.json()}")
21 | else:
22 |     print(f"Failed to get response. Status code: {response.status_code}")
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/test_api/run_voice_clone.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import os
3 | import json
4 | 
5 | data = {
6 |     'character_name': 'elon_musk',
7 |     'text': 'SpaceX aims to make humanity a multiplanetary species.'
8 | }
9 | 
10 | response = requests.post('http://IP/generate_audio/', json=data)
11 | 
12 | if response.status_code == 200:
13 |     filename = 'result.wav'
14 |     folder_name = 'results'
15 |     if not os.path.exists(folder_name):
16 |         os.makedirs(folder_name)
17 |     file_path = os.path.join(folder_name, filename)
18 | 
19 |     # Save the file to the directory
20 |     with open(file_path, 'wb') as file:
21 |         file.write(response.content)
22 |     print(f'File saved at {file_path}')
23 | else:
24 |     print('Failed to get response from the server.')
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pydub import AudioSegment
3 | from functions import split_audio, reduce_noise, download_file
4 | import argparse
5 | 
6 | 
7 | dataset_raw = 'dataset_raw'
8 | logs = 'logs'
9 | 
10 | # reference: https://github.com/svc-develop-team/so-vits-svc
11 | 
12 | 
13 | def preprocess(dataset_path: str, split_threshold: int = 12, f0_method='dio', clean_noise=False):
14 |     # download the dataset zip from its url
15 |     if not os.path.exists(dataset_raw):
16 |         os.mkdir(dataset_raw)
17 |     os.system(f"wget {dataset_path}")
18 |     dataset_name = dataset_path.split('/')[-1].split('.')[0]
19 |     print('Dataset name:', dataset_name)
20 |     dataset_dir = os.path.join(dataset_raw, dataset_name)
21 |     if not os.path.exists(dataset_dir):
22 |         os.mkdir(dataset_dir)
23 |         print('Created dir:', dataset_dir)
24 |     os.system(f"mv {dataset_name}.zip {dataset_raw}")
25 |     os.system(f"unzip {dataset_raw}/{dataset_name}.zip -d {dataset_raw}")
26 |     # after unzipping, if the archive contains a single wav file, create a folder in {dataset_raw}
27 |     # named after the file and move the wav file to that folder
28 |     files = [f for f in os.listdir(dataset_raw) if f.endswith('.wav')]
29 |     if len(files) == 1:
30 |         single_file = files[0]
31 |         file_name, file_ext = os.path.splitext(single_file)
32 |         new_folder_path = f"{dataset_raw}/{file_name}"
33 |         os.system(f"mkdir {new_folder_path}")
34 |         os.system(f"mv {dataset_raw}/{single_file} {new_folder_path}")
35 | 
36 |     os.system(f"rm {dataset_raw}/{dataset_name}.zip")
37 | 
38 |     if os.path.exists(os.path.join(dataset_raw, '__MACOSX')):
39 |         os.system(f"rm -rf {dataset_raw}/__MACOSX")
40 |     if os.path.exists(os.path.join('dataset/44k', '__MACOSX')):
41 |         os.system(f"rm -rf dataset/44k/__MACOSX")
42 | 
43 |     for root, _, files in os.walk(dataset_dir):
44 |         for name in files:
45 |             filename = os.path.join(root, name)
46 |             if filename.endswith((".mp3", ".wav")):
47 | 
48 |                 if clean_noise:
49 |                     filename = reduce_noise(filename)
50 | 
51 |                 # Split long audio file into smaller chunks
52 |                 audio = AudioSegment.from_file(filename)
53 |                 duration_seconds = len(audio) / 1000  # duration in seconds
54 |                 if duration_seconds > split_threshold:
55 |                     split_audio(filename, split_threshold)
56 |                     os.remove(filename)
57 |                     print(f'Removed {filename}')
58 | 
59 |     os.system("svc pre-resample")
60 |     os.system("svc pre-config")
61 |     os.system(f"svc pre-hubert -fm {f0_method}")
62 |     return dataset_name
63 | 
64 | 
65 | def train(dataset_name, config):
66 |     if not os.path.exists(logs):
67 |         os.mkdir(logs)
68 |     if 'http' in config:
69 |         download_file(config, f"{logs}/custom_config.json")
70 |         config = f"{logs}/custom_config.json"
71 |     else:
72 |         if '.json' in config:
73 |             config = config.split('.json')[0]
74 |         config = f'{logs}/{config}.json'
75 |     if os.path.exists(f"{logs}/{dataset_name}"):
76 |         os.system(f"rm -rf {logs}/{dataset_name}")
77 |     if not os.path.exists(f"{logs}/{dataset_name}"):
78 |         os.mkdir(f"{logs}/{dataset_name}")
79 |     os.system(f"svc train --model-path {logs}/{dataset_name} --config-path {config}")
80 |     # svc train --model-path logs/davide_en --config-path logs/config_100.json
81 | 
82 | 
83 | def clean():
84 |     os.system(f"rm -rf {dataset_raw}")
85 |     os.system("rm -rf dataset")
86 |     os.system("rm -rf filelists")
87 | 
88 | 
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/train_api.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File, HTTPException
2 | import argparse
3 | import logging
4 | import uvicorn
5 | from train import *
6 | 
7 | app = FastAPI()
8 | dataset_raw = 'dataset_raw'
9 | logs = 'logs'
10 | 
11 | 
12 | @app.post("/voice_clone")
13 | async def generate_audio_file(data: dict):
14 |     print('Received audio generation request data: ', data)
15 |     dataset = data['dataset_url']
16 |     split = data['split']
17 |     config = data['config']
18 |     clean_noise = data['clean_noise']
19 |     try:
20 |         dataset_name = preprocess(dataset, split_threshold=split, clean_noise=clean_noise)
21 |         train(dataset_name, config)
22 |         clean()
23 |     except Exception as e:
24 |         raise HTTPException(status_code=500, detail=str(e))
25 | 
26 |     return {
27 |         "succeeded": True,
28 |         "voice": dataset_name,
29 |     }
30 | 
31 | 
32 | # python3 train_api.py --port 5000
33 | if __name__ == "__main__":
34 |     # Setting up argument parsing
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument("--port", "-p", type=int, default=5000, help="Port to run server on, default is 5000.")
37 |     args = parser.parse_args()
38 | 
39 |     logging.basicConfig(level=logging.INFO)
40 |     uvicorn.run(app, host="0.0.0.0", port=args.port)
41 | 
--------------------------------------------------------------------------------