├── .gitignore
├── README.md
├── agent-video-generator
│   └── functions
│       ├── AddCaptionsToVideoFFMPEG.py
│       ├── AddCaptionsToVideoMoviepy.py
│       ├── AddCaptionsToVideoOpenCV.py
│       ├── AudioTranscriptionToSentences.py
│       ├── CommandsExecution.py
│       ├── ConvertSrtToAss.py
│       ├── MindsflowAgent.py
│       ├── MusicGeneration.py
│       ├── PromptImagesToVideo.py
│       ├── ShowFonts.py
│       ├── UploadResultZipS3.py
│       ├── addAudioSegmentsToVideo.py
│       ├── addSoundToVideo.py
│       ├── addTextToImage.py
│       ├── cloneVoiceValleX.py
│       ├── cloneVoiceVits.py
│       ├── deleteFilesByExtension.py
│       ├── deleteFolders.py
│       ├── extractVideoAudioComponents.py
│       ├── generateAudioSegmentsFromJson.py
│       ├── generateSrtFromJson.py
│       ├── generateVideoScript.py
│       ├── generateVoiceVits.py
│       ├── loadJsonAndReturnKeys.py
│       ├── preprocessTrainData.py
│       ├── returnInputParameters.py
│       ├── setEpochInJsonFile.py
│       ├── splitVoiceMusic.py
│       ├── textToSpeech.py
│       ├── transcribeAudio.py
│       ├── translateCaptionsJson.py
│       ├── translateSrtFile.py
│       ├── translateTargetToSource.py
│       └── uploadYoutubeVideo.py
├── results
│   ├── flow
│   │   ├── part1.png
│   │   ├── part2.png
│   │   ├── part3.png
│   │   └── translation
│   │       ├── part1.png
│   │       ├── part2.png
│   │       └── part3.png
│   ├── into_video_transl.png
│   ├── intro.jpg
│   └── videos
│       ├── video1.mp4
│       ├── video1_transl.mp4
│       ├── video2.mp4
│       ├── video2_transl.mp4
│       ├── video3.mp4
│       ├── video3_transl.mp4
│       ├── video4.mp4
│       └── video4_transl.mp4
├── video_translation.md
└── voice_clone
    ├── functions
    │   ├── clone_voice_vits.py
    │   ├── generate_voice_vits.py
    │   └── set_epoch_in_json_config.py
    └── voice_clone_api
        ├── functions.py
        ├── infer.py
        ├── infer_api.py
        ├── test_api
        │   ├── run_make_prompt.py
        │   └── run_voice_clone.py
        ├── train.py
        └── train_api.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | secrets/
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | .idea/
163 | *.DS_Store
164 |
165 | replicate_models/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI video generator agent
2 |
3 | 
4 |
5 | This AI agent uses generative AI to automatically create short videos and post them on social platforms.
6 | It integrates several AI capabilities, such as script generation, image generation, music generation, speech generation, automatic captioning, special effects, automatic upload, and video composition, to produce engaging, high-quality videos.
7 | The agent is hosted on [Mindsflow.ai](https://mindsflow.ai/).
8 |
9 | ## Features
10 |
11 | - **Script Generation**: Uses [GPT-4](https://openai.com/gpt-4) to generate compelling scripts for your videos.
12 |
13 | - **Image Generation**: Based on the script, it generates relevant and visually appealing frames using [StableDiffusionXL](https://replicate.com/stability-ai/sdxl).
14 |
15 | - **Music Generation**: The system can create original, fitting background music to enhance the mood and tone of the video. It leverages a [music generation](https://replicate.com/meta/musicgen) model.
16 |
17 | - **Speech Generation**: Using the [Azure text-to-speech API](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech), the agent can also generate human-like narration for the scripts. It supports multiple languages.
18 |
19 | - **Automatic Captioning**: This feature ensures accessibility by providing accurate captions for the generated speech. Captions are generated with [ffmpeg](https://ffmpeg.org/about.html).
20 |
21 | - **Special Effects**: The agent can apply various special effects to the video, using [moviepy](https://pypi.org/project/moviepy/), to make it more engaging.
22 |
23 | - **Video Composition**: The agent is based on a [flow-based programming](https://en.wikipedia.org/wiki/Flow-based_programming) model to assemble different AI and algorithmic components into a complete video. The flow is developed and hosted on [Mindsflow.ai](https://mindsflow.ai/). All the blocks of the flow are available [here](agent-video-generator/functions).
24 |
25 | - **Automatic upload**: Once the video is ready, the agent can automatically upload it to your favourite social media platform.
26 |
27 | **Note**: running this agent requires an OpenAI key, a [Replicate](https://replicate.com/explore) key, and an Azure API key.
28 |
29 | ## Results
30 |
31 | You can check out some sample videos at the following links:
32 |
33 | 1. [https://www.instagram.com/inspiration_daily_tales/](https://www.instagram.com/inspiration_daily_tales/)
34 |
35 | 2. [https://www.tiktok.com/@inspiration_tales_daily](https://www.tiktok.com/@inspiration_tales_daily)
36 |
37 | These samples provide a glimpse of what the video-generator agent is capable of. Happy viewing!
38 |
39 | ## Flow
40 |
41 | | Part 1 | Part 2 | Part 3 |
42 | |-------------------------------------|-------------------------------------|-------------------------------------|
43 | |  |  |  |
44 |
45 | For more details, see the full images [here](./results/flow/).
46 |
47 | ## Input format
48 |
49 | ```
50 | {
51 |     "topic": "topic of the video",  # example: benefits of eating mango
52 |     "language": "en",  # narration language
53 |     "speaker": "en-US-GuyNeural",  # full list of voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
54 |     "voice_speed": 1,  # <1 slower voice, >1 faster voice
55 |     "font_size": 30,  # font size in pixels
56 |     "font_name": "SourceSerif4",  # font type
57 |     "font_size_title": 50,  # title font size in pixels
58 |     "text_color": "white",  # subtitles color
59 |     "text_bg_color": "",  # subtitles bg color, "" or "none" means no bg color
60 |     "text_bg_opacity": 0,  # subtitles bg opacity, in [0-1], 0->transparent, 1->opaque
61 |     "text_border_color": "none",  # subtitles border color
62 |     "text_border_size": 0,  # subtitles border size, 0 means no border
63 |     "caption_position": "center",  # center, top, bottom
64 |     "height": 1024,  # video height in pixels
65 |     "width": 576,  # video width in pixels
66 |     "fps": 16,  # video fps
67 |     "image_model": "sdxl",  # model to generate frames: sd or sdxl
68 |     "music_volume": 0.5,  # volume of bg music, in [0-1], 0->no bg music
69 |     "transition_time": 1,  # frame transition time, 0->instant transition
70 |     "zoom": 1.1,  # frame zoom in/out strength, 1->no zoom
71 |     "account_name": "mindsflow.ai",  # account name, used only if the video should be uploaded automatically
72 |     "upload": false,  # whether to upload the video to social media
73 |     "image_duration": 6  # duration of each image
74 | }
75 | ```
76 |
77 | **Note**: The only compulsory field is "topic". All other fields, if left unspecified, are set to the default values shown above.
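
For example, the smallest valid input is:

```
{
    "topic": "benefits of eating mango"
}
```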
78 |
79 | ## Output format
80 |
81 | The output of the agent is structured in the following way:
82 |
83 | ```
84 | {
85 | "result": "link to result"
86 | }
87 | ```
88 | In this output, `result` is a link pointing to a ZIP file, which contains:
89 |
90 | - The generated video in mp4 format
91 | - A thumbnail image for the video
92 | - The video script in text format
93 | - The captions file in srt format
94 |
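As a reference, here is a minimal sketch (not part of the agent itself) for fetching and unpacking the result ZIP in Python, assuming the `result` link is directly downloadable:

```
import io
import zipfile

import requests

def fetch_result(result_url: str, out_dir: str = "video_result") -> None:
    # Download the ZIP returned by the agent and extract its contents
    # (video, thumbnail, script, captions) into out_dir.
    response = requests.get(result_url)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(out_dir)
```
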
95 | ## Extra
96 |
97 | Try out more AI agents at [https://chat.mindsflow.ai/en-US/explore](https://chat.mindsflow.ai/en-US/explore).
--------------------------------------------------------------------------------
/agent-video-generator/functions/AddCaptionsToVideoFFMPEG.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "AddCaptionsToVideoFFMPEG",
4 | "displayName": "",
5 | "description": "This method receives an SRT or ASS subtitle file path and an MP4 video file path as inputs. Using the FFmpeg library, it integrates the subtitle file with the video and outputs the path of the combined video. It does not operate in command-line mode",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "video_url",
10 | "captions_url"
11 | ],
12 | "properties": {
13 | "video_url": {
14 | "type": "string",
15 | "description": "Path to the MP4 video file."
16 | },
17 | "captions_url": {
18 | "type": "string",
19 | "description": "Path to the ASS (Advanced SubStation Alpha) subtitle file."
20 | }
21 | }
22 | },
23 | "outputPattern": {
24 | "type": "object",
25 | "required": [
26 | "video_url"
27 | ],
28 | "properties": {
29 | "video_url": {
30 | "type": "string",
31 | "description": "Path of the video file after merging with subtitles"
32 | }
33 | }
34 | },
35 | "tag": "VideoCaptions",
36 | "testCases": [
37 | {
38 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1701831655_ypexkwiz.mp4",
39 | "captions_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/WQ7TEooutput_file.ass"
40 | }
41 | ],
42 | "aiPrompt": "AddCaptionsToVideoFFMPEG",
43 | "greeting": ""
44 | }
45 | $"""
46 |
47 | import os
48 | import ffmpeg
49 | import requests
50 | import boto3
51 | import random, string
52 | import subprocess
53 |
54 | def download_file(url, filename):
55 | response = requests.get(url)
56 | file = open(filename, 'wb')
57 | file.write(response.content)
58 | file.close()
59 |
60 | s3_client = boto3.client('s3')
61 |
62 | def upload_to_aws(filename: str) -> str:
63 | # Uses your AWS credentials to access the service
64 | bucket_name = os.environ.get('bucket_name')
65 | region = os.environ.get('region')
66 | # Create a session using the provided credentials
67 | session = boto3.Session(
68 | aws_access_key_id=os.environ.get('access_key_id'),
69 | aws_secret_access_key=os.environ.get('secret_access_key')
70 | )
71 | # Create an S3 client
72 | s3_client = session.client('s3')
73 | bucket_path = 'ai-video'
74 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
75 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
76 | url = f'{s3_base_url}{bucket_path}/{filename}'
77 | return url
78 |
79 | def merge_subtitle_and_video(subtitle_path: str, mp4_path: str, output_path: str):
80 | # determine file type from extension
81 | _, file_extension = os.path.splitext(subtitle_path)
82 |
83 | if file_extension.lower() == ".srt":
84 | ffmpeg.input(mp4_path).output(output_path, vf='subtitles=' + subtitle_path).run(overwrite_output=True)
85 | elif file_extension.lower() == ".ass":
86 | command = f"ffmpeg -i {mp4_path} -vf 'ass={subtitle_path}' {output_path}"
87 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True)
88 | output = process.stdout
89 | else:
90 | print(f"Unsupported subtitle file type: {file_extension}")
91 |
92 | def mindsflow_function(event, context) -> dict:
93 | # get the srt path from the event
94 | caption_url = event.get("captions_url")
95 | # get the mp4 path from the event
96 | video_url = event.get("video_url")
97 |
98 |     command = 'apt install -y ffmpeg'  # -y avoids the interactive confirmation prompt in non-TTY environments
99 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True)
100 |
101 | mp4_path = video_url.split('/')[-1]
102 | caption_path = caption_url.split('/')[-1]
103 | download_file(video_url, mp4_path)
104 | download_file(caption_url, caption_path)
105 |
106 | # Set output path
107 | output_path = "video_with_captions_{}.mp4".format(''.join(random.choices(string.ascii_letters + string.digits, k=5)))
108 |
109 | # Merge the srt and mp4 files
110 |     merge_subtitle_and_video(caption_path, mp4_path, output_path)
111 |
112 | upload_url = upload_to_aws(output_path)
113 | os.remove(output_path)
114 |
115 | # define result
116 | result = {
117 | 'video_url': upload_url
118 | }
119 |
120 | return result
121 |
122 |
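A minimal sketch of a shell-injection-safe alternative to the ASS branch above (hypothetical helper, not part of this file; assumes ffmpeg is on PATH):

```
import subprocess

def merge_ass_safely(mp4_path: str, subtitle_path: str, output_path: str) -> None:
    # Passing the arguments as a list avoids shell=True, so paths containing
    # spaces or quotes cannot break or inject into the command line.
    subprocess.run(
        ["ffmpeg", "-y", "-i", mp4_path, "-vf", f"ass={subtitle_path}", output_path],
        check=True,
    )
```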
--------------------------------------------------------------------------------
/agent-video-generator/functions/AddCaptionsToVideoMoviepy.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "AddCaptionsToVideoMoviepy",
4 | "displayName": "",
5 | "description": "Add captions to video with moviepy",
6 | "inputPattern": {
7 | "type": "object",
8 | "properties": {
9 | "font_name": {
10 | "type": "string",
11 | "description": ""
12 | },
13 | "font_size": {
14 | "type": "number",
15 | "description": ""
16 | },
17 | "video_url": {
18 | "type": "string",
19 | "description": ""
20 | },
21 | "text_color": {
22 | "type": "string",
23 | "description": ""
24 | },
25 | "caption_url": {
26 | "type": "string",
27 | "description": ""
28 | },
29 | "text_bg_color": {
30 | "type": "string",
31 | "description": ""
32 | },
33 | "highlight_color": {
34 | "type": "string",
35 | "description": ""
36 | },
37 | "text_bg_opacity": {
38 | "type": "number",
39 | "description": ""
40 | },
41 | "caption_position": {
42 | "type": "string",
43 | "description": ""
44 | },
45 | "text_border_size": {
46 | "type": "number",
47 | "description": ""
48 | },
49 | "text_border_color": {
50 | "type": "string",
51 | "description": ""
52 | }
53 | },
54 | "required": [
55 | "video_url",
56 | "caption_url"
57 | ]
58 | },
59 | "outputPattern": {
60 | "type": "object",
61 | "properties": {
62 | "video_url": {
63 | "type": "string",
64 | "description": ""
65 | }
66 | },
67 | "required": [
68 | "video_url"
69 | ]
70 | },
71 | "tag": "VideoCaptions",
72 | "testCases": [
73 | {
74 | "font_name": "Heebo",
75 | "font_size": 30,
76 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/teacher_comic.mp4",
77 | "text_color": "white",
78 | "caption_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/test.srt",
79 | "text_bg_color": "black",
80 | "highlight_color": "yellow",
81 | "text_bg_opacity": 0.5,
82 | "caption_position": "bottom",
83 | "text_border_size": 0,
84 | "text_border_color": ""
85 | }
86 | ],
87 | "aiPrompt": "",
88 | "greeting": ""
89 | }
90 | $"""
91 |
92 | import pysrt
93 | from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip, ColorClip, concatenate_videoclips
94 | import os
95 | import requests
96 | import boto3
97 | import random, string
98 | import ast
99 |
100 | font_dir = os.environ['font_dir']
101 |
102 | color_dict = {
103 | 'red': (255, 0, 0),
104 | 'blue': (0, 0, 255),
105 | 'green': (0, 255, 0),
106 | 'white': (255, 255, 255),
107 | 'black': (0, 0, 0),
108 | 'yellow': (255, 255, 0),
109 | 'cyan': (0, 255, 255),
110 | 'magenta': (255, 0, 255),
111 | 'grey': (128, 128, 128),
112 | 'pink': (255, 192, 203),
113 | 'purple': (128, 0, 128),
114 | 'orange': (255, 165, 0),
115 | 'brown': (165, 42, 42)
116 | }
117 |
118 | def download_file(url, filename):
119 | response = requests.get(url)
120 | file = open(filename, 'wb')
121 | file.write(response.content)
122 | file.close()
123 |
124 | def random_color():
125 | return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
126 |
127 | def rgb_to_hex(rgb):
128 | return "#{:02x}{:02x}{:02x}".format(*rgb)
129 |
130 | s3_client = boto3.client('s3')
131 |
132 | def upload_to_aws(filename: str) -> str:
133 | # Uses your AWS credentials to access the service
134 | bucket_name = os.environ.get('bucket_name')
135 | region = os.environ.get('region')
136 | # Create a session using the provided credentials
137 | session = boto3.Session(
138 | aws_access_key_id=os.environ.get('access_key_id'),
139 | aws_secret_access_key=os.environ.get('secret_access_key')
140 | )
141 | # Create an S3 client
142 | s3_client = session.client('s3')
143 | bucket_path = 'ai-video'
144 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
145 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
146 | url = f'{s3_base_url}{bucket_path}/{filename}'
147 | return url
148 |
149 |
150 | def time_to_seconds(time_obj):
151 | return time_obj.hours * 3600 + time_obj.minutes * 60 + time_obj.seconds + time_obj.milliseconds / 1000
152 |
153 |
154 | llm_prompt = 'Given the input text, choose some important and meaningful words to highlight. Max 1-2 words per sentence. Return them as a python list.\nTEXT: {}'
155 | def highlight_words(input_str: str, event) -> str:
156 | input_str = llm_prompt.format(input_str)
157 | data = {
158 | "style": "LLM-Only",
159 | "stream": False,
160 | "messageContent": input_str,
161 | "agentId": 1548
162 | }
163 | resp = event.chat.messages(data=data)
164 | return resp
165 |
166 |
167 | def create_subtitle_clips(subtitles, videosize, fontsize=24, font='fonts/Caveat.ttf', color='yellow', bg_color='black', border_size=1.5, border_color="black", caption_position='bottom', bg_opacity=0.5, highlight_color=None, event=None):
168 | subtitle_clips = []
169 |
170 | for subtitle in subtitles:
171 | start_time = time_to_seconds(subtitle.start)
172 | end_time = time_to_seconds(subtitle.end)
173 | duration = end_time - start_time
174 |
175 | video_width, video_height = videosize
176 |
177 | if border_size == 0 or border_size == 0.:
178 | border_color = None
179 |
180 | method = 'caption'
181 | if highlight_color is not None:
182 | # https://docs.gtk.org/Pango/pango_markup.html
183 | important_words = ast.literal_eval(highlight_words(subtitle.text, event))
184 | print('Important words:', important_words)
185 |             for word in important_words:
186 |                 subtitle.text = subtitle.text.replace(word, f'<span foreground="{highlight_color}">{word}</span>')  # Pango markup renders the word in the highlight color
187 |             method = 'pango'
188 |             subtitle.text = f'<span>{subtitle.text}</span>'
189 |
190 | text_clip = TextClip(subtitle.text, fontsize=fontsize, font=font, color=color, size=(video_width*3/4, None), method=method, stroke_color=border_color, stroke_width=border_size).set_start(start_time).set_duration(duration)
191 |
192 | # add bg color
193 | if bg_color in color_dict.keys():
194 | im_width, im_height = text_clip.size
195 | color_clip = ColorClip(size=(int(im_width), int(im_height)), color=color_dict[bg_color])
196 | color_clip = color_clip.set_opacity(bg_opacity).set_start(start_time).set_duration(duration)
197 | text_clip = CompositeVideoClip([color_clip, text_clip])
198 |
199 | subtitle_x_position = 'center'
200 | y_position_dict = {
201 | 'center': 'center',
202 | 'bottom': video_height * 4/5,
203 | 'top': video_height * 1/5,
204 | }
205 | subtitle_y_position = y_position_dict[caption_position]
206 |
207 | text_position = (subtitle_x_position, subtitle_y_position)
208 | subtitle_clips.append(text_clip.set_position(text_position))
209 |
210 | return subtitle_clips
211 |
212 |
213 | def mindsflow_function(event, context) -> dict:
214 |
215 | caption_url = event.get("caption_url")
216 | video_url = event.get("video_url")
217 | fontsize = event.get("font_size", 24)
218 | fontname = event.get("font_name", "SourceSerif4")
219 | text_color = event.get('text_color', 'white')
220 | bg_color = event.get('text_bg_color', 'black')
221 | bg_opacity = event.get('text_bg_opacity', 0.5)
222 | border_size = event.get('text_border_size', 1.)
223 | border_color = event.get('text_border_color', None)
224 | caption_position = event.get('caption_position', 'center')
225 | highlight_color = event.get('highlight_color', None)
226 | fontname = f'{font_dir}/{fontname}.ttf'
227 |
228 | mp4_path = video_url.split('/')[-1]
229 | caption_path = caption_url.split('/')[-1]
230 | download_file(video_url, mp4_path)
231 | download_file(caption_url, caption_path)
232 |
233 | if highlight_color not in color_dict.keys():
234 | highlight_color = None
235 |
236 | # Load video and SRT file
237 | video = VideoFileClip(mp4_path)
238 | subtitles = pysrt.open(caption_path)
239 |
240 | # Set output path
241 | output_path = "video_with_captions_{}.mp4".format(''.join(random.choices(string.ascii_letters + string.digits, k=5)))
242 |
243 | # Create subtitle clips
244 | subtitle_clips = create_subtitle_clips(subtitles, video.size, fontsize, fontname, text_color, bg_color, border_size, border_color, caption_position, bg_opacity, highlight_color, event)
245 |
246 | # Add subtitles to the video
247 | final_video = CompositeVideoClip([video] + subtitle_clips)
248 |
249 | # Write output video file
250 | final_video.write_videofile(output_path)
251 |
252 | upload_url = upload_to_aws(output_path)
253 | os.remove(output_path)
254 | os.remove(caption_path)
255 | os.remove(mp4_path)
256 |
257 | result = {
258 | 'video_url': upload_url
259 | }
260 |
261 | return result
262 |
263 |
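A minimal, self-contained sketch of the core idea in this file: overlaying a single timed TextClip on a video with moviepy (paths are placeholders; the 'caption' method requires ImageMagick):

```
from moviepy.editor import CompositeVideoClip, TextClip, VideoFileClip

video = VideoFileClip("input.mp4")  # placeholder path
caption = (
    TextClip("Hello, world", fontsize=24, color="white",
             method="caption", size=(video.w * 3 // 4, None))
    .set_start(1.0)      # caption appears at t = 1 s
    .set_duration(2.5)   # and stays on screen for 2.5 s
    .set_position(("center", video.h * 4 / 5))
)
CompositeVideoClip([video, caption]).write_videofile("output.mp4")
```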
--------------------------------------------------------------------------------
/agent-video-generator/functions/AddCaptionsToVideoOpenCV.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "AddCaptionsToVideoOpenCV",
4 | "displayName": "",
5 | "description": "The Python method is intended to download a video from a given URL, add captions to that downloaded video, upload the updated video to an S3 bucket, and return a URL for accessing the newly uploaded video.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "video_url",
10 | "json_caption"
11 | ],
12 | "properties": {
13 | "margin": {
14 | "type": "number",
15 | "description": ""
16 | },
17 | "font_size": {
18 | "type": "number",
19 | "description": ""
20 | },
21 | "font_type": {
22 | "type": "string",
23 | "description": ""
24 | },
25 | "video_url": {
26 | "type": "string",
27 | "description": "URL of the video to be downloaded"
28 | },
29 | "text_color": {
30 | "type": "string",
31 | "description": ""
32 | },
33 | "border_color": {
34 | "type": "string",
35 | "description": ""
36 | },
37 | "json_caption": {
38 | "type": "string",
39 | "description": "Captions to be added to the video"
40 | },
41 | "max_caption_len": {
42 | "type": "number",
43 | "description": ""
44 | },
45 | "caption_position": {
46 | "type": "string",
47 | "description": ""
48 | }
49 | }
50 | },
51 | "outputPattern": {
52 | "type": "object",
53 | "required": [
54 | "video_url"
55 | ],
56 | "properties": {
57 | "video_url": {
58 | "type": "string",
59 | "description": "The URL of the video uploaded to S3"
60 | }
61 | }
62 | },
63 | "tag": "VideoCaptions",
64 | "testCases": [
65 | {
66 | "margin": 0.1,
67 | "font_size": 30,
68 | "font_type": "default",
69 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/a78d8376-a5f9-413c-9624-b4eb7680357e_video_no_audio.mp4",
70 | "text_color": "white",
71 | "border_color": "black",
72 | "json_caption": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedBDbUzJ.json",
73 | "max_caption_len": 40,
74 | "caption_position": "threequarter"
75 | }
76 | ],
77 | "aiPrompt": "",
78 | "greeting": ""
79 | }
80 | $"""
81 |
82 | import json
83 | import cv2
84 | from moviepy.editor import VideoFileClip
85 | import boto3
86 | import os
87 | import time
88 | import random
89 | import string
90 | import requests
91 | import numpy as np
92 | from PIL import ImageFont, ImageDraw, Image
93 |
94 |
95 | def upload_to_aws(filename: str) -> str:
96 | # Uses your AWS credentials to access the service
97 | bucket_name = os.environ.get('bucket_name')
98 | region = os.environ.get('region')
99 |
100 | # Create a session using the provided credentials
101 | session = boto3.Session(
102 | aws_access_key_id=os.environ.get('access_key_id'),
103 | aws_secret_access_key=os.environ.get('secret_access_key')
104 | )
105 |
106 | # Create an S3 client
107 | s3_client = session.client('s3')
108 |
109 | bucket_path = 'ai-video'
110 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
111 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
112 | url = f'{s3_base_url}{bucket_path}/{filename}'
113 |
114 | return url
115 |
116 | def download_file(url, save_path):
117 | response = requests.get(url)
118 | with open(save_path, 'wb') as file:
119 | file.write(response.content)
120 |
121 | def get_random_string():
122 | letters = string.ascii_lowercase
123 | result_str = ''.join(random.choice(letters) for _ in range(8))
124 | timestamp = int(time.time())
125 | random_str = str(timestamp) + '_' + result_str
126 | return random_str
127 |
128 | # Define color dictionary for known colors
129 | color_dict = {
130 | 'black': (0, 0, 0),
131 | 'white': (255, 255, 255),
132 | 'red': (0, 0, 255), # Remember, in OpenCV it's BGR not RGB
133 | 'green': (0, 255, 0),
134 | 'blue': (255, 0, 0),
135 | 'yellow': (0, 255, 255)
136 | }
137 |
138 |
139 | # Define the dictionary for known font types
140 | font_dict = {
141 | 'chinese': 'NotoSansSC',
142 | 'default': 'SourceSerif4',
143 | }
144 |
145 |
146 | def wrap_text(caption, frame_width, font):
147 | words = caption.split(' ')
148 | lines = [words.pop(0)] # Initial
149 | for word in words:
150 | box = font.getbbox(lines[-1] + ' ' + word)
151 | text_width, text_height = box[2] - box[0], box[3] - box[1]
152 | if text_width > frame_width:
153 | lines.append(word)
154 | else:
155 | lines[-1] += ' ' + word
156 | return lines
157 |
158 | def add_captions(video_path, json_file_path, border_size=2, border_color='black', text_color='white',
159 | font_size=30, font_type='DUPLEX', caption_position='bottom', outfile="out.mp4", margin=0.1,
160 | font_dir=''):
161 | # Load video
162 | cap = cv2.VideoCapture(video_path)
163 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
164 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
165 | fps = int(cap.get(cv2.CAP_PROP_FPS))
166 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
167 |
168 | # Load the JSON file with caption details
169 | with open(json_file_path, 'r') as f:
170 | captions = json.load(f)
171 | print(captions)
172 |
173 | # Get the specified color tuples
174 | border_color = color_dict[border_color.lower()]
175 | text_color = color_dict[text_color.lower()]
176 | # Get the specified font
177 | if font_type is None:
178 | font_type = 'default'
179 | if font_type in font_dict.keys():
180 | font_type = font_dict[font_type]
181 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size)
182 |
183 | # Define the codec and create a VideoWriter object
184 | #fourcc_code = int(cap.get(cv2.CAP_PROP_FOURCC))
185 | #fourcc_code = "".join([chr((fourcc_code >> 8 * i) & 0xFF) for i in range(4)])
186 | fourcc_code = "vp90"
187 | fourcc = cv2.VideoWriter_fourcc(*fourcc_code)
188 | out = cv2.VideoWriter(outfile, fourcc, fps, (width, height))
189 |
190 | frame_counter = 0
191 | caption_index = 0
192 | print('fps', fps)
193 | while(cap.isOpened()):
194 | ret, frame = cap.read()
195 | if ret:
196 |             current_time = frame_counter * (1e7/fps)  # Current timestamp in 100-nanosecond units (1e7 per second), matching the caption offsets
197 | print(current_time / 1e7, captions[caption_index], caption_index)
198 | print(frame_counter, caption_index)
199 | if current_time >= captions[caption_index]['end_time']:
200 | caption_index += 1
201 | # Check if there are no more captions
202 | if caption_index >= len(captions):
203 | break # If no more captions, exit loop
204 |
205 | img_pil = Image.fromarray(frame)
206 | draw = ImageDraw.Draw(img_pil)
207 |
208 | margin_rate = int(width * margin)
209 |
210 | lines = wrap_text(captions[caption_index]['sentence'], width - 2 * margin_rate, font)
211 | for i, line in enumerate(lines):
212 | box = font.getbbox(line)
213 | text_width, text_height = box[2] - box[0], box[3] - box[1]
214 | text_height = font_size * 1.3
215 |
216 | # Center the text
217 | textX = (width - text_width - margin_rate * 2) // 2 + margin_rate
218 | total_lines = len(lines)
219 | total_text_height = total_lines * text_height # The total height of text block
220 |
221 | # Position text as per given caption_position
222 | if caption_position.lower() == 'top':
223 | textY = margin_rate + (i * text_height)
224 | elif caption_position.lower() == 'bottom':
225 | textY = height - margin_rate - (len(lines) - i) * text_height
226 | elif caption_position.lower() == 'threequarter':
227 | three_quarter_height = height * 0.75
228 | textY = three_quarter_height - ((total_lines - i) * text_height)
229 | elif caption_position.lower() == 'onequarter':
230 | one_quarter_height = height * 0.25
231 | textY = one_quarter_height + ((i + 1) * text_height)
232 | else: # Default to center if unknown value
233 | textY = ((height - total_text_height) // 2) + (i * text_height)
234 |
235 | for k in range(-border_size, border_size+1):
236 | for j in range(-border_size, border_size+1):
237 | draw.text((textX+j, textY+k), line, font = font, fill = border_color)
238 | draw.text((textX, textY), line, font = font, fill = text_color)
239 |
240 | out.write(np.array(img_pil))
241 |
242 | frame_counter += 1
243 |
244 | else:
245 | break
246 |
247 | cap.release()
248 | out.release()
249 |
250 | def mindsflow_function(event, context) -> dict:
251 | # get the video url and caption from the event
252 | video_url = event.get("video_url")
253 | captions_url = event.get("json_caption")
254 | caption_position = event.get("caption_position", "bottom")
255 | border_color = event.get("border_color", "black")
256 | text_color = event.get("text_color", "white")
257 | font_size = event.get("font_size", 30)
258 | max_caption_len = event.get("max_caption_len", 30)
259 | margin = event.get("margin", 0.1)
260 | font_type = event.get("font_type", 'default')
261 |
262 | download_path = "video_" + get_random_string() + ".mp4"
263 | out_path = "video_" + get_random_string() + ".mp4"
264 | download_file(video_url, download_path)
265 |
266 | json_path = "caption_" + get_random_string() + ".json"
267 | download_file(captions_url, json_path)
268 |
269 | # get the captioned video URL
270 | add_captions(download_path,
271 | json_file_path=json_path,
272 | outfile=out_path,
273 | caption_position=caption_position,
274 | border_color=border_color,
275 | text_color=text_color,
276 | font_size=font_size,
277 | margin=margin,
278 | font_type=font_type,
279 | font_dir = os.environ.get('font_dir')
280 | )
281 |
282 | # upload the combined image to aws and save the url
283 | url = upload_to_aws(out_path)
284 |
285 | # define result
286 | result = {
287 | 'video_url': url
288 | }
289 |
290 | if os.path.exists(download_path):
291 | os.remove(download_path)
292 | if os.path.exists(json_path):
293 | os.remove(json_path)
294 |
295 | return result
296 |
297 |
298 |
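A small usage sketch for wrap_text above (the font file name is a placeholder; any TrueType font works):

```
from PIL import ImageFont

font = ImageFont.truetype("SourceSerif4.ttf", size=30)  # placeholder font path
lines = wrap_text("a long caption that needs wrapping onto several lines",
                  frame_width=200, font=font)
print(lines)  # a list of lines, each narrow enough to fit within 200 px
```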
--------------------------------------------------------------------------------
/agent-video-generator/functions/AudioTranscriptionToSentences.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "AudioTranscriptionToSentences",
4 | "displayName": "",
5 | "description": "This method downloads a JSON file containing the transcription of an audio, including the start and duration of each word. It further splits the transcription into sentences and uses the JSON transcription to map the start and duration of each sentence.",
6 | "inputPattern": {
7 | "type": "object",
8 | "properties": {
9 | "add_punctuation": {
10 | "type": "boolean",
11 | "description": ""
12 | },
13 | "split_all_punctuation": {
14 | "type": "boolean",
15 | "description": ""
16 | },
17 | "transcription_json_url": {
18 | "type": "string",
19 | "description": "URL from where to download the json file."
20 | }
21 | },
22 | "required": [
23 | "split_all_punctuation",
24 | "transcription_json_url"
25 | ]
26 | },
27 | "outputPattern": {
28 | "type": "object",
29 | "properties": {
30 | "text": {
31 | "type": "string",
32 | "description": ""
33 | },
34 | "n_splits": {
35 | "type": "number",
36 | "description": ""
37 | },
38 | "sentences_json_url": {
39 | "type": "string",
40 | "description": "URL to download JSON"
41 | }
42 | },
43 | "required": [
44 | "text",
45 | "n_splits",
46 | "sentences_json_url"
47 | ]
48 | },
49 | "tag": "DataPreprocessing",
50 | "testCases": [
51 | {
52 | "add_punctuation": false,
53 | "split_all_punctuation": false,
54 | "transcription_json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/audio_transcription_1703468432_yelpditk.json"
55 | }
56 | ],
57 | "aiPrompt": "",
58 | "greeting": ""
59 | }
60 | $"""
61 |
62 | import json
63 | import requests
64 | import boto3
65 | import time
66 | import random
67 | import string
68 | import os
69 | import nltk
70 | import jieba
71 | import re
72 | import regex
73 |
74 | def download_file(url, save_path):
75 | response = requests.get(url)
76 | with open(save_path, 'wb') as file:
77 | file.write(response.content)
78 |
79 | def get_random_string():
80 | letters = string.ascii_lowercase
81 | result_str = ''.join(random.choice(letters) for _ in range(8))
82 | timestamp = int(time.time())
83 | random_str = str(timestamp) + '_' + result_str
84 | return random_str
85 |
86 | def upload_to_aws(filename: str) -> str:
87 | # Uses your AWS credentials to access the service
88 | bucket_name = os.environ.get('bucket_name')
89 | region = os.environ.get('region')
90 |
91 | # Create a session using the provided credentials
92 | session = boto3.Session(
93 | aws_access_key_id=os.environ.get('access_key_id'),
94 | aws_secret_access_key=os.environ.get('secret_access_key')
95 | )
96 |
97 | # Create an S3 client
98 | s3_client = session.client('s3')
99 |
100 | bucket_path = 'ai-video'
101 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
102 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
103 | url = f'{s3_base_url}{bucket_path}/{filename}'
104 |
105 | return url
106 |
107 |
108 | light_punctuation = [',', ","]
109 |
110 |
111 | def divide_string(words, words2, split_all_punctuation=True):
112 | substrings = []
113 | substrings2 = []
114 | substring_len = []
115 |
116 | current_substring = ""
117 | current_substring2 = ""
118 | cur_substring_len = 0
119 |
120 | punctuation = [".", "!", "?", ";", "。", "!", "?", ";"]
121 | if split_all_punctuation is True:
122 | punctuation += light_punctuation
123 | for i, word in enumerate(words):
124 | if word[-1] in punctuation:
125 | #print(word, current_substring)
126 | cur_substring_len += 1
127 | if regex.match(r'\p{Script=Han}', word):
128 | current_substring += "" + word
129 | current_substring2 += "" + words2[i]
130 | else:
131 | current_substring += " " + word
132 | current_substring2 += " " + words2[i]
133 | substrings.append(current_substring.strip())
134 | substrings2.append(current_substring2.strip())
135 | current_substring = ""
136 | current_substring2 = ""
137 | substring_len.append(cur_substring_len)
138 | cur_substring_len = 0
139 | else:
140 | cur_substring_len += 1
141 | if regex.match(r'\p{Script=Han}', word):
142 | current_substring += "" + word
143 | current_substring2 += "" + words2[i]
144 | else:
145 | current_substring += " " + word
146 | current_substring2 += " " + words2[i]
147 |
148 | if current_substring: # If there's anything left, append it to the list
149 | substrings.append(current_substring.strip())
150 | substrings2.append(current_substring2.strip())
151 | substring_len.append(cur_substring_len)
152 |
153 | return substrings, substrings2, substring_len
154 |
155 | llm_prompt = '''split this text into smaller sentences
156 | TEXT: {}'''
157 | def llm_add_punctuation(input_str: str, event) -> str:
158 | data = {
159 | "style": "LLM-Only",
160 | "stream": False,
161 | "messageContent": llm_prompt.format(input_str),
162 | "agentId": 964
163 | }
164 | resp = event.chat.messages(data=data)
165 | return resp
166 |
167 | def get_sentence_time(json_file_path, event, split_all_punctuation=True, add_punctuation=False):
168 | # Load JSON data from a file
169 | with open(json_file_path, 'r') as f:
170 | data = json.load(f)
171 |
172 | # Get display text and split into sentences
173 | display_lexical = data['Lexical'].strip()
174 | display_text = data['Display'].strip().replace('.', '. ')
175 |
176 | if add_punctuation:
177 |         display_text = llm_add_punctuation(display_text, event)  # to test
178 |
179 | lexical_list = display_lexical.split()
180 | text_list = display_text.split()
181 | print(len(lexical_list), lexical_list)
182 | print(len(text_list), text_list)
183 |
184 | def n_split_str(str_, n):
185 | words = str_.split()
186 | return [' '.join(words[i:i+n]) for i in range(0, len(words), n)]
187 | def count_words(sentences):
188 | return [len(sentence.split()) for sentence in sentences]
189 |
190 | if len(lexical_list) != len(text_list):
191 | sentences_text = n_split_str(display_lexical, 10)
192 | sentences_clean = sentences_text
193 | substring_len_text = count_words(sentences_text)
194 | substring_len_lexical = substring_len_text
195 | else:
196 | sentences_text, sentences_clean, substring_len_text = divide_string(text_list, lexical_list, split_all_punctuation)
197 | substring_len_lexical = substring_len_text
198 |
199 | print(sentences_clean)
200 | print(substring_len_text ,sentences_text)
201 |
202 | # Map words to their times
203 | words = [{'Word': w['Word'], 'Index': index, 'Offset': w['Offset'], 'Duration': w['Duration']} for index, w in enumerate(data['Words'])]
204 | #print(words)
205 |
206 | sentence_times = []
207 |
208 | index = 0
209 | for i, sentence in enumerate(sentences_text):
210 | start_time = words[index]['Offset']
211 | index += substring_len_lexical[i] - 1
212 | end_time = words[index]['Offset'] + words[index]['Duration']
213 | duration = end_time - start_time
214 | index += 1
215 | #print(duration)
216 | final_sentence = sentences_text[i]
217 | while final_sentence[-1] in light_punctuation:
218 | final_sentence = final_sentence[:-1]
219 |
220 | sentence_times.append({
221 | 'sentence': final_sentence,
222 | 'start_time': start_time,
223 | 'end_time': end_time,
224 | 'duration': duration
225 | })
226 |
227 | return sentence_times, display_text
228 |
229 |
230 | def mindsflow_function(event, context) -> dict:
231 |
232 | url = event.get('transcription_json_url')
233 | split_all_punctuation = event.get('split_all_punctuation', True)
234 | add_punctuation = event.get('add_punctuation', False)
235 |
236 | transcription_path = 'transcript_{}.json'.format(get_random_string())
237 | download_file(url, transcription_path)
238 |
239 | sentence_times, text = get_sentence_time(transcription_path, event, split_all_punctuation, add_punctuation)
240 |
241 | output_file = 'sentence_times_{}.json'.format(get_random_string())
242 | with open(output_file, 'w') as f:
243 | json.dump(sentence_times, f)
244 |
245 | url = upload_to_aws(output_file)
246 |
247 | result = {
248 | 'sentences_json_url': url,
249 | 'text': text,
250 | 'n_splits': len(sentence_times)
251 | }
252 |
253 | if os.path.exists(transcription_path):
254 | os.remove(transcription_path)
255 | if os.path.exists(output_file):
256 | os.remove(output_file)
257 |
258 | return result
259 |
260 |
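For reference, a minimal transcription JSON in the shape this function expects (values are illustrative; Offset and Duration are in 100-nanosecond ticks, as produced by Azure speech transcription):

```
{
    "Display": "Hello world. How are you?",
    "Lexical": "hello world how are you",
    "Words": [
        {"Word": "hello", "Offset": 500000, "Duration": 3000000},
        {"Word": "world", "Offset": 3600000, "Duration": 2500000},
        {"Word": "how", "Offset": 7000000, "Duration": 2000000},
        {"Word": "are", "Offset": 9200000, "Duration": 1500000},
        {"Word": "you", "Offset": 10900000, "Duration": 2000000}
    ]
}
```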
--------------------------------------------------------------------------------
/agent-video-generator/functions/CommandsExecution.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "CommandsExecution",
4 | "displayName": "",
5 | "description": "CommandsExecution",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [],
9 | "properties": {}
10 | },
11 | "outputPattern": {
12 | "type": "object",
13 | "required": [],
14 | "properties": {}
15 | },
16 | "tag": "Example",
17 | "testCases": [
18 | {}
19 | ],
20 | "aiPrompt": "",
21 | "greeting": ""
22 | }
23 | $"""
24 |
25 | import json
26 |
27 | def mindsflow_function(event, context) -> dict:
28 | """
29 | This is the main function that processes an event within a given context.
30 |
31 | Args:
32 | event (class Event): Containing mindsflow internal api and request information.
33 | case1: event.get("param") # inference parameters
34 | case2: event.chat.messages(data) # call mindsflow api
35 | context (class Context): Containing execution context and additional environment information.
36 |
37 | Returns:
38 | dict: A result dictionary meeting the Output Pattern.
39 | """
40 | import zipfile
41 | import subprocess
42 |
43 | '''def unzip_folder(path_to_zip_file, directory_to_extract_to):
44 | with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
45 | zip_ref.extractall(directory_to_extract_to)'''
46 |
47 | # usage
48 | #unzip_folder("fonts.zip", "fonts")
49 |
50 | def execute_command(command):
51 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
52 | output, error = process.communicate()
53 | execute_command("pip uninstall spleeter")
54 |
55 | result = {
56 | 'data': 'Hello, MindsFlow User!'
57 | }
58 |
59 | return result
60 |
61 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/ConvertSrtToAss.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "ConvertSrtToAss",
4 | "displayName": "",
5 | "description": "Converts srt file to ass file",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "srt_url"
10 | ],
11 | "properties": {
12 | "shadow": {
13 | "type": "number",
14 | "description": ""
15 | },
16 | "marginl": {
17 | "type": "integer",
18 | "description": ""
19 | },
20 | "marginr": {
21 | "type": "integer",
22 | "description": ""
23 | },
24 | "marginv": {
25 | "type": "integer",
26 | "description": ""
27 | },
28 | "outline": {
29 | "type": "integer",
30 | "description": ""
31 | },
32 | "srt_url": {
33 | "type": "string",
34 | "description": ""
35 | },
36 | "fontname": {
37 | "type": "string",
38 | "description": "arial"
39 | },
40 | "fontsize": {
41 | "type": "integer",
42 | "description": ""
43 | }
44 | }
45 | },
46 | "outputPattern": {
47 | "type": "object",
48 | "required": [
49 | "ass_url"
50 | ],
51 | "properties": {
52 | "ass_url": {
53 | "type": "string",
54 | "description": ""
55 | }
56 | }
57 | },
58 | "tag": "VideoCaptions",
59 | "testCases": [
60 | {
61 | "shadow": 0,
62 | "marginl": 0,
63 | "marginr": 0,
64 | "marginv": 0,
65 | "outline": 0,
66 | "srt_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/89z79p.srt",
67 | "fontname": "文泉驿正黑",
68 | "fontsize": 0
69 | }
70 | ],
71 | "aiPrompt": "",
72 | "greeting": ""
73 | }
74 | $"""
75 |
76 | import json
77 | import boto3
78 | import os
79 | import uuid
80 | import requests
81 | import pysubs2
82 |
83 |
84 | def download_file(url, filename):
85 | response = requests.get(url)
86 | file = open(filename, 'wb')
87 | file.write(response.content)
88 | file.close()
89 |
90 | s3_client = boto3.client('s3')
91 |
92 | def upload_to_aws(filename: str) -> str:
93 | # Uses your AWS credentials to access the service
94 | bucket_name = os.environ.get('bucket_name')
95 | region = os.environ.get('region')
96 | # Create a session using the provided credentials
97 | session = boto3.Session(
98 | aws_access_key_id=os.environ.get('access_key_id'),
99 | aws_secret_access_key=os.environ.get('secret_access_key')
100 | )
101 | # Create an S3 client
102 | s3_client = session.client('s3')
103 | bucket_path = 'ai-video'
104 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
105 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
106 | url = f'{s3_base_url}{bucket_path}/{filename}'
107 | return url
108 |
109 |
110 | def convert_srt_to_ass(srt_path: str, ass_path: str, fontname='Arial', fontsize=16, marginl=10, marginv=10, marginr=10, outline=0, shadow=0):
111 | subs = pysubs2.load(srt_path, encoding="utf-8")
112 | for line in subs:
113 | line.style = "my_style"
114 | subs.styles["my_style"] = pysubs2.SSAStyle(fontname=fontname, fontsize=fontsize,
115 | marginl=marginl, marginr=marginr,
116 | marginv=marginv, outline=outline,
117 | shadow=shadow)
118 | subs.save(ass_path)
119 |
120 |
121 | def mindsflow_function(event, context) -> dict:
122 | srt_url = event.get("srt_url")
123 | fontname = event.get("fontname", "Arial")
124 | fontsize = event.get("fontsize", 10)
125 | marginl = event.get("marginl", 20)
126 | marginr = event.get("marginr", 20)
127 | marginv = event.get("marginv", 10)
128 | outline = event.get("outline", 1)
129 | shadow = event.get("shadow", 0)
130 |
131 | file_name = srt_url.split('/')[-1].split('.')[0]
132 | srt_file = f"{file_name}.srt"
133 | ass_file = f"{file_name}.ass"
134 | download_file(srt_url, srt_file)
135 |
136 | convert_srt_to_ass(srt_file, ass_file, fontname, fontsize, marginl, marginv, marginr, outline, shadow)
137 |
138 | upload_url = upload_to_aws(ass_file)
139 |
140 | os.remove(srt_file)
141 | os.remove(ass_file)
142 |
143 | result = {
144 | 'ass_url': upload_url
145 | }
146 |
147 | return result
148 |
149 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/MindsflowAgent.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "MindsflowAgent",
4 | "displayName": "",
5 | "description": "Example of how to invoke Mindsflow agent",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "input_str"
10 | ],
11 | "properties": {
12 | "input_str": {
13 | "type": "string",
14 | "description": "The input string to be translated"
15 | }
16 | }
17 | },
18 | "outputPattern": {
19 | "type": "object",
20 | "required": [
21 | "translated_text"
22 | ],
23 | "properties": {
24 | "translated_text": {
25 | "type": "string",
26 | "description": "translation result"
27 | }
28 | }
29 | },
30 | "tag": "Example",
31 | "testCases": [
32 | {
33 | "input_str": "hello"
34 | }
35 | ],
36 | "aiPrompt": "aa",
37 | "greeting": ""
38 | }
39 | $"""
40 |
41 | import json
42 |
43 | def translate_text(input_str: str, event) -> str:
44 | data = {
45 | "style": "LLM-Only",
46 | "stream": False,
47 | "messageContent": input_str,
48 | "agentId": 739
49 | }
50 |
51 | resp = event.chat.messages(data=data)
52 |
53 | return resp
54 |
55 | def mindsflow_function(event, context) -> dict:
56 | # get the input string from the event
57 | input_str = event.get("input_str")
58 |
59 | # get the translation result
60 | translated_text = translate_text(input_str, event)
61 |
62 | # define result
63 | result = {
64 | 'translated_text': translated_text
65 | }
66 |
67 | return result
68 |
69 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/MusicGeneration.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "MusicGeneration",
4 | "displayName": "",
5 | "description": "Generate music from prompt",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "music_prompt"
10 | ],
11 | "properties": {
12 | "seed": {
13 | "type": "integer",
14 | "description": ""
15 | },
16 | "duration": {
17 | "type": "number",
18 | "description": ""
19 | },
20 | "temperature": {
21 | "type": "number",
22 | "description": ""
23 | },
24 | "music_prompt": {
25 | "type": "string",
26 | "description": ""
27 | }
28 | }
29 | },
30 | "outputPattern": {
31 | "type": "object",
32 | "required": [
33 | "music_url"
34 | ],
35 | "properties": {
36 | "music_url": {
37 | "type": "string",
38 | "description": ""
39 | }
40 | }
41 | },
42 | "tag": "VideoGeneration",
43 | "testCases": [
44 | {
45 | "seed": -1,
46 | "duration": 4.9,
47 | "temperature": 1,
48 | "music_prompt": "Create a classical music piece"
49 | }
50 | ],
51 | "aiPrompt": "",
52 | "greeting": ""
53 | }
54 | $"""
55 |
56 | import json
57 | from os import path
58 | import math
59 |
60 | '''import subprocess
61 | command = 'pip install replicate'
62 | process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, universal_newlines=True)'''
63 | import replicate
64 |
65 |
66 | # Function to create a short music
67 | def create_music(prompt: str, duration: int=15, temperature: float=1, seed: int=-1) -> str:
68 | output = replicate.run(
69 | "meta/musicgen:7be0f12c54a8d033a0fbd14418c9af98962da9a86f5ff7811f9b3423a1f0b7d7",
70 | input={"model_version": "large",
71 | "prompt": prompt,
72 | "duration": duration,
73 | "temperature": temperature,
74 | "seed": seed}
75 | )
76 |
77 | return output
78 |
79 | def mindsflow_function(event, context) -> dict:
80 | # get the prompt from the event
81 | prompt = event.get("music_prompt")
82 | duration = event.get("duration", 15)
83 | duration = min(duration, 28)
84 | temperature = event.get("temperature", 1)
85 | seed = event.get("seed", -1)
86 | if isinstance(duration, float):
87 | duration = math.ceil(duration) # Convert to int and approximate by excess
88 |
89 | # get the music URL
90 | music_url = create_music(prompt, duration, temperature, seed)
91 |
92 | # define result
93 | result = {
94 | 'music_url': music_url
95 | }
96 |
97 | return result
98 |
99 |
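A quick usage sketch for create_music above (the replicate client reads the REPLICATE_API_TOKEN environment variable; the value below is a placeholder):

```
import os

os.environ.setdefault("REPLICATE_API_TOKEN", "r8_xxx")  # placeholder token
music_url = create_music("Create a classical music piece", duration=5)
print(music_url)
```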
--------------------------------------------------------------------------------
/agent-video-generator/functions/ShowFonts.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "ShowFonts",
4 | "displayName": "",
5 | "description": "Show fonts",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [],
9 | "properties": {}
10 | },
11 | "outputPattern": {
12 | "type": "object",
13 | "required": [],
14 | "properties": {}
15 | },
16 | "tag": "VideoCaptions",
17 | "testCases": [
18 | {}
19 | ],
20 | "aiPrompt": "",
21 | "greeting": ""
22 | }
23 | $"""
24 |
25 | import json
26 | from moviepy.editor import TextClip
27 | import os
28 | from PIL import Image, ImageDraw, ImageFont
29 | from moviepy.editor import concatenate
30 |
31 | FOLDER = 'fonts' # specify the correct path
32 |
33 | def mindsflow_function(event, context) -> dict:
34 |
35 | WIDTH, HEIGHT = 500, 500 # specify dimensions of each image
36 | BG_COLOR = (0, 0, 0) # background color
37 |
38 | # Create images with each font
39 | images = []
40 | for file in os.listdir(FOLDER):
41 | if file.endswith(".ttf"):
42 | font = ImageFont.truetype(os.path.join(FOLDER, file), 50)
43 | image = Image.new('RGB', (WIDTH, HEIGHT), color=BG_COLOR)
44 | draw = ImageDraw.Draw(image)
45 |
46 | text = '{}'.format(file.replace('.ttf', ''))
47 | x = 10
48 | y = 150
49 |
50 | draw.text((x, y), text, fill=(255,255,255), font=font)
51 | images.append(image)
52 |
53 | # Calculate the grid size - 6 images per row
54 | rows = len(images) // 6
55 | if len(images) % 6:
56 | rows += 1
57 |
58 | # Concatenate all images into grid
59 | concat_image = Image.new('RGB', (WIDTH * 6, HEIGHT * rows), BG_COLOR)
60 |
61 | x_offset = 0
62 | y_offset = 0
63 | for i, img in enumerate(images):
64 | concat_image.paste(img, (x_offset, y_offset))
65 | if (i+1) % 6 == 0:
66 | x_offset = 0
67 | y_offset += HEIGHT
68 | else:
69 | x_offset += WIDTH
70 | concat_image.save(f'{FOLDER}/fonts.jpg')
71 |
72 | result = {
73 | 'fonts': f'{FOLDER}/fonts.jpg'
74 | }
75 |
76 | return result
77 |
78 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/UploadResultZipS3.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "UploadResultZipS3",
4 | "displayName": "",
5 | "description": "UploadResultZipS3",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "video_url",
10 | "title",
11 | "first_frame_url",
12 | "script",
13 | "description"
14 | ],
15 | "properties": {
16 | "title": {
17 | "type": "string",
18 | "description": ""
19 | },
20 | "script": {
21 | "type": "string",
22 | "description": ""
23 | },
24 | "video_url": {
25 | "type": "string",
26 | "description": ""
27 | },
28 | "description": {
29 | "type": "string",
30 | "description": ""
31 | },
32 | "first_frame_url": {
33 | "type": "string",
34 | "description": ""
35 | },
36 | "video_url_no_music": {
37 | "type": "string",
38 | "description": ""
39 | }
40 | }
41 | },
42 | "outputPattern": {
43 | "type": "object",
44 | "required": [
45 | "result_url"
46 | ],
47 | "properties": {
48 | "result_url": {
49 | "type": "string",
50 | "description": ""
51 | }
52 | }
53 | },
54 | "tag": "UploadVideo",
55 | "testCases": [
56 | {
57 | "script": "hello",
58 | "title": "title of the 对的 video 沙发",
59 | "first_frame_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/img_1697717278_uakysssz.png",
60 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1697717270_bmvbdbul.mp4",
61 | "description": "a video about something",
62 | "video_url_no_music": ""
63 | }
64 | ],
65 | "aiPrompt": "Upload result S3",
66 | "greeting": ""
67 | }
68 | $"""
69 |
70 | import os
71 | import urllib.request
72 | import json
73 | import shutil
74 | import boto3
75 | import unicodedata
76 | import random
77 | import string
78 |
79 | # Auxiliary function to download video and image
80 | def download_file(url, path):
81 | try:
82 | urllib.request.urlretrieve(url, path)
83 | return True
84 | except Exception as e:
85 | print(f"An error occurred while downloading the file. Error: {str(e)}")
86 | return False
87 |
88 | # Auxiliary function to write description and title in txt files
89 | def write_txt_file(content, path):
90 | try:
91 | with open(path, 'w') as f:
92 | f.write(content)
93 | return True
94 | except Exception as e:
95 | print(f"An error occurred while writing the text file. Error: {str(e)}")
96 | return False
97 |
98 | def upload_to_aws(filename: str) -> str:
99 | # Uses your AWS credentials to access the service
100 | bucket_name = os.environ.get('bucket_name')
101 | region = os.environ.get('region')
102 |
103 | # Create a session using the provided credentials
104 | session = boto3.Session(
105 | aws_access_key_id=os.environ.get('access_key_id'),
106 | aws_secret_access_key=os.environ.get('secret_access_key')
107 | )
108 |
109 | # Create an S3 client
110 | s3_client = session.client('s3')
111 |
112 | bucket_path = 'video-results'
113 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
114 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
115 | url = f'{s3_base_url}{bucket_path}/{filename}'
116 |
117 | return url
118 |
119 | # Auxiliary function to create a folder, download the files and then zip the folder
120 | def prepare_files(event):
121 | video_url = event.get("video_url")
122 | image_url = event.get("first_frame_url")
123 | video_title = event.get("title")
124 | video_description = event.get("description")
125 | text = event.get("script")
126 | video_url_no_music = event.get("video_url_no_music", None)
127 |
128 | print(video_title)
129 | video_title_original = video_title
130 | if not video_title.isascii():
131 | video_title = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
132 | if len(video_title) > 30:
133 | video_title = video_title[:30]
134 | video_title = video_title.replace(" ", "_")
135 | print(video_title)
136 |
137 | if not os.path.exists(video_title):
138 | os.makedirs(video_title)
139 |
140 | video_path = f"{video_title}/video.{video_url.split('.')[-1]}"
141 | download_file(video_url, video_path)
142 | img_path = f"{video_title}/first_frame.{image_url.split('.')[-1]}"
143 | download_file(image_url, img_path)
144 |
145 | write_txt_file(video_description, f"{video_title}/description.txt")
146 | write_txt_file(video_title_original, f"{video_title}/{video_title}.txt")
147 | write_txt_file(text, f"{video_title}/text.txt")
148 |     if video_url_no_music:  # skip when missing or empty
149 | write_txt_file(video_url_no_music, f"{video_title}/video_url_no_music.txt")
150 |
151 | shutil.make_archive(video_title, 'zip', video_title)
152 | url = upload_to_aws(f"{video_title}.zip")
153 |
154 | os.remove(video_path)
155 | os.remove(img_path)
156 | os.remove(f"{video_title}.zip")
157 | shutil.rmtree(video_title)
158 |
159 | return url
160 |
161 | # Main function
162 | def mindsflow_function(event, context) -> dict:
163 |
164 | # prepare files and upload to S3
165 | s3_url = prepare_files(event)
166 |
167 | # define result
168 | result = {
169 | 'result_url': s3_url
170 | }
171 |
172 | return result
173 |
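# Minimal local driver for the function above. The URLs and titles are
# placeholders; the AWS credentials are read from the environment, exactly as
# in upload_to_aws.
import os

for var in ('bucket_name', 'region', 'access_key_id', 'secret_access_key'):
    assert os.environ.get(var), f'{var} must be set before running'

event = {
    'video_url': 'https://example.com/video.mp4',        # placeholder
    'first_frame_url': 'https://example.com/frame.png',  # placeholder
    'title': 'demo title',
    'description': 'a video about something',
    'script': 'hello',
}
print(mindsflow_function(event, context=None)['result_url'])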
--------------------------------------------------------------------------------
/agent-video-generator/functions/addAudioSegmentsToVideo.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "addAudioSegmentsToVideo",
4 | "displayName": "",
5 | "description": "Add audio segments to video",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "voice",
10 | "json_url",
11 | "video_url",
12 | "audio_folder",
13 | "use_original_voice"
14 | ],
15 | "properties": {
16 | "voice": {
17 | "type": "string",
18 | "description": ""
19 | },
20 | "json_url": {
21 | "type": "string",
22 | "description": ""
23 | },
24 | "video_url": {
25 | "type": "string",
26 | "description": ""
27 | },
28 | "audio_folder": {
29 | "type": "string",
30 | "description": ""
31 | },
32 | "use_original_voice": {
33 | "type": "boolean",
34 | "description": ""
35 | }
36 | }
37 | },
38 | "outputPattern": {
39 | "type": "object",
40 | "required": [
41 | "video_url"
42 | ],
43 | "properties": {
44 | "video_url": {
45 | "type": "string",
46 | "description": ""
47 | }
48 | }
49 | },
50 | "tag": "TextToSpeech",
51 | "testCases": [
52 | {
53 | "voice": "d8369f1b-588b-40b2-8009-3511630bff13_audio",
54 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAOjUGH.json",
55 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/0ea5ed8d-795e-4120-993d-62bb9ba70920_video_no_audio.mp4",
56 | "audio_folder": "test",
57 | "use_original_voice": false
58 | },
59 | {
60 | "voice": "d8369f1b-588b-40b2-8009-3511630bff13_audio",
61 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAEAQmF.json",
62 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/a53247d5-055c-464e-bdc3-242369f1ff46_video_no_audio.mp4",
63 | "audio_folder": "test",
64 | "use_original_voice": false
65 | },
66 | {
67 | "voice": "zh-CN-YunfengNeural",
68 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedDLYYSi.json",
69 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/combine_8d43656c-2e0c-48cd-a4b6-d8c1c0336740.mp4",
70 | "audio_folder": "test",
71 | "use_original_voice": false
72 | }
73 | ],
74 | "aiPrompt": "",
75 | "greeting": ""
76 | }
77 | $"""
78 |
79 | import json
80 | import moviepy.editor as mpy
81 | import os
82 | import requests
83 | from pydub import AudioSegment
84 | import shutil
85 | import boto3
86 |
87 |
88 | def download_file(url, filename):
89 | if not os.path.exists(filename):
90 | res = requests.get(url)
91 | with open(filename, "wb") as f:
92 | f.write(res.content)
93 | else:
94 | print(f"The file {filename} already exists.")
95 |
96 |
97 | def get_captions_from_url(url):
98 | filename = f"{url.split('/')[-1]}"
99 | # download the json file
100 | download_file(url, filename)
101 | # read the contents
102 | with open(filename, 'r', encoding='utf-8') as f:
103 | captions = json.load(f)
104 | return captions, filename
105 |
106 |
107 | def upload_to_aws(filename: str, bucket_path = None) -> str:
108 | bucket_name = os.environ.get('bucket_name')
109 | region = os.environ.get('region')
110 | session = boto3.Session(
111 | aws_access_key_id=os.environ.get('access_key_id'),
112 | aws_secret_access_key=os.environ.get('secret_access_key')
113 | )
114 | s3_client = session.client('s3')
115 | if bucket_path is None:
116 | bucket_path = 'ai-video'
117 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
118 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
119 | url = f'{s3_base_url}{bucket_path}/{filename}'
120 | return url
121 |
122 |
123 | def delete_from_aws(filename: str, bucket_path=None):
124 | bucket_name = os.environ.get('bucket_name')
125 | session = boto3.Session(
126 | aws_access_key_id=os.environ.get('access_key_id'),
127 | aws_secret_access_key=os.environ.get('secret_access_key')
128 | )
129 | s3_client = session.client('s3')
130 | if bucket_path is None:
131 | bucket_path = 'ai-video'
132 | # Now delete the file after upload
133 | s3_client.delete_object(Bucket=bucket_name, Key=f"{bucket_path}/{filename}")
134 |
135 |
136 | unit_time = 10000000  # caption timestamps are 100-nanosecond ticks (1e7 per second)
137 |
138 |
139 | def combine_video_audio(video_path: str, captions, audio_folder: str, api_data: dict, voice_clone_url: str, use_original_voice: bool = False) -> str:
140 | # get the video
141 | video = mpy.VideoFileClip(video_path)
142 | audio_tracks = []
143 |
144 | # loop over all the start times
145 | for i, cap in enumerate(captions):
146 | # start time of audio
147 | start_time = cap['start_time'] / unit_time
148 | audio_path = f"{audio_folder}/audio_segment_{i+1}.wav"
149 |
150 | print(f'Processing audio {i+1} | Start time {start_time} | {audio_path}')
151 |
152 |         if use_original_voice:
153 |             uploaded_name = audio_path  # remember which S3 key gets uploaded
154 |             audio_url = upload_to_aws(uploaded_name, bucket_path='temp-audio')
155 |             api_data['audio_url'] = audio_url
156 |             response = requests.post(voice_clone_url, json=api_data)
157 |             if response.status_code != 200:
158 |                 raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}')
159 |             audio_path = f'{audio_folder}/gen_voice_{i+1}.wav'
160 |             print('use original voice', audio_path)
161 |             with open(audio_path, 'wb') as file:
162 |                 file.write(response.content)
163 |             delete_from_aws(uploaded_name, bucket_path='temp-audio')  # delete the key that was uploaded, not the generated file
164 |
165 | # load newly created voice track as an AudioFileClip
166 | new_audio = mpy.AudioFileClip(audio_path)
167 | # set start time for this audio segment
168 | new_audio = new_audio.set_start(start_time)
169 | # add this audio to the audio_tracks list
170 | audio_tracks.append(new_audio)
171 |
172 | print('Writing video...')
173 | # concatenate the original audio with new audio tracks
174 | final_audio = mpy.CompositeAudioClip(audio_tracks)
175 | # build final video with new audio track
176 | video = video.set_audio(final_audio)
177 | new_video_path = f"combine_{video_path}"
178 | if '_video_no_audio' in new_video_path:
179 | new_video_path = new_video_path.replace('_video_no_audio', '')
180 | video.write_videofile(new_video_path, audio_codec='aac')
181 | return new_video_path
182 |
183 |
184 | def mindsflow_function(event, context) -> dict:
185 | video_url = event.get("video_url")
186 | json_url = event.get("json_url")
187 | audio_folder = event.get("audio_folder")
188 | voice = event.get('voice')
189 | use_original_voice = event.get('use_original_voice')
190 | api_ip = os.environ.get('api_ip')
191 |
192 | video_path = video_url.split('/')[-1]
193 | download_file(video_url, video_path)
194 | print(f'Video downloaded from {video_url}')
195 | captions, json_name = get_captions_from_url(json_url)
196 |
197 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/"
198 |
199 | api_data = {
200 | "audio_url": None,
201 | "voice": voice,
202 | "clean_noise": False
203 | }
204 |
205 | # get the audio configuration result
206 | new_video_path = combine_video_audio(video_path, captions, audio_folder, api_data, voice_clone_url, use_original_voice)
207 | result_video = upload_to_aws(new_video_path)
208 |
209 | # delete local files after use
210 | os.remove(video_path)
211 | os.remove(new_video_path)
212 | os.remove(json_name)
213 | #shutil.rmtree(audio_folder)
214 |
215 | # define result
216 | result = {
217 | 'video_url': result_video
218 | }
219 |
220 | return result
221 |
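# The core compositing step above, isolated: each segment becomes an
# AudioFileClip whose start is the caption's start_time converted from 100-ns
# ticks to seconds. Paths and the captions list here are illustrative.
import moviepy.editor as mpy

def overlay_segments(video_path, captions, audio_folder, unit=10_000_000):
    video = mpy.VideoFileClip(video_path)
    tracks = [
        mpy.AudioFileClip(f"{audio_folder}/audio_segment_{i+1}.wav")
           .set_start(cap['start_time'] / unit)
        for i, cap in enumerate(captions)
    ]
    # CompositeAudioClip mixes all segments onto one timeline
    return video.set_audio(mpy.CompositeAudioClip(tracks))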
--------------------------------------------------------------------------------
/agent-video-generator/functions/addSoundToVideo.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "addSoundToVideo",
4 | "displayName": "",
5 | "description": "The method is designed to add sound to a video file.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "audio_url",
10 | "video_url"
11 | ],
12 | "properties": {
13 | "volume": {
14 | "type": "number",
15 | "description": ""
16 | },
17 | "audio_url": {
18 | "type": "string",
19 | "description": "URL of the audio to be downloaded"
20 | },
21 | "video_url": {
22 | "type": "string",
23 | "description": "URL of the video to be downloaded"
24 | },
25 | "repeat_audio": {
26 | "type": "boolean",
27 | "description": ""
28 | }
29 | }
30 | },
31 | "outputPattern": {
32 | "type": "object",
33 | "required": [
34 | "video_url"
35 | ],
36 | "properties": {
37 | "video_url": {
38 | "type": "string",
39 | "description": "The URL of the video file with background music added and uploaded to S3"
40 | }
41 | }
42 | },
43 | "tag": "DataPreprocessing",
44 | "testCases": [
45 | {
46 | "volume": 0.5,
47 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/7d670407-7729-4db1-b468-6ca545051de5_audio/accompaniment.wav",
48 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/combine_e37e383a-00db-46d9-a5fa-d9dbfa5e760c.mp4",
49 | "repeat_audio": false
50 | }
51 | ],
52 | "aiPrompt": "addSoundToVideo",
53 | "greeting": ""
54 | }
55 | $"""
56 |
57 | import json
58 | import requests
59 | import moviepy.editor as mpy
60 | import boto3
61 | import time
62 | import random
63 | import string
64 | import os
65 | from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip, concatenate_audioclips
66 | from moviepy.audio.AudioClip import AudioArrayClip
67 | import numpy as np
68 | from moviepy.audio.fx.all import volumex
69 |
70 | def download_file(url: str, save_as: str) -> None:
71 | response = requests.get(url, stream=True)
72 | with open(save_as, 'wb') as f:
73 | for chunk in response.iter_content(chunk_size=1024):
74 | if chunk:
75 | f.write(chunk)
76 |
77 | def get_random_string():
78 | letters = string.ascii_lowercase
79 | result_str = ''.join(random.choice(letters) for _ in range(8))
80 | timestamp = int(time.time())
81 | random_str = str(timestamp) + '_' + result_str
82 | return random_str
83 |
84 | def upload_to_aws(filename: str) -> str:
85 | # Uses your AWS credentials to access the service
86 | bucket_name = os.environ.get('bucket_name')
87 | region = os.environ.get('region')
88 |
89 | # Create a session using the provided credentials
90 | session = boto3.Session(
91 | aws_access_key_id=os.environ.get('access_key_id'),
92 | aws_secret_access_key=os.environ.get('secret_access_key')
93 | )
94 |
95 | # Create an S3 client
96 | s3_client = session.client('s3')
97 |
98 | bucket_path = 'ai-video'
99 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
100 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
101 | url = f'{s3_base_url}{bucket_path}/{filename}'
102 |
103 | return url
104 |
105 | def add_background_music(video_file_path: str, audio_file_path: str, output_file_path: str, repeat_audio: bool=True, pause: float=1.0, volume: float=1.0) -> None:
106 | video = VideoFileClip(video_file_path) # Existing Video File
107 | existing_audio = video.audio # Existing Audio in Video File
108 | new_audio = AudioFileClip(audio_file_path) # New Audio File
109 | new_audio = new_audio.fx(volumex, volume) # Adjusting the volume of the new audio
110 | if repeat_audio:
111 | # Duration for the silent clip
112 | fps = 44100
113 | audio_array = np.zeros((int(pause*fps), 2))
114 | cl_silent = AudioArrayClip(audio_array, fps=fps)
115 | cl_silent.write_audiofile('silent.wav')
116 | audio_clips = [new_audio]
117 | silent_audio = AudioFileClip('silent.wav')
118 | # append audio clips until their total duration is greater than the video
119 | while sum(clip.duration for clip in audio_clips) < video.duration:
120 |             audio_clips.extend([silent_audio, new_audio])  # insert the pause between repeats
121 | new_audio = concatenate_audioclips(audio_clips)
122 |
123 | # If the new audio is longer than the video, limit its duration to that of the video.
124 | if new_audio.duration > video.duration:
125 | new_audio = new_audio.subclip(0, video.duration)
126 | elif video.duration > new_audio.duration:
127 | video = video.subclip(0, new_audio.duration)
128 |
129 | # If the video also has audio, we will overlay the new audio onto the existing audio
130 | if existing_audio is not None:
131 | audio = CompositeAudioClip([existing_audio, new_audio])
132 | else:
133 | audio = new_audio # If the video has no audio, just set the new audio as the video's audio
134 |
135 | final_clip = video.set_audio(audio) # Set the audio track of the video to the audio clip created above
136 | final_clip.write_videofile(output_file_path, audio_codec='aac') # Write the output
137 |
138 | def mindsflow_function(event, context) -> dict:
139 | video_url = event.get("video_url")
140 | audio_url = event.get("audio_url")
141 | volume = event.get("volume", 1.0)
142 | repeat_audio = event.get("repeat_audio", False)
143 |
144 | video_file_path = "temp_video.mp4"
145 | audio_file_path = "temp_audio.wav"
146 | random_str = get_random_string()
147 | output_file_path = f"output_{random_str}.mp4"
148 |
149 | # Step 1: download files
150 | download_file(video_url, video_file_path)
151 | download_file(audio_url, audio_file_path)
152 |
153 | # Step 2: add background music to video
154 | if volume > 0:
155 | add_background_music(video_file_path, audio_file_path, output_file_path, volume=volume, repeat_audio=repeat_audio)
156 |     else:  # volume <= 0: keep the original audio track and upload the video unmodified
157 |         os.rename(video_file_path, output_file_path)
158 |
159 | # Step 3: upload file to S3
160 | url = upload_to_aws(output_file_path)
161 |
162 | result = {
163 | 'video_url': url
164 | }
165 | return result
166 |
167 |
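# Sketch of the looping logic above in isolation: alternate music and a short
# generated silence until the target duration is covered, then trim. The file
# name and durations are illustrative.
import numpy as np
from moviepy.editor import AudioFileClip, concatenate_audioclips
from moviepy.audio.AudioClip import AudioArrayClip

def looped_music(music_path, target_duration, pause=1.0, fps=44100):
    music = AudioFileClip(music_path)
    silence = AudioArrayClip(np.zeros((int(pause * fps), 2)), fps=fps)
    clips = [music]
    while sum(c.duration for c in clips) < target_duration:
        clips.extend([silence, music])  # the pause sits between repeats
    return concatenate_audioclips(clips).subclip(0, target_duration)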
--------------------------------------------------------------------------------
/agent-video-generator/functions/addTextToImage.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "addTextToImage",
4 | "displayName": "",
5 | "description": "This Python method downloads an image from a provided URL, adds a given title to the image, uploads the modified image to an S3 bucket, and then returns the new image's URL.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "text",
10 | "image_url"
11 | ],
12 | "properties": {
13 | "text": {
14 | "type": "string",
15 | "description": ""
16 | },
17 | "margin": {
18 | "type": "number",
19 | "description": ""
20 | },
21 | "font_name": {
22 | "type": "string",
23 | "description": ""
24 | },
25 | "font_size": {
26 | "type": "number",
27 | "description": ""
28 | },
29 | "image_url": {
30 | "type": "string",
31 | "description": "URL of the video to be downloaded"
32 | },
33 | "text_color": {
34 | "type": "string",
35 | "description": ""
36 | },
37 | "caption_position": {
38 | "type": "string",
39 | "description": ""
40 | },
41 | "text_border_size": {
42 | "type": "number",
43 | "description": ""
44 | },
45 | "text_border_color": {
46 | "type": "string",
47 | "description": ""
48 | }
49 | }
50 | },
51 | "outputPattern": {
52 | "type": "object",
53 | "required": [
54 | "image_url"
55 | ],
56 | "properties": {
57 | "image_url": {
58 | "type": "string",
59 | "description": "The presigned URL for the image uploaded to the S3 bucket"
60 | }
61 | }
62 | },
63 | "tag": "VideoGeneration",
64 | "testCases": [
65 | {
66 | "text": "",
67 | "margin": 0,
68 | "font_name": "",
69 | "font_size": 0,
70 | "image_url": "",
71 | "text_color": "",
72 | "caption_position": "",
73 | "text_border_size": 0,
74 | "text_border_color": ""
75 | }
76 | ],
77 | "aiPrompt": "",
78 | "greeting": ""
79 | }
80 | $"""
81 |
82 | import json
83 | import cv2
84 | from moviepy.editor import VideoFileClip
85 | import boto3
86 | import os
87 | import time
88 | import random
89 | import string
90 | import requests
91 | import numpy as np
92 | from PIL import ImageFont, ImageDraw, Image
93 |
94 | font_url = 'https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/fonts/{}.ttf'
95 |
96 | def upload_to_aws(filename: str) -> str:
97 | # Uses your AWS credentials to access the service
98 | bucket_name = os.environ.get('bucket_name')
99 | region = os.environ.get('region')
100 |
101 | # Create a session using the provided credentials
102 | session = boto3.Session(
103 | aws_access_key_id=os.environ.get('access_key_id'),
104 | aws_secret_access_key=os.environ.get('secret_access_key')
105 | )
106 |
107 | # Create an S3 client
108 | s3_client = session.client('s3')
109 |
110 | bucket_path = 'ai-video'
111 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
112 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
113 | url = f'{s3_base_url}{bucket_path}/{filename}'
114 |
115 | return url
116 |
117 | def download_file(url, save_path):
118 | response = requests.get(url)
119 | with open(save_path, 'wb') as file:
120 | file.write(response.content)
121 |
122 | def get_random_string():
123 | letters = string.ascii_lowercase
124 | result_str = ''.join(random.choice(letters) for _ in range(6))
125 | timestamp = int(time.time())
126 | random_str = str(timestamp) + '_' + result_str
127 | return random_str
128 |
129 | # Define color dictionary for known colors
130 | color_dict = {
131 | 'black': (0, 0, 0),
132 | 'white': (255, 255, 255),
133 | 'red': (0, 0, 255), # Remember, in OpenCV it's BGR not RGB
134 | 'green': (0, 255, 0),
135 | 'blue': (255, 0, 0),
136 | 'yellow': (0, 255, 255),
137 | 'cyan': (255, 255, 0),
138 | 'magenta': (255, 0, 255),
139 | 'light gray': (211, 211, 211),
140 | 'dark gray': (169, 169, 169),
141 | 'pink': (147, 20, 255),
142 | 'purple': (128, 0, 128),
143 | 'orange': (0, 165, 255),
144 | 'brown': (42, 42, 165)
145 | }
146 |
147 | # Define the dictionary for known font types
148 | font_dict = {
149 | 'chinese': 'NotoSansSC',
150 | 'default': 'SourceSerif4',
151 | }
152 |
153 | def wrap_text(caption, frame_width, font):
154 | words = caption.split(' ')
155 | lines = [words.pop(0)] # Initial
156 | for word in words:
157 | box = font.getbbox(lines[-1] + ' ' + word)
158 | text_width, text_height = box[2] - box[0], box[3] - box[1]
159 | if text_width > frame_width:
160 | lines.append(word)
161 | else:
162 | lines[-1] += ' ' + word
163 | return lines
164 |
165 | def add_title_to_img(image_path, caption, outfile='out.jpg', border_size=2, border_color='black', text_color='white',
166 | font_size=30, font_type='DUPLEX', caption_position='bottom', margin=0.1, font_dir=''):
167 | # Load image
168 | img_pil = Image.open(image_path)
169 | draw = ImageDraw.Draw(img_pil)
170 |
171 | width, height = img_pil.size
172 |
173 | # Get the specified font
174 | if font_type is None:
175 | font_type = 'default'
176 | if font_type in font_dict.keys():
177 | font_type = font_dict[font_type]
178 | try:
179 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size)
180 |     except OSError:  # font file missing locally: fetch it from S3
181 | if not os.path.exists(font_dir):
182 | os.makedirs(font_dir)
183 | download_file(font_url.format(font_type), f'{os.path.join(font_dir, font_type)}.ttf')
184 | font = ImageFont.truetype(f'{os.path.join(font_dir, font_type)}.ttf', size=font_size)
185 |
186 | margin_rate = int(width * margin)
187 |
188 | lines = wrap_text(caption, width - 2 * margin_rate, font)
189 | for i, line in enumerate(lines):
190 | box = font.getbbox(line)
191 | text_width, text_height = box[2] - box[0], box[3] - box[1]
192 |         text_height = font_size * 1.3  # fixed line spacing, independent of measured glyph height
193 |
194 | # Center the text
195 | textX = (width - text_width - margin_rate * 2) // 2 + margin_rate
196 | total_lines = len(lines)
197 | total_text_height = total_lines * text_height # The total height of text block
198 |
199 | # Position text as per given caption_position
200 | if caption_position.lower() == 'top':
201 | textY = margin_rate + (i * text_height)
202 | elif caption_position.lower() == 'bottom':
203 | textY = height - margin_rate - (len(lines) - i) * text_height
204 | elif caption_position.lower() == 'threequarter':
205 | three_quarter_height = height * 0.75
206 | textY = three_quarter_height - ((total_lines - i) * text_height)
207 | elif caption_position.lower() == 'onequarter':
208 | one_quarter_height = height * 0.25
209 | textY = one_quarter_height + ((i + 1) * text_height)
210 | else: # Default to center if unknown value
211 | textY = ((height - total_text_height) // 2) + (i * text_height)
212 |
213 | # Draw the outline
214 | for k in range(-border_size, border_size + 1):
215 | for j in range(-border_size, border_size + 1):
216 | draw.text((textX + j, textY + k), line, font=font, fill=border_color)
217 | # Draw the text
218 | draw.text((textX, textY), line, font=font, fill=text_color)
219 |
220 | # save the image with caption
221 | img_pil.save(outfile)
222 |
223 | def mindsflow_function(event, context) -> dict:
224 | img_url = event.get("image_url")
225 | text = event.get("text")
226 | caption_position = event.get("caption_position", "bottom")
227 | border_color = event.get("text_border_color", "black")
228 | text_color = event.get("text_color", "white")
229 | font_size = event.get("font_size", 30)
230 | margin = event.get("margin", 0.1)
231 | font_type = event.get("font_name", 'default')
232 | border_size = event.get("text_border_size", 2)
233 |
234 | download_path = "img_" + get_random_string() + ".png"
235 | out_path = "img_" + get_random_string() + ".png"
236 | download_file(img_url, download_path)
237 | # add title to the image
238 | add_title_to_img(download_path,
239 | text,
240 | outfile=out_path,
241 | caption_position=caption_position,
242 | border_color=border_color,
243 | text_color=text_color,
244 | font_size=font_size,
245 | margin=margin,
246 | font_type=font_type,
247 | border_size=border_size,
248 | font_dir = os.environ.get('font_dir')
249 | )
250 | # upload the image to s3 and get the url
251 | url = upload_to_aws(out_path)
252 |
253 | # define result
254 | result = {
255 | 'image_url': url
256 | }
257 |
258 | os.remove(download_path)
259 | os.remove(out_path)
260 |
261 | return result
262 |
263 |
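# The nested offset loop above draws the text outline manually; recent Pillow
# releases (>= 6.2) can render the same stroke natively. A minimal sketch,
# with the font file path assumed to exist locally:
from PIL import Image, ImageDraw, ImageFont

img = Image.new('RGB', (800, 200), 'black')
draw = ImageDraw.Draw(img)
font = ImageFont.truetype('SourceSerif4.ttf', 40)  # assumed local font file
draw.text((20, 80), 'Outlined caption', font=font, fill='white',
          stroke_width=2, stroke_fill='black')
img.save('outline_demo.jpg')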
--------------------------------------------------------------------------------
/agent-video-generator/functions/cloneVoiceValleX.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "cloneVoiceValleX",
4 | "displayName": "",
5 | "description": "This Python method downloads a wav file, replicates the voice, generates speech from provided text (potentially in a different language), uploads the new file to AWS, and returns the URL.\n\n- The input wav file of the speaker to be cloned MUST be < 15s\n- For now only English to Chinese is supported",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "text"
10 | ],
11 | "properties": {
12 | "text": {
13 | "type": "string",
14 | "description": "Text from which to generate the new voice"
15 | },
16 | "audio_url": {
17 | "type": "string",
18 | "description": "The url for the original audio file that needs to be processed"
19 | },
20 | "transcript": {
21 | "type": "string",
22 | "description": ""
23 | },
24 | "character_name": {
25 | "type": "string",
26 | "description": "Name of the character (optional)"
27 | }
28 | }
29 | },
30 | "outputPattern": {
31 | "type": "object",
32 | "required": [
33 | "audio_url"
34 | ],
35 | "properties": {
36 | "audio_url": {
37 | "type": "string",
38 | "description": "The URL of the audio file uploaded to AWS"
39 | }
40 | }
41 | },
42 | "tag": "VoiceCloning",
43 | "testCases": [
44 | {
45 | "text": "今天阳光明媚,温度很适宜,所以我打算去附近的公园漫步、欣赏风景、放松心情",
46 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/voice/davide1_split.wav",
47 | "transcript": "",
48 | "character_name": "tony_stark"
49 | },
50 | {
51 | "text": "I think I is going to rule the earth one day, but fortunately this day is still very far.",
52 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav",
53 | "transcript": "",
54 | "character_name": "tony_stark"
55 | },
56 | {
57 | "text": "I think I is going to rule the earth one day, 但是那天还没到",
58 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav",
59 | "transcript": "",
60 | "character_name": "tony_stark"
61 | }
62 | ],
63 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. Upload the new generated wav file to aws and return the url",
64 | "greeting": ""
65 | }
66 | $"""
67 |
68 | import os
69 | import json
70 | import boto3
71 | import requests
72 | import random
73 | import string
74 |
75 | s3 = boto3.resource('s3')
76 |
77 | def download_file(url: str, save_path: str):
78 | resp = requests.get(url)
79 | with open(save_path, 'wb') as f:
80 | f.write(resp.content)
81 |
82 |
83 | def generate_random_string(length):
84 | letters = string.ascii_letters
85 | result_str = ''.join(random.choice(letters) for i in range(length))
86 | return result_str
87 |
88 |
89 | def upload_to_aws(filename: str) -> str:
90 | bucket_name = os.environ.get('bucket_name')
91 | region = os.environ.get('region')
92 | session = boto3.Session(
93 | aws_access_key_id=os.environ.get('access_key_id'),
94 | aws_secret_access_key=os.environ.get('secret_access_key')
95 | )
96 | s3_client = session.client('s3')
97 | bucket_path = 'voice-clone'
98 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
99 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
100 | url = f'{s3_base_url}{bucket_path}/{filename}'
101 | return url
102 |
103 |
104 | def mindsflow_function(event, context) -> dict:
105 | # get from event
106 | audio_url = event.get('audio_url', None)
107 | text = event.get('text')
108 | character_name = event.get('character_name', None)
109 | transcript = event.get('transcript', None)
110 | api_ip = os.environ.get('api_ip')
111 |
112 | if character_name is None or len(character_name) == 0:
113 | character_name = 'temp'+generate_random_string(10)
114 |
115 | if transcript is not None and len(transcript) == 0:
116 | transcript = None
117 |
118 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/"
119 |
120 | data = {
121 | "audio_url": audio_url,
122 | "character_name": character_name,
123 | "transcript": transcript
124 | }
125 |
126 | headers = {
127 | 'Content-Type': 'application/json'
128 | }
129 |
130 | print('Cloning voice...')
131 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers)
132 | if response.status_code != 200:
133 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}')
134 | print('Voice cloned')
135 |
136 | voice_gen_url = f"http://{api_ip}:5001/generate_audio/"
137 |
138 | data = {
139 | 'character_name': character_name,
140 | 'text': text
141 | }
142 |
143 | print('Generating new voice...')
144 | response = requests.post(voice_gen_url, json=data)
145 | if response.status_code != 200:
146 | raise RuntimeError(f'Voice generation failed with status code: {response.status_code}')
147 | print('New voice generated')
148 |
149 |     audio_path = audio_url.split('/')[-1] if audio_url else f'{character_name}.wav'  # audio_url is optional
150 | # Save the file to the directory
151 | with open(audio_path, 'wb') as file:
152 | file.write(response.content)
153 |
154 | result_url = upload_to_aws(audio_path)
155 |
156 | # clean up
157 | os.remove(audio_path)
158 |
159 | return {
160 | "audio_url": result_url
161 | }
162 |
163 |
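# Condensed sketch of the two-step API contract used above: register the
# reference speaker, then synthesize with the cloned voice. The host value is
# an assumption; in the function it comes from os.environ['api_ip'].
import requests

API_IP = '127.0.0.1'  # assumed host

r = requests.post(f'http://{API_IP}:5000/voice_clone/',
                  json={'audio_url': 'https://example.com/ref.wav',  # < 15 s clip
                        'character_name': 'demo', 'transcript': None})
r.raise_for_status()

r = requests.post(f'http://{API_IP}:5001/generate_audio/',
                  json={'character_name': 'demo', 'text': 'Hello there'})
r.raise_for_status()
with open('generated.wav', 'wb') as f:
    f.write(r.content)  # the endpoint returns raw audio bytes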
--------------------------------------------------------------------------------
/agent-video-generator/functions/cloneVoiceVits.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "cloneVoiceVits",
4 | "displayName": "",
5 | "description": "The Python method is designed to download a WAV file from a specified URL, clone the voice from the file, and generate new speech from a supplied text (potentially in another language). The newly created WAV file is then uploaded to AWS and the URL",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "dataset_url"
10 | ],
11 | "properties": {
12 | "voice": {
13 | "type": "string",
14 | "description": "Name of the cloned voice"
15 | },
16 | "audio_split": {
17 | "type": "integer",
18 | "description": ""
19 | },
20 | "clean_noise": {
21 | "type": "boolean",
22 | "description": "Clean noise in audio for training"
23 | },
24 | "dataset_url": {
25 | "type": "string",
26 | "description": "The url of the original wav file"
27 | },
28 | "train_config": {
29 | "type": "string",
30 | "description": "The new text to generate speech"
31 | }
32 | }
33 | },
34 | "outputPattern": {
35 | "type": "object",
36 | "required": [
37 | "voice",
38 | "succeeded"
39 | ],
40 | "properties": {
41 | "voice": {
42 | "type": "string",
43 | "description": ""
44 | },
45 | "succeeded": {
46 | "type": "boolean",
47 | "description": ""
48 | }
49 | }
50 | },
51 | "tag": "VoiceCloning",
52 | "testCases": [
53 | {
54 | "voice": "",
55 | "audio_split": 12,
56 | "clean_noise": true,
57 | "dataset_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/chinese_poadcast_woman1.zip",
58 | "train_config": "config_1000"
59 | },
60 | {
61 | "voice": "",
62 | "audio_split": 0,
63 | "clean_noise": false,
64 | "dataset_url": "",
65 | "train_config": ""
66 | }
67 | ],
68 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. Upload the new generated wav file to aws and return the url",
69 | "greeting": ""
70 | }
71 | $"""
72 |
73 | import os
74 | import json
75 | import boto3
76 | import requests
77 | import random
78 | import string
79 |
80 | default_train_config = 'config_1000'
81 |
82 | def mindsflow_function(event, context) -> dict:
83 | # get from event
84 | dataset_url = event.get('dataset_url')
85 | config = event.get('train_config', default_train_config)
86 | split = event.get('audio_split', 12)
87 | clean_noise = event.get('clean_noise', False)
88 | voice = event.get('voice', None)
89 | api_ip = os.environ.get('api_ip')
90 |
91 | if config is None or len(config) == 0:
92 | config = default_train_config
93 | if voice is not None and len(voice) == 0:
94 | voice = None
95 |
96 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/"
97 |
98 | data = {
99 | "dataset_url": dataset_url,
100 | "config": config,
101 | "split": split,
102 | "clean_noise": clean_noise
103 | }
104 |
105 | headers = {
106 | 'Content-Type': 'application/json'
107 | }
108 |
109 | print('Cloning voice...')
110 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers)
111 | if response.status_code != 200:
112 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}')
113 | print('Voice cloned')
114 |
115 | response_dict = response.json()
116 |
117 | return {
118 | "succeeded": response_dict["succeeded"],
119 | "voice": response_dict["voice"] if voice is None else voice
120 | }
121 |
122 |
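# Condensed sketch of the training request above. The endpoint and payload
# keys come from this file; the host value and the interpretation of 'split'
# as a segment length in seconds are assumptions.
import requests

payload = {
    'dataset_url': 'https://example.com/dataset.zip',  # zip of wav files
    'config': 'config_1000',                           # default train config
    'split': 12,
    'clean_noise': False,
}
r = requests.post('http://127.0.0.1:5000/voice_clone/', json=payload)
r.raise_for_status()
print(r.json())  # expected keys: 'succeeded', 'voice'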
--------------------------------------------------------------------------------
/agent-video-generator/functions/deleteFilesByExtension.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "deleteFilesByExtension",
4 | "displayName": "",
5 | "description": "This method is used for deleting all files within a directory, with an optional filter for specific file extensions.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [],
9 | "properties": {}
10 | },
11 | "outputPattern": {
12 | "type": "object",
13 | "required": [
14 | "status"
15 | ],
16 | "properties": {
17 | "status": {
18 | "type": "string",
19 | "description": "Indicates whether the operation was successful"
20 | }
21 | }
22 | },
23 | "tag": "FileDeletion",
24 | "testCases": [
25 | {},
26 | {}
27 | ],
28 | "aiPrompt": "delete all files in dir, can filter by extension",
29 | "greeting": ""
30 | }
31 | $"""
32 |
33 | import json
34 | import os
35 | import glob
36 |
37 | def process_files(dir_path: str, file_type: str) -> None:
38 | # construct the path with file type
39 | file_paths = glob.glob(os.path.join(dir_path, '*.' + file_type))
40 |
41 |
42 |     # print and delete each matching file
43 | for file_path in file_paths:
44 | print(file_path)
45 | os.remove(file_path)
46 |
47 | def mindsflow_function(event, context) -> dict:
48 |     # clean the working directory; the extension list is hardcoded rather than taken from the event
49 | dir_path = ''
50 | file_type = ['wav', 'mp4', 'json', 'html', 'log', 'zip', 'srt', 'mp3', 'jpg', 'ass']
51 |
52 | for ext in file_type:
53 | # process the files and get the content
54 | process_files(dir_path, ext)
55 |
56 | # define result
57 | result = {
58 | 'status': 'ok'
59 | }
60 |
61 | return result
62 |
63 |
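# A dry-run variant of the cleanup above: list what would be removed without
# deleting anything. Purely illustrative.
import glob
import os

def preview_cleanup(dir_path='', extensions=('wav', 'mp4', 'json', 'srt')):
    for ext in extensions:
        for path in glob.glob(os.path.join(dir_path, f'*.{ext}')):
            print('would delete:', path)

# preview_cleanup()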
--------------------------------------------------------------------------------
/agent-video-generator/functions/deleteFolders.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "deleteFolders",
4 | "displayName": "",
5 | "description": "delete all folders with exceptions",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [],
9 | "properties": {}
10 | },
11 | "outputPattern": {
12 | "type": "object",
13 | "required": [
14 | "status"
15 | ],
16 | "properties": {
17 | "status": {
18 | "type": "string",
19 | "description": "Indicates whether the operation was successful"
20 | }
21 | }
22 | },
23 | "tag": "FileDeletion",
24 | "testCases": [
25 | {},
26 | {}
27 | ],
28 | "aiPrompt": "",
29 | "greeting": ""
30 | }
31 | $"""
32 |
33 | import json
34 | import os
35 | import glob
36 | import shutil
37 |
38 | exclude_list = [os.getenv('font_dir')] # define your exclude list
39 |
40 | def process_files(dir_path: str) -> None:
41 | # list all the subdirectories
42 | dir_paths = [d for d in glob.glob(os.path.join(dir_path, '*')) if os.path.isdir(d)]
43 |
44 | for dir_path in dir_paths:
45 |
46 | folder_name = os.path.basename(dir_path)
47 | # only delete the folder if it's not in the exclude list
48 | if folder_name not in exclude_list:
49 |
50 | # delete the folder
51 | shutil.rmtree(dir_path)
52 | print(f'Deleted: {dir_path}')
53 |
54 | def mindsflow_function(event, context) -> dict:
55 |     # operate on the current working directory
56 | dir_path = ''
57 |
58 | # process the directories and delete them
59 | process_files(dir_path)
60 |
61 | # define result
62 | result = {
63 | 'status': 'ok'
64 | }
65 |
66 | return result
67 |
68 |
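# Note that os.getenv('font_dir') is None when the variable is unset, so the
# exclude list above may contain None. A sketch of filtering such entries out
# before comparing folder names:
import os

exclude = {name for name in (os.getenv('font_dir'),) if name}
print('excluded folders:', exclude or '(none)')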
--------------------------------------------------------------------------------
/agent-video-generator/functions/extractVideoAudioComponents.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "extractVideoAudioComponents",
4 | "displayName": "",
5 | "description": "This method is designed to download a YouTube video, extract its audio, and upload the video without audio and the extracted audio to an S3 server, returning the respective URLs. It is presented in a way that allows for future adaptation to other platforms.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "video_url"
10 | ],
11 | "properties": {
12 | "video_url": {
13 | "type": "string",
14 | "description": "The URL of the video to be downloaded and split"
15 | }
16 | }
17 | },
18 | "outputPattern": {
19 | "type": "object",
20 | "required": [
21 | "audio_url",
22 | "video_url"
23 | ],
24 | "properties": {
25 | "audio_url": {
26 | "type": "string",
27 | "description": "The url for the downloaded audio file"
28 | },
29 | "video_url": {
30 | "type": "string",
31 | "description": "The url for the downloaded video file without audio"
32 | },
33 | "original_video_url": {
34 | "type": "string",
35 | "description": ""
36 | }
37 | }
38 | },
39 | "tag": "DataPreprocessing",
40 | "testCases": [
41 | {
42 | "video_url": "https://www.youtube.com/watch?app=desktop&v=Lv06Razi3Y4"
43 | },
44 | {
45 | "video_url": "https://www.bilibili.com/video/BV14d4y1U7iG/"
46 | },
47 | {
48 | "video_url": "https://www.instagram.com/reel/Cx43zhAvdwL/"
49 | },
50 | {
51 | "video_url": "https://www.tiktok.com/@tedtoks/video/7304757623600057631"
52 | }
53 | ],
54 | "aiPrompt": "Given the URL of a video youtube, download it, extract the audio. Upload the video without audio and the audio to S3 and return the corresponding URLs. Make the code such the download can be generalized to other platforms in the future",
55 | "greeting": ""
56 | }
57 | $"""
58 |
59 | import json
60 | from pytube import YouTube
61 | from moviepy.editor import *
62 | import boto3
63 | import uuid
64 | import os
65 | from pydub import AudioSegment
66 | import youtube_dl
67 | import requests
68 | import instaloader
69 | from urllib.parse import urlparse
70 |
71 | def extract_reel_id(url):
72 | path = urlparse(url).path
73 | segments = path.split('/')
74 | if "reel" in segments:
75 | reel_index = segments.index("reel")
76 | if reel_index+1 < len(segments):
77 | return segments[reel_index+1]
78 | return None
79 |
80 | s3_client = boto3.client('s3')
81 |
82 | def upload_to_aws(filename: str) -> str:
83 | # Uses your AWS credentials to access the service
84 | bucket_name = os.environ.get('bucket_name')
85 | region = os.environ.get('region')
86 | # Create a session using the provided credentials
87 | session = boto3.Session(
88 | aws_access_key_id=os.environ.get('access_key_id'),
89 | aws_secret_access_key=os.environ.get('secret_access_key')
90 | )
91 | # Create an S3 client
92 | s3_client = session.client('s3')
93 | bucket_path = 'ai-video'
94 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
95 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
96 | url = f'{s3_base_url}{bucket_path}/{filename}'
97 | return url
98 |
99 | ydl_opts = {
100 | 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
101 | 'postprocessors': [{
102 | 'key': 'FFmpegVideoConvertor',
103 | 'preferedformat': 'mp4',
104 | }],
105 | }
106 |
107 | def download_and_split_video(url, download_path=""):
108 | if 'youtube.com' in url:
109 | yt = YouTube(url)
110 | try:
111 | print('try download 720p')
112 | video = yt.streams.get_by_resolution('720p').download(download_path)
113 |         except Exception:  # 720p stream unavailable
114 |             print('720p unavailable, falling back to best progressive stream')
115 | video = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(download_path)
116 | elif 'www.bilibili.com' in url:
117 | with youtube_dl.YoutubeDL(ydl_opts) as ydl:
118 | info_dict = ydl.extract_info(url, download=True)
119 | video_title = ydl.prepare_filename(info_dict)
120 | video = os.path.join(download_path, video_title.replace('flv', 'mp4'))
121 | elif 'www.tiktok.com' in url:
122 | # pip install yt-dlp
123 | video_name = url.split('/')[-1]
124 | video = f"tiktok_video_{video_name}.mp4"
125 | os.system("yt-dlp {} -o {}".format(url, video))
126 | elif 'www.instagram.com' in url: # currently not working
127 | reel_id = extract_reel_id(url)
128 | L = instaloader.Instaloader()
129 | post = instaloader.Post.from_shortcode(L.context, reel_id)
130 | video_url = post.video_url
131 | video_name = f'ins_reel_{reel_id}'
132 | video = video_name + '.mp4'
133 |         from datetime import datetime
134 |         L.download_pic(filename=video_name, url=video_url, mtime=datetime.now())  # use the reel's direct video URL, not the page URL
135 | else:
136 | response = requests.get(url)
137 | video = os.path.join(download_path, url.split('/')[-1])
138 | with open(video, 'wb') as file:
139 | file.write(response.content)
140 |
141 | video_clip = VideoFileClip(video)
142 | audio = video_clip.audio
143 | video_without_audio = video_clip.without_audio()
144 | audio_file = os.path.join(download_path, f'{str(uuid.uuid4())}_audio')
145 |
146 | # Save audio to wav
147 | audio.write_audiofile(audio_file + ".wav")
148 |
149 | # Save the video file without audio
150 | video_file = os.path.join(download_path, f'{str(uuid.uuid4())}_video_no_audio.mp4')
151 | video_without_audio.write_videofile(video_file, audio=False)
152 |
153 | return audio_file + ".wav", video_file, video
154 |
155 | def mindsflow_function(event, context) -> dict:
156 | url = event.get("video_url")
157 | audio_file, video_file, original_video = download_and_split_video(url)
158 | audio_url = upload_to_aws(audio_file)
159 | video_url = upload_to_aws(video_file)
160 | original_video_url = upload_to_aws(original_video)
161 | os.remove(original_video)
162 | os.remove(audio_file)
163 | os.remove(video_file)
164 | result = {
165 | 'audio_url': audio_url,
166 | 'video_url': video_url,
167 | 'original_video_url': original_video_url
168 | }
169 | return result
170 |
171 |
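# The split step above in isolation: separate a local file into a silent
# video and a wav track. File names are illustrative.
from moviepy.editor import VideoFileClip

clip = VideoFileClip('input.mp4')
clip.audio.write_audiofile('audio.wav')                         # extract the track
clip.without_audio().write_videofile('muted.mp4', audio=False)  # drop the track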
--------------------------------------------------------------------------------
/agent-video-generator/functions/generateAudioSegmentsFromJson.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "generateAudioSegmentsFromJson",
4 | "displayName": "",
5 | "description": "Generate audio from json captions.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "json_url",
10 | "target_lang"
11 | ],
12 | "properties": {
13 | "voice": {
14 | "type": "string",
15 | "description": ""
16 | },
17 | "json_url": {
18 | "type": "string",
19 | "description": "URL of the JSON file containing captions"
20 | },
21 | "target_lang": {
22 | "type": "string",
23 | "description": "The language into which the captions should be translated"
24 | },
25 | "enhance_sync": {
26 | "type": "boolean",
27 | "description": ""
28 | },
29 | "max_speech_rate": {
30 | "type": "number",
31 | "description": ""
32 | },
33 | "min_speech_rate": {
34 | "type": "number",
35 | "description": ""
36 | },
37 | "summarize_long_sentences": {
38 | "type": "boolean",
39 | "description": ""
40 | }
41 | }
42 | },
43 | "outputPattern": {
44 | "type": "object",
45 | "required": [
46 | "translation_folder"
47 | ],
48 | "properties": {
49 | "translation_folder": {
50 | "type": "string",
51 | "description": ""
52 | }
53 | }
54 | },
55 | "tag": "TextToSpeech",
56 | "testCases": [
57 | {
58 | "voice": "zh-CN-male",
59 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedVceeOp.json",
60 | "target_lang": "zh",
61 | "enhance_sync": false,
62 | "max_speech_rate": 1.5,
63 | "min_speech_rate": 0.5,
64 | "summarize_long_sentences": false
65 | },
66 | {
67 | "voice": "it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)",
68 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedAOjUGH.json",
69 | "target_lang": "it",
70 | "enhance_sync": false,
71 | "max_speech_rate": 0,
72 | "min_speech_rate": 0,
73 | "summarize_long_sentences": false
74 | },
75 | {
76 | "voice": "zh-CN-YunfengNeural",
77 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/translatedDLYYSi.json",
78 | "target_lang": "zh",
79 | "enhance_sync": true,
80 | "max_speech_rate": 0,
81 | "min_speech_rate": 0,
82 | "summarize_long_sentences": false
83 | }
84 | ],
85 | "aiPrompt": "Is given the URL of a json file containing a set of captions and their start and duration in a video. download the file and read the content. Translate each sentence into a target language. Then generate the audio of each translated sentence. Is also given the URl of the video. download it and add each audio segment back to the video according to its start time",
86 | "greeting": ""
87 | }
88 | $"""
89 |
90 | import json
91 | import os
92 | import boto3
93 | import requests
94 | import azure.cognitiveservices.speech as speechsdk
95 | from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat
96 | import langid
97 | langid.set_languages(['en', 'zh', 'ja'])
98 | import shutil
99 | import random
100 | import string
101 | import pydub
102 |
103 |
104 | time_unit = 10000000
105 |
106 |
107 | def download_file(url, filename):
108 | res = requests.get(url)
109 | with open(filename, "wb") as f:
110 | f.write(res.content)
111 |
112 |
113 | def upload_to_aws(filename: str) -> str:
114 | bucket_name = os.environ.get('bucket_name')
115 | region = os.environ.get('region')
116 | session = boto3.Session(
117 | aws_access_key_id=os.environ.get('access_key_id'),
118 | aws_secret_access_key=os.environ.get('secret_access_key')
119 | )
120 | s3_client = session.client('s3')
121 | bucket_path = 'temp_audio'
122 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
123 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
124 | url = f'{s3_base_url}{bucket_path}/{filename}'
125 | return url
126 |
127 |
128 | def get_captions_from_url(url):
129 | filename = f"{url.split('/')[-1]}"
130 | # download the json file
131 | download_file(url, filename)
132 | # read the contents
133 | with open(filename, 'r', encoding='utf-8') as f:
134 | captions = json.load(f)
135 | return captions, filename
136 |
137 |
138 | def calculate_element_count(text):
139 | chinese_punctuations = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
140 | text = text.translate(str.maketrans('', '', string.punctuation + chinese_punctuations))
141 | # Consider language specifics (ex: Chinese is rather based on characters)
142 | if langid.classify(text)[0] in ['zh', 'ja']:
143 | return len(text.replace(' ', '')) # Spaces are not typically considered in character count
144 | else:
145 | return len(text.split())
146 |
147 |
148 | def calculate_speech_rate(text, duration):
149 | element_count = calculate_element_count(text)
150 | #print('Element count:', element_count)
151 | #print('Duration:', duration)
152 | duration_in_seconds = float(duration) / float(time_unit)
153 | #print('Duration in seconds', duration_in_seconds)
154 | speech_rate = element_count / float(duration_in_seconds) * 60.
155 | return speech_rate, element_count
156 |
157 | llm_prompt = 'Shorten the input text. The output must have less words than the input. Keep the original language.\n INPUT: {}.\n OUTPUT:'
158 | def summarize_text(input_str: str, event) -> str:
159 | data = {
160 | "style": "LLM-Only",
161 | "stream": False,
162 | "messageContent": input_str,
163 | "agentId": 964
164 | }
165 | resp = event.chat.messages(data=data)
166 | return resp
167 |
168 | lang_dict = {
169 | 'en': 'en-US',
170 | 'zh': 'zh-CN',
171 | 'ch': 'zh-CN',
172 | 'de': 'de-DE',
173 | 'ge': 'de-DE',
174 | 'it': 'it-IT',
175 | 'fr': 'fr-FR',
176 | 'sp': 'es-ES',
177 | 'es': 'es-ES',
178 | }
179 |
180 | speaker_dict = {
181 | 'en-US': 'Microsoft Server Speech Text to Speech Voice (en-US, Jessa24kRUS)',
182 | 'zh-CN': 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)',
183 | 'it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)',
184 | 'de-DE': 'Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)',
185 | 'fr-FR': 'Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)',
186 | 'es-ES': 'Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)',
187 |
188 | 'zh-CN-male': 'zh-CN-YunfengNeural',
189 | 'zh-CN-female': 'zh-CN-XiaomengNeural',
190 | }
191 |
192 | speech_rate_dict = {
193 | 'en-US': 150,
194 | 'zh-CN': 400,
195 | }
196 |
197 |
198 | def generate_audio(captions, lang: str = 'en', translation_folder: str = 'translation_folder', enhance_sync: bool = True, event = None, voice=None, summarize_long_sentences=False, min_speech_rate=0.5, max_speech_rate=1.5):
199 | if lang in lang_dict.keys():
200 | lang = lang_dict[lang]
201 |     if voice and ('male' in voice or 'female' in voice):
202 |         speaker = speaker_dict[voice]
203 |     elif voice and lang in voice:
204 |         speaker = voice
205 |     else:  # no voice given: use the default speaker for the language
206 |         speaker = speaker_dict[lang]
207 | print('Using speaker:', speaker)
208 |
209 | filename = '{}/audio_segment_{}.wav'
210 | speech_key = os.environ.get('azure_key')
211 | service_region = os.environ.get('azure_region')
212 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
213 |
214 | tot_error = []
215 | for i, cap in enumerate(captions):
216 | temp_filename = filename.format(translation_folder, str(i+1))
217 | audio_output = speechsdk.audio.AudioOutputConfig(filename=temp_filename)
218 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output)
219 | text = cap['translation']
220 | duration = cap['duration']
221 | original_text = cap['sentence']
222 | #voice_speed = (speech_rate / ai_speech_rate)
223 | print(i+1, text, original_text)
224 | voice_speed = 1.
225 | #break
226 |         if voice_speed != 1.0 and voice_speed != 1:
227 |             voice_speed = int(voice_speed * 100.0 - 100.0)
228 |             text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'><prosody rate='{voice_speed}%'>" + text + "</prosody></voice></speak>"
229 |         else:
230 |             text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'>" + text + "</voice></speak>"
231 |
232 | result = speech_synthesizer.speak_ssml_async(text).get()
233 | stream = AudioDataStream(result)
234 | stream.save_to_wav_file(temp_filename)
235 |
236 | # Get the duration of the audio file
237 | audio = pydub.AudioSegment.from_file(temp_filename)
238 | duration = audio.duration_seconds
239 |
240 | speech_rate_min, speech_rate_max = min_speech_rate, max_speech_rate
241 | if enhance_sync:
242 | text = cap['translation']
243 | dur_diff_rate = duration / (cap['duration'] / time_unit)
244 | print('Duration diff rate', dur_diff_rate)
245 | if summarize_long_sentences is True and dur_diff_rate > speech_rate_max and len(text) >= 3: # when translated audio is too long
246 | prev_text = text
247 | text = summarize_text(llm_prompt.format(text), event)
248 | print(f"Translated text is too long: {cap['duration']}s vs {duration}s. Rewording: {prev_text} -> {text} ")
249 | prev_duration = duration
250 |             err = abs(duration - cap['duration'] / time_unit)
251 | print('Before synch', prev_duration, cap['duration'] / time_unit, err)
252 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
253 | temp_filename = filename.format(translation_folder, str(i+1))
254 | audio_output = speechsdk.audio.AudioOutputConfig(filename=temp_filename)
255 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output)
256 | voice_speed = duration / (cap['duration'] / time_unit)
257 | min_speed, max_speed = speech_rate_min, speech_rate_max
258 | voice_speed = min(max_speed, max(min_speed, voice_speed))
259 | voice_speed = int(voice_speed * 100.0 - 100.0)
260 | text = f"" + text + ""
261 | result = speech_synthesizer.speak_ssml_async(text).get()
262 | stream = AudioDataStream(result)
263 | stream.save_to_wav_file(temp_filename)
264 | # Get the duration of the audio file
265 | audio = pydub.AudioSegment.from_file(temp_filename)
266 | duration = audio.duration_seconds
267 |
268 |             err = abs(duration - cap['duration'] / time_unit)
269 | print('After synch', duration, cap['duration'] / time_unit, err)
270 | tot_error.append(err)
271 |         print('Mean sync error so far:', sum(tot_error) / len(tot_error))
272 |
273 | return filename
274 |
275 |
276 | def mindsflow_function(event, context) -> dict:
277 | json_url = event.get("json_url")
278 | target_language = event.get("target_lang")
279 | enhance_sync = event.get("enhance_sync", False)
280 | summarize_long_sentences = event.get("summarize_long_sentences", None)
281 | voice = event.get("voice", None)
282 | min_speech_rate = event.get("min_speech_rate", 0.5)
283 | max_speech_rate = event.get("max_speech_rate", 1.5)
284 |
285 | if voice is not None and voice.lower() in ['none']:
286 | voice = None
287 |
288 |     audio_folder = 'audio_folder_' + ''.join(random.choice(string.ascii_letters) for _ in range(6))  # use a fixed name here when debugging
289 | if os.path.exists(audio_folder):
290 | shutil.rmtree(audio_folder)
291 | os.makedirs(audio_folder)
292 |
293 | # download and read the captions from the json file
294 | captions, _ = get_captions_from_url(json_url)
295 | # generate audios from the translated captions
296 | generate_audio(captions, target_language, audio_folder, enhance_sync, event, voice, summarize_long_sentences, min_speech_rate, max_speech_rate)
297 |
298 | return {'audio_folder': audio_folder}
299 |
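# Worked example of the sync heuristic above: a 4.0 s synthesized segment
# that must fit a 3.2 s caption slot gets its prosody rate clamped into
# [min_speech_rate, max_speech_rate] and expressed as a percentage delta.
duration, slot = 4.0, 3.2                   # illustrative values
rate = min(1.5, max(0.5, duration / slot))  # 1.25 -> speak 25% faster
print(f"<prosody rate='{int(rate * 100 - 100)}%'>")  # prints rate='25%'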
--------------------------------------------------------------------------------
/agent-video-generator/functions/generateSrtFromJson.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "generateSrtFromJson",
4 | "displayName": "",
5 | "description": "This Python method downloads a JSON file from a given URL which contains captions with their respective start, end, and duration time. It processes this data, generates a subtitle (SRT) file, and subsequently uploads it to S3 storage.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "sentences_json_url"
10 | ],
11 | "properties": {
12 | "min_words_sentence": {
13 | "type": "integer",
14 | "description": ""
15 | },
16 | "sentences_json_url": {
17 | "type": "string",
18 | "description": "URL of the JSON file containing the subtitles to be downloaded"
19 | }
20 | }
21 | },
22 | "outputPattern": {
23 | "type": "object",
24 | "required": [
25 | "srt_url"
26 | ],
27 | "properties": {
28 | "srt_url": {
29 | "type": "string",
30 | "description": "The status of the function operation"
31 | }
32 | }
33 | },
34 | "tag": "VideoCaptions",
35 | "testCases": [
36 | {
37 | "min_words_sentence": 5,
38 | "sentences_json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1703164123_oewslyvu.json"
39 | },
40 | {
41 | "min_words_sentence": 0,
42 | "sentences_json_url": ""
43 | }
44 | ],
45 | "aiPrompt": "Given the url of a json, download it. It contains some captions with their start, end and duration. The json is a list in this format\nsentence: \"今日话题做题速度太慢 怎么办?\"\nstart_time: 6000000\nend_time: 35500000\nduration: 29500000\nfrom it generate a srt file containing subtitles and upload it so s3",
46 | "greeting": ""
47 | }
48 | $"""
49 |
50 | import json
51 | import requests
52 | from typing import Dict, List
53 | import boto3
54 | from datetime import timedelta
55 | import random
56 | import os
57 | import string
58 |
59 |
60 | def download_json(url: str) -> List[Dict[str, int]]:
61 | response = requests.get(url)
62 | data = response.json()
63 | return data
64 |
65 | def upload_to_aws(filename: str, bucket_path = None) -> str:
66 | bucket_name = os.environ.get('bucket_name')
67 | region = os.environ.get('region')
68 | session = boto3.Session(
69 | aws_access_key_id=os.environ.get('access_key_id'),
70 | aws_secret_access_key=os.environ.get('secret_access_key')
71 | )
72 | s3_client = session.client('s3')
73 | if bucket_path is None:
74 | bucket_path = 'ai-video'
75 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
76 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
77 | url = f'{s3_base_url}{bucket_path}/{filename}'
78 | return url
79 |
80 | def deciseconds_to_time_format(ds: int) -> str:
81 |     ms = int(ds / 10000)  # convert 100-nanosecond units to milliseconds
82 | seconds, milliseconds = divmod(ms, 1000)
83 | minutes, seconds = divmod(seconds, 60)
84 | hours, minutes = divmod(minutes, 60)
85 | time_string = f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
86 | return time_string
87 |
88 |
89 | punctuation = '。,、?!;:“”‘’【】()《》「」.,?!;:(){}[]<>'
90 | strong_punctuation = ['.', '?', '!', '。', '?', '!']
91 | def generate_srt(subtitles: List[Dict[str, int]], min_length: int) -> str:
92 | srt_string = ""
93 | index = 1
94 | while subtitles:
95 | # Pop the first subtitle off the list
96 | subtitle = subtitles.pop(0)
97 | # Store the start and end time
98 | start_time = deciseconds_to_time_format(subtitle["start_time"])
99 | end_time = deciseconds_to_time_format(subtitle["end_time"])
100 | # Combine the sentences until the length is at least min_length
101 | combined_sentence = subtitle['sentence']
102 | while len(combined_sentence.split()) < min_length and subtitles:
103 | if combined_sentence.replace(' ', '')[-1] in strong_punctuation:
104 | break
105 | next_subtitle = subtitles.pop(0)
106 | end_time = deciseconds_to_time_format(next_subtitle["end_time"]) # update end time
107 | combined_sentence += ' ' + next_subtitle['sentence']
108 | # Remove trailing punctuation
109 |         while combined_sentence and combined_sentence[-1] in punctuation:
110 | combined_sentence = combined_sentence[:-1]
111 | # Add to the SRT string
112 | srt_string += f"{index}\n{start_time} --> {end_time}\n{combined_sentence}\n\n"
113 | index += 1
114 | return srt_string
115 |
116 |
117 | def mindsflow_function(event, context) -> dict:
118 | # get the s3 bucket, file_name, and url from the event
119 | url = event.get("sentences_json_url")
120 | min_words_sentence = event.get("min_words_sentence", 5)
121 |
122 | # download the json from the url
123 | subtitles_json = download_json(url)
124 |
125 | # generate the srt from the json
126 | srt_data = generate_srt(subtitles_json, min_words_sentence)
127 |
128 | file_name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=6))
129 | file_name_srt = file_name + '.srt'
130 | with open(file_name_srt, 'w') as file:
131 |         file.write(srt_data)
132 | srt_url = upload_to_aws(file_name_srt)
133 | os.remove(file_name_srt)
134 |
135 | print(srt_data)
136 |
137 | # define result
138 | result = {
139 | 'srt_url': srt_url,
140 | }
141 |
142 | return result
143 |
144 |
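145 | # Minimal usage sketch for generate_srt (timestamps are in the 100-nanosecond
146 | # units that deciseconds_to_time_format converts above):
147 | # subs = [
148 | #     {"sentence": "Hello there,", "start_time": 0, "end_time": 15000000},
149 | #     {"sentence": "how are you?", "start_time": 15000000, "end_time": 30000000},
150 | # ]
151 | # print(generate_srt(subs, min_length=5))
152 | # # 1
153 | # # 00:00:00,000 --> 00:00:03,000
154 | # # Hello there, how are you
155 | # (the trailing '?' is stripped by the punctuation-trimming loop)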
--------------------------------------------------------------------------------
/agent-video-generator/functions/generateVideoScript.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "generateVideoScript",
4 | "displayName": "",
5 | "description": "generate video script",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "topic"
10 | ],
11 | "properties": {
12 | "topic": {
13 | "type": "string",
14 | "description": ""
15 | },
16 | "text_style": {
17 | "type": "string",
18 | "description": ""
19 | }
20 | }
21 | },
22 | "outputPattern": {
23 | "type": "object",
24 | "required": [
25 | "json_string"
26 | ],
27 | "properties": {
28 | "json_string": {
29 | "type": "string",
30 | "description": ""
31 | }
32 | }
33 | },
34 | "tag": "VideoGeneration",
35 | "testCases": [
36 | {
37 | "topic": "Benefits of eating mango",
38 | "text_style": "scientific, straight to the point, easy to read"
39 | },
40 | {
41 | "topic": "Story of two brothers, sci-fi",
42 | "text_style": ""
43 | },
44 | {
45 | "topic": "story, sci-fi, epic",
46 | "text_style": ""
47 | }
48 | ],
49 | "aiPrompt": "",
50 | "greeting": ""
51 | }
52 | $"""
53 |
54 | import json
55 | import boto3
56 | import os
57 | import uuid
58 |
59 |
60 | def upload_to_aws(filename: str) -> str:
61 | # Uses your AWS credentials to access the service
62 | bucket_name = os.environ.get('bucket_name')
63 | region = os.environ.get('region')
64 | # Create a session using the provided credentials
65 | session = boto3.Session(
66 | aws_access_key_id=os.environ.get('access_key_id'),
67 | aws_secret_access_key=os.environ.get('secret_access_key')
68 | )
69 | # Create an S3 client
70 | s3_client = session.client('s3')
71 | bucket_path = 'ai-video'
72 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
73 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
74 | url = f'{s3_base_url}{bucket_path}/{filename}'
75 | return url
76 |
77 |
78 | def generate_story_prompt(input_str: str, event) -> str:
79 | data = {
80 | "style": "LLM-Only",
81 | "stream": False,
82 | "messageContent": input_str,
83 | "agentId": 1601
84 | }
85 | resp = event.chat.messages(data=data)
86 | return resp
87 |
88 | def generate_paragraph_prompt(input_str: str, event) -> str:
89 | data = {
90 | "style": "LLM-Only",
91 | "stream": False,
92 | "messageContent": input_str,
93 | "agentId": 1599
94 | }
95 | resp = event.chat.messages(data=data)
96 | return resp
97 |
98 | def generate_music_prompt(input_str: str, event) -> str:
99 | data = {
100 | "style": "LLM-Only",
101 | "stream": False,
102 | "messageContent": input_str,
103 | "agentId": 1604
104 | }
105 | resp = event.chat.messages(data=data)
106 | return resp
107 |
108 | prompt = 'Given a text style and a text, turn the text into that style\nTEXT: {}\nSTYLE: {}\nNEW TEXT: '
109 | def personalize_text(text: str, style: str, event) -> str:
110 | input_str = prompt.format(text, style)
111 | data = {
112 | "style": "LLM-Only",
113 | "stream": False,
114 | "messageContent": input_str,
115 | "agentId": 1548
116 | }
117 | resp = event.chat.messages(data=data)
118 | return resp
119 |
120 | def mindsflow_function(event, context) -> dict:
121 | topic = event.get("topic")
122 | style = event.get("text_style", None)
123 | return_url = event.get("return_url", True)
124 |     if 'story' in topic.lower():
125 | json_string = generate_story_prompt(topic, event)
126 | else:
127 | json_string = generate_paragraph_prompt(topic, event)
128 |
129 | json_url = None
130 | #print(json_string)
131 | dict_object = json.loads(json_string.replace('\\', ''))
132 |
133 | music_prompt = generate_music_prompt(topic, event)
134 | dict_object['music_prompt'] = music_prompt.replace('//', '').replace('"', '')
135 |
136 | if style is not None:
137 | dict_object['original_text'] = dict_object['Text']
138 | dict_object['Text'] = personalize_text(dict_object['Text'], style, event)
139 |
140 | json_path = f"script_{uuid.uuid4()}.json"
141 | with open(json_path, 'w') as f:
142 | json.dump(dict_object, f)
143 | json_url = upload_to_aws(json_path)
144 | os.remove(json_path)
145 |
146 | result = {
147 | 'json_string': json_string,
148 | 'json_url': json_url
149 | }
150 | # iterates over each key-value pair in the JSON object
151 | for key, value in dict_object.items():
152 | result[key.lower()] = value
153 |
154 | return result
155 |
156 |
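157 | # Sketch of the prompt that personalize_text sends, filled from the template
158 | # above (the TEXT/STYLE values here are examples only):
159 | #   Given a text style and a text, turn the text into that style
160 | #   TEXT: Mangoes are rich in vitamin C and fiber.
161 | #   STYLE: scientific, straight to the point, easy to read
162 | #   NEW TEXT: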
--------------------------------------------------------------------------------
/agent-video-generator/functions/generateVoiceVits.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "generateVoiceVits",
4 | "displayName": "",
5 | "description": "A Python method that downloads a specified wav file, clones the voice, generates a new speech - potentially in another language - from provided text, and subsequently uploads the newly created wav file to AWS, finally returning the URL of the uploaded file.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "voice",
10 | "audio_url"
11 | ],
12 | "properties": {
13 | "voice": {
14 | "type": "string",
15 | "description": "Voice to use to generate new audio"
16 | },
17 | "audio_url": {
18 | "type": "string",
19 | "description": "URL of the audio to be cloned"
20 | },
21 | "clean_noise": {
22 | "type": "boolean",
23 | "description": "Clean audio input noise"
24 | }
25 | }
26 | },
27 | "outputPattern": {
28 | "type": "object",
29 | "required": [
30 | "audio_url"
31 | ],
32 | "properties": {
33 | "audio_url": {
34 | "type": "string",
35 | "description": "Url of the wav audio file"
36 | }
37 | }
38 | },
39 | "tag": "VoiceCloning",
40 | "testCases": [
41 | {
42 | "voice": "chinese_poadcast_woman1",
43 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/voice/zh_woman1_split.wav",
44 | "clean_noise": true
45 | },
46 | {
47 | "voice": "",
48 | "audio_url": "",
49 | "clean_noise": false
50 | }
51 | ],
52 | "aiPrompt": "Given the url of a wav file and a text. Download the file, clone the voice and generate a speech according to the next text, the new text can also be in a different language. Upload the new generated wav file to aws and return the url",
53 | "greeting": ""
54 | }
55 | $"""
56 |
57 | import os
58 | import json
59 | import boto3
60 | import requests
61 | import random
62 | import string
63 |
64 | s3 = boto3.resource('s3')
65 |
66 | def download_file(url: str, save_path: str):
67 | resp = requests.get(url)
68 | with open(save_path, 'wb') as f:
69 | f.write(resp.content)
70 |
71 |
72 | def generate_random_string(length):
73 | letters = string.ascii_letters
74 | result_str = ''.join(random.choice(letters) for i in range(length))
75 | return result_str
76 |
77 |
78 | def upload_to_aws(filename: str) -> str:
79 | bucket_name = os.environ.get('bucket_name')
80 | region = os.environ.get('region')
81 | session = boto3.Session(
82 | aws_access_key_id=os.environ.get('access_key_id'),
83 | aws_secret_access_key=os.environ.get('secret_access_key')
84 | )
85 | s3_client = session.client('s3')
86 | bucket_path = 'voice-clone'
87 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
88 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
89 | url = f'{s3_base_url}{bucket_path}/{filename}'
90 | return url
91 |
92 |
93 | def mindsflow_function(event, context) -> dict:
94 | # get from event
95 | audio_url = event.get('audio_url')
96 |     voice = event.get('voice')
97 | clean_noise = event.get('clean_noise')
98 | api_ip = os.environ.get('api_ip')
99 |
100 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/"
101 |
102 | data = {
103 | "audio_url": audio_url,
104 | "voice": voice,
105 | "clean_noise": clean_noise
106 | }
107 |
108 | headers = {
109 | 'Content-Type': 'application/json'
110 | }
111 |
112 | print('Generating voice...')
113 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers)
114 | if response.status_code != 200:
115 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}')
116 | print('Voice generated')
117 |
118 | audio_path = voice + '_' + audio_url.split('/')[-1]
119 | # Save the file to the directory
120 | with open(audio_path, 'wb') as file:
121 | file.write(response.content)
122 |
123 | result_url = upload_to_aws(audio_path)
124 |
125 | # clean up
126 | os.remove(audio_path)
127 |
128 | return {
129 | "audio_url": result_url
130 | }
131 |
132 |
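133 | # Equivalent request sketch (api_ip comes from the environment; the payload
134 | # values are illustrative, not taken from the repo):
135 | #   POST http://<api_ip>:5001/generate_voice/
136 | #   {"audio_url": "https://.../sample.wav", "voice": "my_voice", "clean_noise": true}
137 | # The raw response body is the generated wav, which is then re-uploaded to S3.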
--------------------------------------------------------------------------------
/agent-video-generator/functions/loadJsonAndReturnKeys.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "loadJsonAndReturnKeys",
4 | "displayName": "",
5 | "description": "This method takes a string input, interprets it as a JSON object, and returns each key within it as a string.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "json_string"
10 | ],
11 | "properties": {
12 | "json_string": {
13 | "type": "string",
14 | "description": "A JSON string variable"
15 | }
16 | }
17 | },
18 | "outputPattern": {
19 | "type": "object",
20 | "required": [],
21 | "properties": {}
22 | },
23 | "tag": "DataPreprocessing",
24 | "testCases": [
25 | {
26 | "json_string": "{\n\\\"Title\\\": \\\"The Enchantments of the Mystic World\\\",\n\\\"Text\\\": \\\"In a land of dreams and lore, where mythical beasts roar under an eternally twilight sky, unfurls the enigma of a fantasy style poem. Weaving an intricate tapestry of knights and elves, wizards and dragons, this poem is a saga of heroic adventures and epic battles. Dreamlike imagery is brushstroked with sonorous verses, blending the borders of reality with the enchanting realm of magical dimensions.\\\",\n\\\"Description\\\": \\\"A brief depiction of a fantasy style poem enriching the mystical world of myths and magic, injecting life into fictional characters and their bewitching land.\\\",\n\\\"Prompt\\\": \\\"An epic painting of mythical creatures like dragons and unicorns embarking on heroic adventures, with knights and elves in a magical realm under a twilight sky.\\\",\n\\\"Hashtags\\\": \\\"#SpartanRace #SpearThrow #ObstacleCourse #FitnessGoals #RaceTraining #Endurance #GetSpartanFit\\\"\\n\n}"
27 | }
28 | ],
29 | "aiPrompt": "",
30 | "greeting": ""
31 | }
32 | $"""
33 |
34 | import json
35 |
36 | def json_from_string(json_str: str) -> dict:
37 | return json.loads(json_str)
38 |
39 | def mindsflow_function(event, context) -> dict:
40 | json_string = event.get("json_string").replace('\\n', '').replace('\n', '').replace('\\', '')
41 | print(json_string)
42 | json_data = json.loads(json_string)
43 |
44 | keys = ', '.join([str(elem) for elem in json_data.keys()])
45 |
46 | results = {}
47 |
48 | for k in json_data.keys():
49 | results[k.lower()] = json_data[k]
50 | if k.lower() == 'description' and 'Hashtags' in json_data.keys():
51 | results[k.lower()] += '\n' + json_data['Hashtags'].lower()
52 |
53 | return results
54 |
55 |
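56 | # Behavior sketch: for {"Title": "T", "Description": "D", "Hashtags": "#A #B"}
57 | # the result is {'title': 'T', 'description': 'D\n#a #b', 'hashtags': '#A #B'},
58 | # i.e. the lowercased hashtags are appended to the description.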
--------------------------------------------------------------------------------
/agent-video-generator/functions/preprocessTrainData.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "preprocessTrainData",
4 | "displayName": "",
5 | "description": "This function downloads an audio file, transforms it into a wav format, and then uploads it to a specified data storage bucket.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "audio_url"
10 | ],
11 | "properties": {
12 | "voice": {
13 | "type": "string",
14 | "description": ""
15 | },
16 | "make_zip": {
17 | "type": "boolean",
18 | "description": ""
19 | },
20 | "audio_url": {
21 | "type": "string",
22 | "description": "URL of the file to be downloaded and converted"
23 | }
24 | }
25 | },
26 | "outputPattern": {
27 | "type": "object",
28 | "required": [
29 | "audio_url"
30 | ],
31 | "properties": {
32 | "audio_url": {
33 | "type": "string",
34 | "description": "url of the converted file"
35 | }
36 | }
37 | },
38 | "tag": "VoiceCloning",
39 | "testCases": [
40 | {
41 | "voice": "hhh",
42 | "make_zip": true,
43 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/chinese_poadcast_woman1.m4a"
44 | },
45 | {
46 | "voice": "",
47 | "make_zip": false,
48 | "audio_url": ""
49 | }
50 | ],
51 | "aiPrompt": "Cloning voice...",
52 | "greeting": ""
53 | }
54 | $"""
55 |
56 | import os
57 | import json
58 | import requests
59 | from pydub import AudioSegment
60 | import boto3
61 | import zipfile
62 | import glob
63 | import shutil
64 | from datetime import datetime
65 |
66 | def download_file(url: str) -> str:
67 | local_filename = url.split('/')[-1]
68 | with requests.get(url, stream=True) as r:
69 | r.raise_for_status()
70 | with open(local_filename, 'wb') as f:
71 | for chunk in r.iter_content(chunk_size=8192):
72 | f.write(chunk)
73 | return local_filename
74 |
75 | def convert_audio_to_wav(file_path: str) -> str:
76 | audio = AudioSegment.from_file(file_path)
77 | wav_filename = os.path.splitext(file_path)[0] + '.wav'
78 | audio.export(wav_filename, format="wav")
79 | return wav_filename
80 |
81 | def upload_to_aws(filename: str) -> str:
82 | # Uses your AWS credentials to access the service
83 | bucket_name = os.environ.get('bucket_name')
84 | region = os.environ.get('region')
85 |
86 | # Create a session using the provided credentials
87 | session = boto3.Session(
88 | aws_access_key_id=os.environ.get('access_key_id'),
89 | aws_secret_access_key=os.environ.get('secret_access_key')
90 | )
91 |
92 | # Create an S3 client
93 | s3_client = session.client('s3')
94 |
95 | bucket_path = 'temp_audio'
96 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
97 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
98 | url = f'{s3_base_url}{bucket_path}/{filename}'
99 |
100 | return url
101 |
102 | def zip_wav_file(wav_file_path):
103 | # Check if file exists
104 | if not os.path.isfile(wav_file_path):
105 | print("File does not exist at provided path.")
106 | return
107 |
108 | # Extracting directory path, file name and file base name
109 | dir_path, file_name = os.path.split(wav_file_path)
110 | file_base_name, _ = os.path.splitext(file_name)
111 |
112 | # Creating new directory with same name as the wav file
113 | new_dir_path = os.path.join(dir_path, file_base_name)
114 |
115 | # If the directory already exists, append a timestamp to its name
116 | #if os.path.exists(new_dir_path):
117 | # timestamp = datetime.now().strftime("_%Y%m%d_%H%M%S")
118 | # new_dir_path += timestamp
119 |
120 | os.makedirs(new_dir_path, exist_ok=True)
121 |
122 | # Moving the wav file to the new directory
123 | shutil.move(wav_file_path, os.path.join(new_dir_path, file_name))
124 |
125 | # Creating a zip file and adding the directory with the wav file in it
126 | # If the zip file already exists, append a timestamp to its name
127 |     zip_file_path = os.path.join(dir_path, file_base_name + '.zip')
128 |     if os.path.isfile(zip_file_path):
129 |         zip_file_path = os.path.splitext(zip_file_path)[0] + datetime.now().strftime("_%Y%m%d_%H%M%S") + ".zip"
130 |
131 | with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
132 | for foldername, subfolders, filenames in os.walk(new_dir_path):
133 | for filename in filenames:
134 | # create complete filepath of file in directory
135 | file_to_zip = os.path.join(foldername, filename)
136 | # add file to zip
137 | zipf.write(file_to_zip, os.path.relpath(file_to_zip, dir_path))
138 |
139 | print(f"Zip file saved at: {zip_file_path}")
140 | return zip_file_path
141 |
142 | def mindsflow_function(event, context) -> dict:
143 | # get params from the event
144 | url = event.get("audio_url")
145 | make_zip = event.get("make_zip", False)
146 | voice = event.get("voice", None)
147 | ext = url.split('.')[-1]
148 |     if ext == 'zip':
149 |         return {
150 |             'audio_url': url
151 |         }
152 |     if ext == 'wav' and make_zip is False:
153 |         return {
154 |             'audio_url': url
155 |         }
156 |
157 | # Download file
158 | local_filename = download_file(url)
159 | if voice is not None:
160 | new_filename = f'{voice}.wav'
161 | shutil.move(local_filename, new_filename)
162 | local_filename = new_filename
163 |
164 | # Convert audio to wav
165 | wav_filename = convert_audio_to_wav(local_filename)
166 |
167 |     if make_zip:  # TODO: change file name
168 | wav_filename = zip_wav_file(wav_filename)
169 | if voice is not None:
170 | new_filename = f'{voice}.zip'
171 | shutil.move(wav_filename, new_filename)
172 | wav_filename = new_filename
173 |
174 | # Upload wav file to S3 bucket
175 | response = upload_to_aws(wav_filename)
176 |
177 | files = glob.glob('./*.zip') + glob.glob('./*.wav') + glob.glob('./*.m4a') + glob.glob('./*.mp3')
178 |     for file_name in files:
179 |         try:
180 |             os.remove(file_name)
181 |             print('File', file_name, 'removed successfully.')
182 |         except OSError:
183 |             print('Error while deleting file', file_name)
184 |
185 | # define result
186 | result = {
187 | 'audio_url': response
188 | }
189 |
190 | return result
191 |
192 |
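193 | # Resulting layout sketch: for a local 'voice.wav', zip_wav_file produces
194 | # 'voice.zip' containing 'voice/voice.wav', i.e. the wav nested in a folder
195 | # named after the file.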
--------------------------------------------------------------------------------
/agent-video-generator/functions/returnInputParameters.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "returnInputParameters",
4 | "displayName": "",
5 | "description": "This method is designed to accept and return input parameters.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [],
9 | "properties": {}
10 | },
11 | "outputPattern": {
12 | "type": "object",
13 | "required": [],
14 | "properties": {}
15 | },
16 | "tag": "ParameterReturn",
17 | "testCases": [
18 | {}
19 | ],
20 | "aiPrompt": "Return the input parameters",
21 | "greeting": ""
22 | }
23 | $"""
24 |
25 | import json
26 |
27 | def mindsflow_function(event, context) -> dict:
28 | # directly return the input parameters
29 | return event
30 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/setEpochInJsonFile.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "setEpochInJsonFile",
4 | "displayName": "",
5 | "description": "Opens a local JSON file, modifies the 'epochs' field to a specified input value (N), and saves the changes back to the same JSON file.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "epochs"
10 | ],
11 | "properties": {
12 | "epochs": {
13 | "type": "integer",
14 | "description": "New epoch value to be set in the JSON file"
15 | }
16 | }
17 | },
18 | "outputPattern": {
19 | "type": "object",
20 | "required": [
21 | "config_url"
22 | ],
23 | "properties": {
24 | "config_url": {
25 | "type": "string",
26 | "description": ""
27 | }
28 | }
29 | },
30 | "tag": "VoiceCloning",
31 | "testCases": [
32 | {
33 | "epochs": 130
34 | },
35 | {
36 | "epochs": 0
37 | }
38 | ],
39 | "aiPrompt": "Open a json file from local, change the field epoc to N, where N is an input, and save the json in the same location. The json has this structure:\n\n{\n \"train\": {\n \"log_interval\": 100,\n \"eval_interval\": 200,\n \"seed\": 1234,\n \"epochs\": 100,\n \"learning_rate\": 0.0001,\n \"betas\": [\n 0.8,\n 0.99\n ],\n \"eps\": 1e-09,\n \"batch_size\": 16,\n \"fp16_run\": false,\n \"bf16_run\": false,",
40 | "greeting": ""
41 | }
42 | $"""
43 |
44 | import json
45 | import os
46 | import boto3
47 |
48 | s3 = boto3.resource('s3')
49 |
50 | def upload_to_aws(filename: str) -> str:
51 | bucket_name = os.environ.get('bucket_name')
52 | region = os.environ.get('region')
53 | session = boto3.Session(
54 | aws_access_key_id=os.environ.get('access_key_id'),
55 | aws_secret_access_key=os.environ.get('secret_access_key')
56 | )
57 | s3_client = session.client('s3')
58 | bucket_path = 'voice-clone'
59 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
60 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
61 | url = f'{s3_base_url}{bucket_path}/{filename}'
62 | return url
63 |
64 | def modify_epochs(file_path:str, new_epoch:int) -> bool:
65 | with open(file_path, 'r') as json_file:
66 | data = json.load(json_file)
67 | data['train']['epochs'] = new_epoch
68 |
69 | new_file_name = f'config_{new_epoch}.json'
70 | new_file_path = os.path.join(os.path.dirname(file_path), new_file_name)
71 | with open(new_file_path, 'w') as new_file:
72 | json.dump(data, new_file, indent=4)
73 |
74 | return new_file_path
75 |
76 | def mindsflow_function(event, context) -> dict:
77 | # extract parameters from event
78 | file_path = 'train_configs/config.json'
79 | new_epoch = event.get("epochs")
80 |
81 | # modify the epochs in JSON file
82 | new_file_path = modify_epochs(file_path, new_epoch)
83 |
84 | url = upload_to_aws(new_file_path)
85 |
86 | os.remove(new_file_path)
87 |
88 | # formulate the result
89 | result = {
90 | 'config_url': url
91 | }
92 |
93 | return result
94 |
95 |
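96 | # Usage sketch (assumes train_configs/config.json exists and has a 'train' section):
97 | # new_path = modify_epochs('train_configs/config.json', 130)
98 | # # -> writes train_configs/config_130.json with data['train']['epochs'] == 130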
--------------------------------------------------------------------------------
/agent-video-generator/functions/splitVoiceMusic.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "splitVoiceMusic",
4 | "displayName": "",
5 | "description": "This Python method downloads an audio file from a given URL, separates music and voice using Spleeter, uploads the results to S3, and returns their URLs.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "audio_url"
10 | ],
11 | "properties": {
12 | "audio_url": {
13 | "type": "string",
14 | "description": "The url of the audio file to be processed"
15 | }
16 | }
17 | },
18 | "outputPattern": {
19 | "type": "object",
20 | "required": [
21 | "vocals_url",
22 | "accompaniment_url"
23 | ],
24 | "properties": {
25 | "vocals_url": {
26 | "type": "string",
27 | "description": "The url of the vocal part of the audio file on S3"
28 | },
29 | "accompaniment_url": {
30 | "type": "string",
31 | "description": "The url of the accompaniment part of the audio file on S3"
32 | }
33 | }
34 | },
35 | "tag": "DataPreprocessing",
36 | "testCases": [
37 | {
38 | "audio_url": "https://github.com/deezer/spleeter/raw/master/audio_example.mp3"
39 | },
40 | {
41 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/dd6f9f73-de0c-4792-a48c-ccc2e8abe7bd_audio.wav"
42 | }
43 | ],
44 | "aiPrompt": "Given the url of a audio file, download it. split music and voice with Spleeter. Upload the results to s3 and return their urls",
45 | "greeting": ""
46 | }
47 | $"""
48 |
49 | import json
50 | import os
51 | import subprocess
52 | import urllib.request
53 | import boto3
54 | from botocore.exceptions import NoCredentialsError
55 | import uuid
56 | import shutil
57 |
58 |
59 | def download_file(url: str) -> str:
60 | local_filename = url.split('/')[-1]
61 | urllib.request.urlretrieve(url, local_filename)
62 | return local_filename
63 |
64 | def upload_to_aws(filename: str) -> str:
65 | # Uses your AWS credentials to access the service
66 | bucket_name = os.environ.get('bucket_name')
67 | region = os.environ.get('region')
68 | # Create a session using the provided credentials
69 | session = boto3.Session(
70 | aws_access_key_id=os.environ.get('access_key_id'),
71 | aws_secret_access_key=os.environ.get('secret_access_key')
72 | )
73 | # Create an S3 client
74 | s3_client = session.client('s3')
75 | bucket_path = 'ai-video'
76 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
77 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
78 | url = f'{s3_base_url}{bucket_path}/{filename}'
79 | return url
80 |
81 | def mindsflow_function(event, context) -> dict:
82 |
83 |     # environment setup: reinstall ffmpeg-python and install spleeter (avoids a package conflict)
84 | def execute_command(command):
85 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
86 | output, error = process.communicate()
87 | execute_command("pip uninstall -y ffmpeg")
88 | execute_command("pip uninstall -y ffmpeg-python")
89 | execute_command("pip install ffmpeg-python")
90 | execute_command("pip install spleeter")
91 |
92 | from spleeter.separator import Separator
93 |
94 | # Get the audio URL from the event
95 | audio_url = event.get("audio_url")
96 |
97 | # Download the audio file
98 | audio_file = download_file(audio_url)
99 | audio_name = audio_file.split('.')[0]
100 |
101 | # Split the music and voice with Spleeter
102 | vocals_file = f"{audio_name}/vocals.wav"
103 | accompaniment_file = f"{audio_name}/accompaniment.wav"
104 |
105 |
106 | # Create a separator object
107 | separator = Separator('spleeter:2stems')
108 |
109 | # Use the separator to separate the streams
110 | # 'audio_example.mp3' is your input audio file
111 | separator.separate_to_file(audio_file, '')
112 |
113 | # Upload the results to S3
114 | vocals_url = upload_to_aws(vocals_file)
115 | accompaniment_url = upload_to_aws(accompaniment_file)
116 |
117 |     execute_command("pip uninstall -y spleeter")
118 |
119 | # Define result
120 | result = {
121 | 'vocals_url': vocals_url,
122 | 'accompaniment_url': accompaniment_url
123 | }
124 |
125 | # Delete the files after uploading
126 | os.remove(vocals_file)
127 | os.remove(accompaniment_file)
128 | shutil.rmtree(audio_name)
129 |
130 | return result
131 |
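132 | # Output layout note: with the 2stems model, separate_to_file(audio_file, '')
133 | # writes <audio_name>/vocals.wav and <audio_name>/accompaniment.wav, which is
134 | # why the two upload paths above are built from audio_name.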
--------------------------------------------------------------------------------
/agent-video-generator/functions/textToSpeech.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "textToSpeech",
4 | "displayName": "",
5 | "description": "This Python method converts a text string into audio. The URL of the resulting audio is then returned.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "text"
10 | ],
11 | "properties": {
12 | "text": {
13 | "type": "string",
14 | "description": "Text to convert into voice"
15 | },
16 | "speaker": {
17 | "type": "string",
18 | "description": "speaker"
19 | },
20 | "language": {
21 | "type": "string",
22 | "description": "Voice language"
23 | },
24 | "voice_speed": {
25 | "type": "number",
26 | "description": "voice speed"
27 | }
28 | }
29 | },
30 | "outputPattern": {
31 | "type": "object",
32 | "required": [
33 | "duration",
34 | "audio_url"
35 | ],
36 | "properties": {
37 | "duration": {
38 | "type": "number",
39 | "description": ""
40 | },
41 | "audio_url": {
42 | "type": "string",
43 | "description": "URL address of the generated voice"
44 | }
45 | }
46 | },
47 | "tag": "TextToSpeech",
48 | "testCases": [
49 | {
50 | "text": "Hi, my name is Hello world",
51 | "speaker": "en-US-GuyNeural",
52 | "language": "en",
53 | "voice_speed": 1
54 | },
55 | {
56 | "text": "What is the weather today?",
57 | "speaker": "",
58 | "language": "en",
59 | "voice_speed": 1
60 | },
61 | {
62 | "text": "Mi piace mangiare la pasta",
63 | "speaker": "",
64 | "language": "it",
65 | "voice_speed": 1
66 | }
67 | ],
68 | "aiPrompt": "The method converts a given text string into audio. If a sample voice is provided, the generated audio is created by cloning the sample voice. The URL address of the generated voice is returned.",
69 | "greeting": ""
70 | }
71 | $"""
72 |
73 | import os
74 | import boto3
75 | import datetime
76 | import requests
77 | from pydub import AudioSegment
78 | import pydub
79 | import azure.cognitiveservices.speech as speechsdk
80 | from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat
81 |
82 | def download_file(url, save_path):
83 | response = requests.get(url)
84 | with open(save_path, 'wb') as file:
85 | file.write(response.content)
86 | file_extension = url.split(".")[-1].lower()
87 | if file_extension == "mp3": # Convert the MP3 file to WAV
88 | audio = AudioSegment.from_mp3(save_path)
89 | audio.export(save_path, format="wav")
90 | return save_path
91 | elif file_extension == "wav":
92 | return save_path
93 | else:
94 | raise Exception("Unsupported file format. Only MP3 and WAV files are supported.")
95 |
96 | lang_dict = {
97 | 'en': 'en-US',
98 | 'ch': 'zh-CN',
99 | 'zh': 'zh-CN',
100 | 'it': 'it-IT',
101 | 'de': 'de-DE',
102 | 'fr': 'fr-FR',
103 | 'es': 'es-ES'
104 | }
105 |
106 | speaker_dict = {
107 | 'en-US': 'Microsoft Server Speech Text to Speech Voice (en-US, Jessa24kRUS)',
108 | 'zh-CN': 'Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoxiaoNeural)',
109 | 'it-IT': 'Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)',
110 | 'de-DE': 'Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)',
111 | 'fr-FR': 'Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)',
112 | 'es-ES': 'Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)'
113 | }
114 |
115 | def generate_audio(text: str, lang: str = 'en', voice_speed: float = 1.0, speaker: str = None):
116 | if lang in lang_dict.keys():
117 | lang = lang_dict[lang]
118 | print('Setting lang:', lang)
119 | if speaker is None or speaker in ['none', '']: # use default speaker
120 | speaker = speaker_dict[lang]
121 | print('Using speaker:', speaker)
122 | current_time = datetime.datetime.now()
123 | timestamp = current_time.strftime("%Y%m%d%H%M%S")
124 | filename = f'audio_{timestamp}.wav'
125 | speech_key = os.environ.get('azure_key')
126 | service_region = os.environ.get('azure_region')
127 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
128 | audio_output = speechsdk.audio.AudioOutputConfig(filename=filename)
129 | speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_output)
130 |
131 |     if voice_speed is not None and voice_speed != 1:  # may be None when not provided
132 |         voice_speed = int(voice_speed * 100.0 - 100.0)
133 |         text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'><prosody rate='{voice_speed}%'>" + text + "</prosody></voice></speak>"
134 |     else:
135 |         text = f"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='{lang}'><voice name='{speaker}'>" + text + "</voice></speak>"
136 | result = speech_synthesizer.speak_ssml_async(text).get()
137 | stream = AudioDataStream(result)
138 | stream.save_to_wav_file(filename)
139 |
140 | # Get the duration of the audio file
141 | audio = pydub.AudioSegment.from_file(filename)
142 | duration = audio.duration_seconds
143 |
144 | bucket_name = os.environ.get('bucket_name')
145 | region = os.environ.get('region')
146 |
147 | # Create a session using the provided credentials
148 | session = boto3.Session(
149 | aws_access_key_id=os.environ.get('access_key_id'),
150 | aws_secret_access_key=os.environ.get('secret_access_key')
151 | )
152 |
153 | # Create an S3 client
154 | s3_client = session.client('s3')
155 |
156 | bucket_path = 'temp_audio'
157 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
158 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
159 | video_url = f'{s3_base_url}{bucket_path}/{filename}'
160 |
161 | os.remove(filename)
162 |
163 | return video_url, duration
164 |
165 | def mindsflow_function(event, context) -> dict:
166 | # get the text and save path from the event
167 | text = event.get("text")
168 | lang = event.get("language", "en")
169 | voice_speed = event.get("voice_speed", None)
170 | speaker = event.get("speaker", None)
171 |
172 | # generate the audio file
173 | audio_url, duration = generate_audio(text, lang, voice_speed, speaker)
174 |
175 | # define result
176 | result = {
177 | 'audio_url': audio_url,
178 | 'duration': duration
179 | }
180 |
181 | return result
182 |
183 |
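184 | # Example of the SSML built above for voice_speed=1.25 (a sketch; the markup was
185 | # reconstructed, so treat it as an assumption rather than the verbatim original):
186 | # <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
187 | #   <voice name='en-US-GuyNeural'><prosody rate='25%'>Hi, my name is ...</prosody></voice>
188 | # </speak>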
--------------------------------------------------------------------------------
/agent-video-generator/functions/transcribeAudio.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "transcribeAudio",
4 | "displayName": "",
5 | "description": "This method transcribes audio into text using Azure API, maps start time and duration for each word, converts the transcription to JSON format, and uploads the resulting file to AWS S3. Its input is an audio file.",
6 | "inputPattern": {
7 | "type": "object",
8 | "properties": {
9 | "lang": {
10 | "type": "string",
11 | "description": ""
12 | },
13 | "audio_url": {
14 | "type": "string",
15 | "description": "URL string of the audio to be transcribed"
16 | }
17 | },
18 | "required": [
19 | "audio_url"
20 | ]
21 | },
22 | "outputPattern": {
23 | "type": "object",
24 | "properties": {
25 | "text": {
26 | "type": "string",
27 | "description": ""
28 | },
29 | "duration": {
30 | "type": "number",
31 | "description": ""
32 | },
33 | "transcription_json_url": {
34 | "type": "string",
35 | "description": "The transcription results from the audio file"
36 | }
37 | },
38 | "required": [
39 | "text",
40 | "duration",
41 | "transcription_json_url"
42 | ]
43 | },
44 | "tag": "TextToSpeech",
45 | "testCases": [
46 | {
47 | "lang": "en",
48 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/temp_audio/audio_20231226132719.wav"
49 | },
50 | {
51 | "lang": "en",
52 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav"
53 | }
54 | ],
55 | "aiPrompt": "This method is designed to transcribe audio using the Azure API, get the start time and duration of each word, convert the output to JSON format, and then upload the resulting file to AWS S3. The input for this process is an audio file",
56 | "greeting": ""
57 | }
58 | $"""
59 |
60 | import json
61 | import requests
62 | import boto3
63 | import time
64 | import random
65 | import string
66 | import os
67 | from azure.cognitiveservices.speech import SpeechConfig, SpeechRecognizer, AudioConfig
68 | import azure.cognitiveservices.speech as speechsdk
69 | from pydub.utils import mediainfo
70 |
71 |
72 | def get_random_string():
73 | letters = string.ascii_lowercase
74 | result_str = ''.join(random.choice(letters) for _ in range(8))
75 | timestamp = int(time.time())
76 | random_str = str(timestamp) + '_' + result_str
77 | return random_str
78 |
79 |
80 | def upload_to_aws(filename: str) -> str:
81 | # Uses your AWS credentials to access the service
82 | bucket_name = os.environ.get('bucket_name')
83 | region = os.environ.get('region')
84 |
85 | # Create a session using the provided credentials
86 | session = boto3.Session(
87 | aws_access_key_id=os.environ.get('access_key_id'),
88 | aws_secret_access_key=os.environ.get('secret_access_key')
89 | )
90 |
91 | # Create an S3 client
92 | s3_client = session.client('s3')
93 |
94 | bucket_path = 'ai-video'
95 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
96 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
97 | url = f'{s3_base_url}{bucket_path}/{filename}'
98 |
99 | return url
100 |
101 |
102 | def modify_last_word(input_string):
103 | # Remove any trailing whitespace
104 | input_string = input_string.strip()
105 |
106 | if input_string.endswith(','):
107 | # Replace the last character with a period
108 | input_string = input_string[:-1] + '.'
109 | # Check if the last word ends with a period
110 | if not input_string.endswith('.'):
111 | # Add a period at the end if the last word doesn't end with one
112 | input_string += '.'
113 |
114 | return input_string
115 |
116 |
117 | def add_punctuation(input_str: str, event) -> str:
118 | data = {
119 | "style": "LLM-Only",
120 | "stream": False,
121 | "messageContent": input_str,
122 | "agentId": 1605
123 | }
124 | resp = event.chat.messages(data=data)
125 | return resp
126 |
127 |
128 | def fix_punctuation(a_string: str, b_string: str) -> str:
129 | i_a = 0
130 | i_b = 0
131 | while i_a < len(a_string) - 1 and i_b < len(b_string) - 1:
132 | while b_string[i_b] in [',', '.', '!', '?']:
133 | i_b += 1
134 | if a_string[i_a] != ' ' and a_string[i_a + 1] != ' ' and b_string[i_b] == a_string[i_a] and (b_string[i_b+1:i_b+3] == ', ' or b_string[i_b+1:i_b+2] == ',') and b_string[i_b+3] == a_string[i_a + 1]:
135 |             # drop the inserted ", " so the two strings stay aligned
136 | b_string = b_string[:i_b+1] + b_string[i_b+3:]
137 | i_a += 1
138 | i_b += 1
139 | return b_string
140 |
141 |
142 | def transcribe_audio(audio_path: str, lang: str, event) -> dict:
143 | final_results = {'Display': '', 'Lexical': '', 'Words': [], 'Duration': 0}
144 | done = False
145 | audio_info = mediainfo(audio_path)
146 | total_duration = int(float(audio_info["duration"]) * 1e7) # convert seconds to 100-nanosecond units
147 | print('duration:', total_duration)
148 | final_results['Duration'] = total_duration
149 |
150 | def recognized_cb(evt):
151 | """callback that is called when a piece of speech is recognized"""
152 | print('RECOGNIZED: {}'.format(evt))
153 | nonlocal final_results
154 | if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
155 | json_result = json.loads(evt.result.json)
156 |
157 | lexical = json_result["NBest"][0]['Lexical'].split()
158 | display = json_result["NBest"][0]['Display'].split()
159 |         lexical = [element for element in lexical if not element.startswith("'")]
160 | #print('L', len(lexical), lexical)
161 | #print('D',len(display), display)
162 |
163 | words = []
164 | lexical_list = []
165 | #display_list = []
166 | best_words = json_result["NBest"][0]['Words']
167 | #print('B', best_words)
168 | i = 0
169 | for item in best_words:
170 | if "'" in item['Word']:
171 | print('skip:', item['Word'])
172 | continue
173 |             if best_words[i]['Offset'] + best_words[i]['Duration'] / 2 <= total_duration:
174 | words.append(best_words[i])
175 | lexical_list.append(lexical[i])
176 | #display_list.append(display[i])
177 | #print(lexical[i], best_words[i]['Word'], best_words[i]['Offset'])
178 | i += 1
179 | #print(i, len(best_words))
180 |             while i < len(best_words) and best_words[i]['Offset'] + best_words[i]['Duration'] / 2 <= total_duration:
181 |                 if i >= len(lexical):
182 | #print('DEBUG:', i, len(lexical), len(best_words))
183 | #print('DEBUG: exit cycle')
184 | break
185 | words.append(best_words[i])
186 | lexical_list.append(lexical[i])
187 | #display_list.append(display[i])
188 | #print(display[i], lexical[i], best_words[i]['Word'], best_words[i]['Offset'])
189 | i += 1
190 |
191 | #print('end record duration')
192 | #print(display[i], lexical[i], best_words[i]['Word'], best_words[i]['Offset'])
193 | lexical = ' '.join(lexical_list).strip()
194 | #display = ' '.join(display_list).strip()
195 | #print('update results')
196 | final_results['Words'] += words
197 | final_results['Lexical'] += lexical.strip() + ' '
198 | #final_results['Display'] += display.strip() + ' '
199 | #print(final_results['Lexical'] )
200 |
201 | def stop_cb(evt):
202 | """callback that stops continuous recognition on receiving an event `evt`"""
203 | print('CLOSING on {}'.format(evt))
204 | nonlocal done
205 | done = True
206 |
207 | # your Azure Speech service configuration
208 | speech_key = os.environ.get('azure_key')
209 | service_region = os.environ.get('azure_region')
210 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
211 | speech_config.request_word_level_timestamps()
212 | lang_dict = {
213 | 'en': 'en-US',
214 | 'ch': 'zh-CN',
215 | 'zh': 'zh-CN',
216 | 'it': 'it-IT',
217 | 'de': 'de-DE',
218 | 'fr': 'fr-FR',
219 | 'es': 'es-ES'
220 | }
221 | speech_config.speech_recognition_language = lang_dict[lang]
222 | #speech_config.set_property(speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "1000")
223 |
224 | # specifying audio file path
225 | audio_input = speechsdk.AudioConfig(filename=audio_path)
226 |
227 | # creating a speech recognizer
228 | speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
229 | speech_recognizer.recognized.connect(recognized_cb)
230 | speech_recognizer.session_stopped.connect(stop_cb)
231 | speech_recognizer.canceled.connect(stop_cb)
232 |
233 | # perform continuous recognition
234 | speech_recognizer.start_continuous_recognition()
235 | while not done:
236 | time.sleep(.5)
237 |
238 | final_results['Display'] = add_punctuation(final_results['Lexical'], event).strip().replace('//', '').replace('"', '')
239 | final_results['Display'] = fix_punctuation(final_results['Lexical'], final_results['Display'])
240 | print(len(final_results['Display'].split()), len(final_results['Lexical'].split()))
241 | #final_results['Display'] = modify_last_word(final_results['Display'])
242 | return final_results
243 |
244 |
245 | def mindsflow_function(event, context) -> dict:
246 | # get the audio url from the event
247 | audio_url = event.get("audio_url")
248 | lang = event.get("lang", "en")
249 |
250 | # download the audio file
251 | audio_file = requests.get(audio_url)
252 |
253 | audio_path = audio_url.split('/')[-1]
254 | with open(audio_path, 'wb') as f:
255 | f.write(audio_file.content)
256 |
257 | # get the Transcription result
258 | transcription_result = transcribe_audio(audio_path, lang, event)
259 |
260 | transcription_path = 'audio_transcription_{}.json'.format(get_random_string())
261 |     # save the transcription locally, then upload it to S3
262 | with open(transcription_path, 'w') as f:
263 | json.dump(transcription_result, f)
264 |
265 | url = upload_to_aws(transcription_path)
266 |
267 | # prepare the result
268 | result = {
269 | 'transcription_json_url': url,
270 | 'duration': transcription_result['Duration'],
271 | 'text': transcription_result['Display']
272 | }
273 |
274 | if os.path.exists(transcription_path):
275 | os.remove(transcription_path)
276 |
277 | return result
278 |
279 |
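280 | # Shape of the word-level entries accumulated in final_results['Words'] above
281 | # (Azure reports Offset and Duration in 100-nanosecond units):
282 | # {"Word": "hello", "Offset": 500000, "Duration": 3100000}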
--------------------------------------------------------------------------------
/agent-video-generator/functions/translateCaptionsJson.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "translateCaptionsJson",
4 | "displayName": "",
5 | "description": "Translate captions in json file",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "json_url",
10 | "source_language",
11 | "target_language"
12 | ],
13 | "properties": {
14 | "json_url": {
15 | "type": "string",
16 | "description": ""
17 | },
18 | "source_language": {
19 | "type": "string",
20 | "description": ""
21 | },
22 | "target_language": {
23 | "type": "string",
24 | "description": ""
25 | }
26 | }
27 | },
28 | "outputPattern": {
29 | "type": "object",
30 | "required": [
31 | "json_url"
32 | ],
33 | "properties": {
34 | "json_url": {
35 | "type": "string",
36 | "description": ""
37 | }
38 | }
39 | },
40 | "tag": "Translation",
41 | "testCases": [
42 | {
43 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1699866757_slkpxpcq.json",
44 | "source_language": "en",
45 | "target_language": "it"
46 | },
47 | {
48 | "json_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/sentence_times_1700135459_xyxdjgbl.json",
49 | "source_language": "zh",
50 | "target_language": "en"
51 | }
52 | ],
53 | "aiPrompt": "",
54 | "greeting": ""
55 | }
56 | $"""
57 |
58 | import json
59 | from googletrans import Translator, LANGUAGES
60 | import os
61 | import boto3
62 | import requests
63 | import shutil
64 | import random
65 | import string
66 | import pydub
67 |
68 |
69 | def download_file(url, filename):
70 | res = requests.get(url)
71 | with open(filename, "wb") as f:
72 | f.write(res.content)
73 |
74 |
75 | def upload_to_aws(filename: str) -> str:
76 | bucket_name = os.environ.get('bucket_name')
77 | region = os.environ.get('region')
78 | session = boto3.Session(
79 | aws_access_key_id=os.environ.get('access_key_id'),
80 | aws_secret_access_key=os.environ.get('secret_access_key')
81 | )
82 | s3_client = session.client('s3')
83 | bucket_path = 'temp_audio'
84 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
85 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
86 | url = f'{s3_base_url}{bucket_path}/{filename}'
87 | return url
88 |
89 |
90 | def get_captions_from_url(url):
91 | filename = f"{url.split('/')[-1]}"
92 | # download the json file
93 | download_file(url, filename)
94 | # read the contents
95 | with open(filename, 'r', encoding='utf-8') as f:
96 | captions = json.load(f)
97 | return captions, filename
98 |
99 |
100 | def translate_text(text, source_language, target_language):
101 | translator = Translator()
102 | lang_dict = {
103 | 'en': 'english',
104 | 'zh': 'chinese (simplified)',
105 | 'ch': 'chinese (simplified)',
106 | 'de': 'german',
107 | 'ge': 'german',
108 | 'it': 'italian',
109 | 'fr': 'french',
110 | 'sp': 'spanish',
111 | 'es': 'spanish',
112 | }
113 | source_language = lang_dict[source_language]
114 | target_language = lang_dict[target_language]
115 | #print(source_language, target_language)
116 | #print(LANGUAGES.values())
117 | if source_language not in LANGUAGES.values() or target_language not in LANGUAGES.values():
118 | return "Invalid source or target language."
119 | translation = translator.translate(text, src=source_language, dest=target_language)
120 | return translation.text
121 |
122 |
123 | # TODO: make this function independent
124 | def translate_captions(captions, source_language, target_language):
125 | translated_captions = []
126 | for cap in captions:
127 | cap['translation'] = translate_text(cap['sentence'], source_language, target_language)
128 | translated_captions.append(cap)
129 | return translated_captions
130 |
131 |
132 | def mindsflow_function(event, context) -> dict:
133 | json_url = event.get("json_url")
134 | target_language = event.get("target_language")
135 | source_language = event.get("source_language")
136 |
137 | # download and read the captions from the json file
138 | captions, json_file = get_captions_from_url(json_url)
139 | # add translated sentences into the target language
140 | translated_captions = translate_captions(captions, source_language, target_language)
141 |
142 | translated_json = 'translated' + ''.join(random.choice(string.ascii_letters) for _ in range(6)) + '.json'
143 | with open(translated_json, 'w', encoding='utf8') as f:
144 | json.dump(translated_captions, f, ensure_ascii=False, indent=4)
145 | json_url = upload_to_aws(translated_json)
146 | os.remove(translated_json)
147 |
148 | return {'json_url': json_url}
149 |
150 |
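151 | # Data-shape sketch: translate_captions adds a 'translation' field to each entry,
152 | # e.g. {"sentence": "Hello world", "start_time": 0, ...} becomes
153 | # {"sentence": "Hello world", "start_time": 0, ..., "translation": "Ciao mondo"}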
--------------------------------------------------------------------------------
/agent-video-generator/functions/translateSrtFile.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "translateSrtFile",
4 | "displayName": "",
5 | "description": "This method downloads a subtitle file from a provided URL, translates it into a specified target language while keeping the original language, uploads the translated file to S3, and removes it from the local system.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "srt_url",
10 | "source_lang",
11 | "target_lang",
12 | "show_source_lang_captions",
13 | "show_target_lang_captions"
14 | ],
15 | "properties": {
16 | "srt_url": {
17 | "type": "string",
18 | "description": "URL of the SRT file to be translated"
19 | },
20 | "source_lang": {
21 | "type": "string",
22 | "description": "The language of the original SRT file"
23 | },
24 | "target_lang": {
25 | "type": "string",
26 | "description": "The language to translate the SRT file into"
27 | },
28 | "captions_line": {
29 | "type": "integer",
30 | "description": ""
31 | },
32 | "show_source_lang_captions": {
33 | "type": "boolean",
34 | "description": ""
35 | },
36 | "show_target_lang_captions": {
37 | "type": "boolean",
38 | "description": ""
39 | }
40 | }
41 | },
42 | "outputPattern": {
43 | "type": "object",
44 | "required": [
45 | "transl_srt_url"
46 | ],
47 | "properties": {
48 | "transl_srt_url": {
49 | "type": "string",
50 | "description": "The S3 bucket path where the translated file is uploaded"
51 | }
52 | }
53 | },
54 | "tag": "VideoCaptions",
55 | "testCases": [
56 | {
57 | "srt_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/01w5zy.srt",
58 | "source_lang": "it",
59 | "target_lang": "ch",
60 | "captions_line": 15,
61 | "show_source_lang_captions": true,
62 | "show_target_lang_captions": false
63 | },
64 | {
65 | "srt_url": "",
66 | "source_lang": "",
67 | "target_lang": "de",
68 | "captions_line": 0,
69 | "show_source_lang_captions": false,
70 | "show_target_lang_captions": false
71 | }
72 | ],
73 | "aiPrompt": "Given the url of a srt file, download it and translate subtitle to a target language. The final srt file must contain subtitles in both languages. Input include also original and target language. Finally upload the file to s3 and remote if from local.",
74 | "greeting": ""
75 | }
76 | $"""
77 |
78 | import json
79 | import boto3
80 | import requests
81 | from googletrans import Translator
82 | from pysrt import open as open_srt
83 | import random
84 | import os
85 | import string
86 |
87 |
88 | def download_file(url: str, local_path: str) -> bool:
89 | r = requests.get(url, allow_redirects=True)
90 | open(local_path, 'wb').write(r.content)
91 | return True
92 |
93 |
94 | def translate_text(input_file_path: str, output_file_path: str, origin_lang: str, target_lang: str) -> str:
95 | translator = Translator()
96 | srt_file = open_srt(input_file_path)
97 | for line in srt_file:
98 | translated_text = translator.translate(line.text, src=origin_lang, dest=target_lang)
99 | line.text += "\n" + translated_text.text
100 | srt_file.save(output_file_path, encoding='utf-8')
101 | return True
102 |
103 | def split_text(input_file_path, output_file_path: str, source_lang: str, target_lang: str, new_line_after: int = 15, show_target_lang_captions = True, show_source_lang_captions = True) -> bool:
104 | srt_file = open_srt(input_file_path)
105 | for line in srt_file:
106 | source_text, trans_text = line.text.split("\n")
107 | if "chinese" in target_lang or "japanese" in target_lang:
108 | trans_text = '\n'.join(trans_text[i:min(i+new_line_after, len(trans_text))] for i in range(0, len(trans_text), new_line_after))
109 | if "chinese" in source_lang or "japanese" in source_lang:
110 | source_text = '\n'.join(source_text[i:min(i+new_line_after, len(source_text))] for i in range(0, len(source_text), new_line_after))
111 | if show_source_lang_captions is False and show_target_lang_captions is True:
112 | line.text = trans_text
113 | elif show_target_lang_captions is False and show_source_lang_captions is True:
114 | line.text = source_text
115 | else:
116 | line.text = source_text + "\n" + trans_text
117 | srt_file.save(output_file_path, encoding='utf-8')
118 | return True
119 |
120 |
121 | llm_prompt = '''Given the input sentence in {}, correct any logical, semantic or spelling mistake. If possible also summarize the corrected sentence. Return only the correct sentence.
122 | SENTENCE: {}
123 | CORRECT SENTENCE: '''
124 | def fix_text(input_str: str, event) -> str:
125 | data = {
126 | "style": "LLM-Only",
127 | "stream": False,
128 | "messageContent": input_str,
129 | "agentId": 964
130 | }
131 | resp = event.chat.messages(data=data)
132 | return resp
133 |
134 |
135 | def fix_srt_file(input_file_path: str, origin_lang: str, event) -> bool:
136 | srt_file = open_srt(input_file_path)
137 | for line in srt_file:
138 | temp_prompt = llm_prompt.format(origin_lang, line.text)
139 | fixed_text = fix_text(temp_prompt, event)
140 | #print(line.text, fixed_text)
141 | line.text = fixed_text
142 | srt_file.save(input_file_path, encoding='utf-8')
143 | return True
144 |
145 |
146 | s3_client = boto3.client('s3')
147 |
148 | def upload_to_aws(filename: str) -> str:
149 | # Uses your AWS credentials to access the service
150 | bucket_name = os.environ.get('bucket_name')
151 | region = os.environ.get('region')
152 | # Create a session using the provided credentials
153 | session = boto3.Session(
154 | aws_access_key_id=os.environ.get('access_key_id'),
155 | aws_secret_access_key=os.environ.get('secret_access_key')
156 | )
157 | # Create an S3 client
158 | s3_client = session.client('s3')
159 | bucket_path = 'ai-video'
160 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
161 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
162 | url = f'{s3_base_url}{bucket_path}/{filename}'
163 | return url
164 |
165 | def remove_file(local_path: str):
166 | os.remove(local_path)
167 |
168 |
169 | def mindsflow_function(event, context) -> dict:
170 | srt_file_url = event.get("srt_url")
171 | src_lang = event.get("source_lang")
172 | tgt_lang = event.get("target_lang")
173 | captions_line = event.get("captions_line", 15)
174 | show_target_lang_captions = event.get("show_target_lang_captions", True)
175 | show_source_lang_captions = event.get("show_source_lang_captions", True)
176 |
177 | input_file_path = "input_file.srt"
178 | random_string = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(6))
179 | output_file_path = random_string + "output_file.srt"
180 |
181 | download_file(srt_file_url, input_file_path)
182 |
183 | lang_dict = {
184 | 'en': 'english',
185 | 'zh': 'chinese (simplified)',
186 | 'ch': 'chinese (simplified)',
187 | 'de': 'german',
188 | 'ge': 'german',
189 | 'it': 'italian',
190 | 'fr': 'french',
191 | 'sp': 'spanish',
192 | 'es': 'spanish',
193 | }
194 | if src_lang in lang_dict.keys():
195 | src_lang = lang_dict[src_lang]
196 | if tgt_lang in lang_dict.keys():
197 | tgt_lang = lang_dict[tgt_lang]
198 |
199 | #fix_srt_file(input_file_path, src_lang, event)
200 |
201 | translate_text(input_file_path, input_file_path, src_lang, tgt_lang)
202 | split_text(input_file_path, output_file_path, src_lang, tgt_lang, new_line_after=captions_line, show_target_lang_captions=show_target_lang_captions,
203 | show_source_lang_captions=show_source_lang_captions)
204 |
205 | trans_srt_url = upload_to_aws(output_file_path)
206 |
207 | remove_file(input_file_path)
208 | remove_file(output_file_path)
209 |
210 | result = {
211 | 'transl_srt_url': trans_srt_url
212 | }
213 |
214 | return result
215 |
216 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/translateTargetToSource.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "translateTargetToSource",
4 | "displayName": "",
5 | "description": "This method is designed to translate text from one language to another, utilizing the target language as input and outputting the translation in the source language.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "text",
10 | "source_lang",
11 | "target_lang"
12 | ],
13 | "properties": {
14 | "text": {
15 | "type": "string",
16 | "description": "Text to be translated"
17 | },
18 | "source_lang": {
19 | "type": "string",
20 | "description": "Source language of the text"
21 | },
22 | "target_lang": {
23 | "type": "string",
24 | "description": "Target language to translate the text into"
25 | }
26 | }
27 | },
28 | "outputPattern": {
29 | "type": "object",
30 | "required": [
31 | "text"
32 | ],
33 | "properties": {
34 | "text": {
35 | "type": "string",
36 | "description": "The translated text"
37 | }
38 | }
39 | },
40 | "tag": "Translation",
41 | "testCases": [
42 | {
43 | "text": "Hello world",
44 | "source_lang": "english",
45 | "target_lang": "chinese (simplified)"
46 | },
47 | {
48 | "text": "Guten tag",
49 | "source_lang": "",
50 | "target_lang": ""
51 | }
52 | ],
53 | "aiPrompt": "Translate a text from target language to source language",
54 | "greeting": ""
55 | }
56 | $"""
57 |
58 | import json
59 | from googletrans import Translator, LANGUAGES
60 |
61 | def translate_text(text, source_language, target_language):
62 | #print(LANGUAGES.values())
63 | translator = Translator()
64 |
65 | if source_language not in LANGUAGES.values() or target_language not in LANGUAGES.values():
66 | return "Invalid source or target language."
67 |
68 | translation = translator.translate(text, src=source_language, dest=target_language)
69 |
70 | return translation.text
71 |
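# Hypothetical usage example (assumes googletrans 3.x can reach its backend):
#   translate_text("Hello world", "english", "chinese (simplified)")
#   -> a Chinese rendering of "Hello world", e.g. "你好世界"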
72 | def mindsflow_function(event, context) -> dict:
73 | # get the text and languages from the event
74 | text = event.get("text")
75 | src_lang = event.get("source_lang")
76 | tgt_lang = event.get("target_lang")
77 |
78 | lang_dict = {
79 | 'en': 'english',
80 | 'zh': 'chinese (simplified)',
81 | 'ch': 'chinese (simplified)',
82 | 'de': 'german',
83 | 'ge': 'german',
84 | 'it': 'italian',
85 | 'fr': 'french',
86 | 'sp': 'spanish',
87 | 'es': 'spanish',
88 | }
89 | if src_lang in lang_dict.keys():
90 | src_lang = lang_dict[src_lang]
91 | if tgt_lang in lang_dict.keys():
92 | tgt_lang = lang_dict[tgt_lang]
93 |
94 | # get the translation result
95 | translation_result = translate_text(text, src_lang, tgt_lang)
96 |
97 | # define result
98 | result = {
99 | 'text': translation_result
100 | }
101 |
102 | return result
103 |
104 |
--------------------------------------------------------------------------------
/agent-video-generator/functions/uploadYoutubeVideo.py:
--------------------------------------------------------------------------------
1 | """$
2 | {
3 | "name": "uploadYoutubeVideo",
4 | "displayName": "",
5 | "description": "Manages the process of uploading a video to YouTube inclusive of its URL, title, description, and category, after which it deletes the video and returns a success status. The YouTube credentials are loaded from a JSON file.",
6 | "inputPattern": {
7 | "type": "object",
8 | "required": [
9 | "title",
10 | "upload",
11 | "category",
12 | "video_url",
13 | "description",
14 | "account_name"
15 | ],
16 | "properties": {
17 | "title": {
18 | "type": "string",
19 | "description": "Title of the video to be uploaded"
20 | },
21 | "upload": {
22 | "type": "boolean",
23 | "description": ""
24 | },
25 | "category": {
26 | "type": "string",
27 | "description": "Category of the video to be uploaded"
28 | },
29 | "video_url": {
30 | "type": "string",
31 | "description": "URL of the video to be uploaded to YouTube"
32 | },
33 | "description": {
34 | "type": "string",
35 | "description": "Description of the video to be uploaded"
36 | },
37 | "account_name": {
38 | "type": "string",
39 | "description": ""
40 | }
41 | }
42 | },
43 | "outputPattern": {
44 | "type": "object",
45 | "required": [
46 | "upload_success"
47 | ],
48 | "properties": {
49 | "upload_success": {
50 | "type": "boolean",
51 | "description": "A boolean flag indicating if the video was successfully uploaded to YouTube"
52 | }
53 | }
54 | },
55 | "tag": "UploadVideo",
56 | "testCases": [
57 | {
58 | "title": "Sample Video 1",
59 | "upload": false,
60 | "category": "Music",
61 | "video_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/ai-video/output_1696843400_daefppdn.mp4",
62 | "description": "This is a sample video 1 for testing.",
63 | "account_name": "mindsflow.ai"
64 | },
65 | {
66 | "title": "Sample Video 2",
67 | "upload": false,
68 | "category": "Test",
69 | "video_url": "https://example.com/video2.mp4",
70 | "description": "This is a sample video 2 for testing.",
71 | "account_name": ""
72 | }
73 | ],
74 | "aiPrompt": "Upload video to youtube, input are video URL, title, description and category, delete the video after upload. Read youtube credentials from json file. Return succeeded True or False",
75 | "greeting": ""
76 | }
77 | $"""
78 |
79 | from youtube_upload.client import YoutubeUploader
80 | import json
81 | import os
82 | import requests
83 |
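# maps human-readable YouTube category names to the numeric categoryId values used by the YouTube Data API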
84 | category_dict = {
85 | 'Autos & Vehicles': '2',
86 | 'Film & Animation': '1',
87 | 'Music': '10',
88 | 'Pets & Animals': '15',
89 | 'Sports': '17',
90 | 'Short Movies': '18',
91 | 'Travel & Events': '19',
92 | 'Gaming': '20',
93 | 'Videoblogging': '21',
94 | 'People & Blogs': '22',
95 | 'Comedy': '23',
96 | 'Entertainment': '24',
97 | 'News & Politics': '25',
98 | 'Howto & Style': '26',
99 | 'Education': '27',
100 | 'Science & Technology': '28',
101 | 'Nonprofits & Activism': '29',
102 | 'Movies': '30',
103 | 'Anime/Animation': '31',
104 | 'Action/Adventure': '32',
105 | 'Classics': '33',
106 | 'Documentary': '35',
107 | 'Drama': '36',
108 | 'Family': '37',
109 | 'Foreign': '38',
110 | 'Horror': '39',
111 | 'Sci-Fi/Fantasy': '40',
112 | 'Thriller': '41',
113 | 'Shorts': '42',
114 | 'Shows': '43',
115 | 'Trailers': '44'
116 | }
117 |
118 | credentials_path = 'youtube'
119 | config_file = f'{credentials_path}/api_tokens.json'
120 |
121 | def download_file(url, new_file_name):
122 | response = requests.get(url)
123 | with open(new_file_name, 'wb') as f:
124 | f.write(response.content)
125 | return new_file_name
126 |
127 | def mindsflow_function(event, context) -> dict:
128 | video_url = event.get("video_url")
129 | title = event.get("title")
130 | description = event.get("description")
131 | category = event.get("category")
132 | account_name = event.get("account_name")
133 | upload = event.get("upload", True)
134 |
135 |     if not upload:
136 | return {
137 | 'upload_success': False
138 | }
139 |
140 | # download the video
141 | video_path = download_file(video_url, 'video_youtube.mp4')
142 |
143 | with open(config_file, 'r') as json_file:
144 | data = json.load(json_file)
145 | account = data[account_name]
146 |
147 | # Get the credentials
148 | refresh_token = account['refresh_token']
149 | access_token = account['access_token']
150 | secrets_file = account['secrets_file']
151 |
152 | uploader = YoutubeUploader(secrets_file_path=f'{credentials_path}/{secrets_file}')
153 | uploader.authenticate(refresh_token=refresh_token,
154 | access_token=access_token)
155 |
156 | # Video options
157 | options = {
158 | "title" : title, # The video title
159 | "description" : description, # The video description
160 | "tags" : [],
161 | "categoryId" : category_dict[category],
162 | "privacyStatus" : "public", # Video privacy. Can either be "public", "private", or "unlisted"
163 | "kids" : True, # Specifies if the Video if for kids or not. Defaults to False.
164 | #"thumbnailLink" : "https://cdn.havecamerawilltravel.com/photographer/files/2020/01/youtube-logo-new-1068x510.jpg" # Optional. Specifies video thumbnail.
165 | }
166 |
167 | # upload video
168 | try:
169 | uploader.upload(video_path, options)
170 | success = True
171 |     except Exception:
172 | success = False
173 |
174 | os.remove(video_path)
175 |
176 | # define result
177 | result = {
178 | 'upload_success': success
179 | }
180 |
181 | return result
182 |
183 |
--------------------------------------------------------------------------------
/results/flow/part1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part1.png
--------------------------------------------------------------------------------
/results/flow/part2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part2.png
--------------------------------------------------------------------------------
/results/flow/part3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/part3.png
--------------------------------------------------------------------------------
/results/flow/translation/part1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part1.png
--------------------------------------------------------------------------------
/results/flow/translation/part2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part2.png
--------------------------------------------------------------------------------
/results/flow/translation/part3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/flow/translation/part3.png
--------------------------------------------------------------------------------
/results/into_video_transl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/into_video_transl.png
--------------------------------------------------------------------------------
/results/intro.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/intro.jpg
--------------------------------------------------------------------------------
/results/videos/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video1.mp4
--------------------------------------------------------------------------------
/results/videos/video1_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video1_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video2.mp4
--------------------------------------------------------------------------------
/results/videos/video2_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video2_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video3.mp4
--------------------------------------------------------------------------------
/results/videos/video3_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video3_transl.mp4
--------------------------------------------------------------------------------
/results/videos/video4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video4.mp4
--------------------------------------------------------------------------------
/results/videos/video4_transl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davide97l/ai-video-generator/3760447cf99942040e1f291050cef752fc49427d/results/videos/video4_transl.mp4
--------------------------------------------------------------------------------
/video_translation.md:
--------------------------------------------------------------------------------
1 | # AI video translator agent
2 |
3 | 
4 |
5 | This project utilizes advanced AI techniques to translate the spoken language in videos from one language to another.
6 | It also incorporates an automatic caption generation system that generates translated subtitles for improved accessibility and understanding.
7 | The agent is hosted on [Mindsflow.ai](https://mindsflow.ai/).
8 |
9 | ## Features
10 |
11 | - **Automatically translated speech**: Using [Azure-API](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech), the agent can translate the speech of the original video to a target language.
12 |
13 | - **Automatically translated captions**: the video's speech transcription is first translated with [googletrans](https://pypi.org/project/googletrans/), then the translated captions are added to the video with [ffmpeg](https://ffmpeg.org/about.html) (a sketch of this step follows the note below).
14 |
15 | - **Automatic upload**: Once the video is ready, the agent can automatically upload it to your favourite social media platform.
16 |
17 | - **Flow-based programming**: The agent is based on a [flow-based programming](https://en.wikipedia.org/wiki/Flow-based_programming) model that assembles different AI and algorithmic components into a complete pipeline. The flow is developed and hosted on [Mindsflow.ai](https://mindsflow.ai/). All the blocks of the flow are available [here](agent-video-generator/functions).
18 |
19 | **Note**: running this agent requires an [Azure API](https://azure.microsoft.com/en-us/products/api-management/?ef_id=_k_Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB_k_&OCID=AIDcmmy6frl1tq_SEM__k_Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB_k_&gad_source=1&gclid=Cj0KCQiA5-uuBhDzARIsAAa21T9Ii5vg2kAFHYwFfD2k7pnFp1Rg-HbVmvAOKfTrqq5Ue2TfbAIdahEaAmkWEALw_wcB) key.
20 |
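For illustration, the caption-burning step is roughly equivalent to this standalone sketch (hypothetical code, not the exact flow block; `burn_captions` and the file names are illustrative, and the real blocks live in [agent-video-generator/functions](agent-video-generator/functions)):

```python
import subprocess

def burn_captions(video_path: str, srt_path: str, output_path: str) -> None:
    # ffmpeg's subtitles filter (needs a build with libass) renders the SRT
    # captions onto the video frames; the audio stream is copied unchanged
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path,
         "-vf", f"subtitles={srt_path}",
         "-c:a", "copy", output_path],
        check=True,
    )

burn_captions("video.mp4", "captions_translated.srt", "video_transl.mp4")
```
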
21 | ## Results
22 |
23 | The table below links to some generated samples.
24 |
25 | | Original | Translation (Chinese) |
26 | |--------------------------------------------------------------|------------------------------------------------------|
27 | | https://youtube.com/shorts/wKOBppgV2R0?feature=share | https://youtube.com/shorts/Zbk02MsAoko?feature=share |
28 | | https://youtube.com/shorts/6dgCNjVMBpM?feature=share | https://youtube.com/shorts/wbZbhVQmpNw?feature=share |
29 | | https://youtube.com/shorts/wKOBppgV2R0?feature=share | https://youtube.com/shorts/GCIVzEHeeqM?feature=share |
30 | | https://youtu.be/ERfDdZq9ve8 | https://youtu.be/a1tKgG0UP6A |
31 |
32 | More results are available on [Douyin](https://www.douyin.com/user/MS4wLjABAAAAnDmwuk2SS4WBc8swBbYhtbGpH1Mrp3nlHrTnMcyDJdW5RUsr4BCajyo716Wyc76L?is_search=0&list_name=follow&nt=0).
33 |
34 | ## Flow
35 |
36 | | Part 1 | Part 2 | Part 3 |
37 | |---------------------------------------------------|-------------------------------------|-------------------------------------|
38 | |  |  |  |
39 |
40 | For more details, see the full images [here](./results/flow/translation/).
41 |
42 | ## Input format
43 |
44 | ```
45 | {
46 |     "voice": "zh-CN-YunfengNeural",  # full list of voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
47 |     "video_url": "video_to_translate.mp4",  # link to the video to be translated; must point to an mp4 video file stored online
48 |     "source_language": "en",  # original language of the video
49 |     "target_language": "zh",  # language to translate the video into
50 |     "account_name": "mindsflow.ai",  # only needed if the video should be uploaded automatically to your platform
51 |     "upload": false  # whether to upload the video to social media
52 | }
53 | ```
54 |
55 | ## Output format
56 |
57 | The output of the agent is structured in the following way:
58 |
59 | ```
60 | {
61 | "result": "link to result"
62 | }
63 | ```
64 | In this output, `result` is a link pointing to a ZIP file, which contains:
65 |
66 | - The generated (translated) video in MP4 format
67 | - The original video in MP4 format
68 | - The original subtitles in SRT format
69 | - The translated subtitles in SRT format
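
A minimal sketch of consuming this output (assuming the link is directly downloadable; the target folder name is illustrative):

```python
import io
import zipfile

import requests

result_url = "<result link returned by the agent>"
resp = requests.get(result_url)
resp.raise_for_status()

# unpack the archive in memory and extract everything into ./result
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    print(zf.namelist())  # the videos and .srt files listed above
    zf.extractall("result")
```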
70 |
71 | ## Extra
72 |
73 | Try out more AI agents at [https://chat.mindsflow.ai/en-US/explore](https://chat.mindsflow.ai/en-US/explore).
--------------------------------------------------------------------------------
/voice_clone/functions/clone_voice_vits.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import boto3
4 | import requests
5 | import random
6 | import string
7 |
8 | default_train_config = 'config_1000'
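# default training config; set_epoch_in_json_config.py generates config_<epochs>.json files of this form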
9 |
10 | def mindsflow_function(event, context) -> dict:
11 | # get from event
12 | dataset_url = event.get('dataset_url')
13 | config = event.get('train_config', default_train_config)
14 | split = event.get('audio_split', 12)
15 | clean_noise = event.get('clean_noise', False)
16 | voice = event.get('voice', None)
17 | api_ip = os.environ.get('api_ip')
18 |
19 | if config is None or len(config) == 0:
20 | config = default_train_config
21 | if voice is not None and len(voice) == 0:
22 | voice = None
23 |
24 | voice_clone_url = f"http://{api_ip}:5000/voice_clone/"
25 |
26 | data = {
27 | "dataset_url": dataset_url,
28 | "config": config,
29 | "split": split,
30 | "clean_noise": clean_noise
31 | }
32 |
33 | headers = {
34 | 'Content-Type': 'application/json'
35 | }
36 |
37 | print('Cloning voice...')
38 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers)
39 | if response.status_code != 200:
40 | raise RuntimeError(f'Voice cloning failed with status code: {response.status_code}')
41 | print('Voice cloned')
42 |
43 | response_dict = response.json()
44 |
45 | return {
46 | "succeeded": response_dict["succeeded"],
47 | "voice": response_dict["voice"] if voice is None else voice
48 | }
49 |
--------------------------------------------------------------------------------
/voice_clone/functions/generate_voice_vits.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import boto3
4 | import requests
5 | import random
6 | import string
7 |
8 | s3 = boto3.resource('s3')
9 |
10 | def download_file(url: str, save_path: str):
11 | resp = requests.get(url)
12 | with open(save_path, 'wb') as f:
13 | f.write(resp.content)
14 |
15 |
16 | def generate_random_string(length):
17 | letters = string.ascii_letters
18 | result_str = ''.join(random.choice(letters) for i in range(length))
19 | return result_str
20 |
21 |
22 | def upload_to_aws(filename: str) -> str:
23 | bucket_name = os.environ.get('bucket_name')
24 | region = os.environ.get('region')
25 | session = boto3.Session(
26 | aws_access_key_id=os.environ.get('access_key_id'),
27 | aws_secret_access_key=os.environ.get('secret_access_key')
28 | )
29 | s3_client = session.client('s3')
30 | bucket_path = 'voice-clone'
31 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
32 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
33 | url = f'{s3_base_url}{bucket_path}/{filename}'
34 | return url
35 |
36 |
37 | def mindsflow_function(event, context) -> dict:
38 | # get from event
39 | audio_url = event.get('audio_url')
40 |     voice = event.get('voice')
41 | clean_noise = event.get('clean_noise')
42 | api_ip = os.environ.get('api_ip')
43 |
44 | voice_clone_url = f"http://{api_ip}:5001/generate_voice/"
45 |
46 | data = {
47 | "audio_url": audio_url,
48 | "voice": voice,
49 | "clean_noise": clean_noise
50 | }
51 |
52 | headers = {
53 | 'Content-Type': 'application/json'
54 | }
55 |
56 | print('Generating voice...')
57 | response = requests.post(voice_clone_url, data=json.dumps(data), headers=headers)
58 | if response.status_code != 200:
59 |         raise RuntimeError(f'Voice generation failed with status code: {response.status_code}')
60 | print('Voice generated')
61 |
62 | audio_path = voice + '_' + audio_url.split('/')[-1]
63 | # Save the file to the directory
64 | with open(audio_path, 'wb') as file:
65 | file.write(response.content)
66 |
67 | result_url = upload_to_aws(audio_path)
68 |
69 | # clean up
70 | os.remove(audio_path)
71 |
72 | return {
73 | "audio_url": result_url
74 | }
75 |
--------------------------------------------------------------------------------
/voice_clone/functions/set_epoch_in_json_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import boto3
4 |
5 | s3 = boto3.resource('s3')
6 |
7 | def upload_to_aws(filename: str) -> str:
8 | bucket_name = os.environ.get('bucket_name')
9 | region = os.environ.get('region')
10 | session = boto3.Session(
11 | aws_access_key_id=os.environ.get('access_key_id'),
12 | aws_secret_access_key=os.environ.get('secret_access_key')
13 | )
14 | s3_client = session.client('s3')
15 | bucket_path = 'voice-clone'
16 | s3_client.upload_file(f"{filename}", bucket_name, f"{bucket_path}/{filename}")
17 | s3_base_url = f'https://{bucket_name}.s3.{region}.amazonaws.com/'
18 | url = f'{s3_base_url}{bucket_path}/{filename}'
19 | return url
20 |
21 | def modify_epochs(file_path: str, new_epoch: int) -> str:
22 | with open(file_path, 'r') as json_file:
23 | data = json.load(json_file)
24 | data['train']['epochs'] = new_epoch
25 |
26 | new_file_name = f'config_{new_epoch}.json'
27 | new_file_path = os.path.join(os.path.dirname(file_path), new_file_name)
28 | with open(new_file_path, 'w') as new_file:
29 | json.dump(data, new_file, indent=4)
30 |
31 | return new_file_path
32 |
33 | def mindsflow_function(event, context) -> dict:
34 | # extract parameters from event
35 | file_path = 'train_configs/config.json'
36 | new_epoch = event.get("epochs")
37 |
38 | # modify the epochs in JSON file
39 | new_file_path = modify_epochs(file_path, new_epoch)
40 |
41 | url = upload_to_aws(new_file_path)
42 |
43 | os.remove(new_file_path)
44 |
45 | # formulate the result
46 | result = {
47 | 'config_url': url
48 | }
49 |
50 | return result
51 |
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/functions.py:
--------------------------------------------------------------------------------
1 | from pydub import AudioSegment
2 | import os
3 | from scipy.io import wavfile
4 | import noisereduce as nr
5 | import numpy as np
6 | import wave
7 | import requests
8 |
9 |
10 | def is_stereo(filename):
11 | with wave.open(filename, 'rb') as wav_file:
12 | channels = wav_file.getnchannels()
13 |
14 | if channels == 2:
15 | return True
16 | else:
17 | return False
18 |
19 |
20 | def reduce_noise(file_name):
21 | if file_name.split('.')[-1] != 'wav':
22 | file_name = convert_mp3_to_wav(file_name, remove_original=True)
23 | rate, data = wavfile.read(file_name)
24 | if is_stereo(file_name):
25 | # from https://github.com/timsainb/noisereduce/issues/57
26 |         data1 = data[:, 0]
27 |         data2 = data[:, 1]
28 | # perform noise reduction
29 | reduced_noise1 = nr.reduce_noise(y=data1, sr=rate)
30 | reduced_noise2 = nr.reduce_noise(y=data2, sr=rate)
31 | reduced_noise = np.stack((reduced_noise1, reduced_noise2), axis=1)
32 | else:
33 | reduced_noise = nr.reduce_noise(y=data, sr=rate)
34 | wavfile.write(file_name, rate, reduced_noise)
35 | return file_name
36 |
37 |
38 | def split_audio(input_file, duration):
39 | # Load audio file
40 | audio = AudioSegment.from_file(input_file)
41 |
42 | # Length of audio file
43 | length_audio = len(audio)
44 |
45 | # Split audio file into chunks of 'duration'
46 | chunks = [audio[i:i+duration*1000] for i in range(0, length_audio, duration*1000)]
47 |
48 | # Save chunks in the same folder as the original file
49 | for i, chunk in enumerate(chunks):
50 | chunk_name = f'{input_file[:-4]}_chunk_{i}.wav'
51 | print(f'Created {chunk_name}')
52 | chunk.export(chunk_name, format='wav')
53 |
54 |
55 | def convert_mp3_to_wav(file_path, remove_original=True):
56 | audio = AudioSegment.from_mp3(file_path)
57 | output_path = change_file_extension(file_path, 'wav')
58 | audio.export(output_path, format="wav")
59 | if remove_original:
60 | os.remove(file_path)
61 | return output_path
62 |
63 |
64 | def change_file_extension(filename, new_extension):
65 | # Get the file name without the old extension
66 | base = os.path.splitext(filename)[0]
67 | # Return the file name with the new extension
68 | return base + '.' + new_extension
69 |
70 |
71 | def download_file(url, filename):
72 |     r = requests.get(url, allow_redirects=True)
73 |     with open(filename, 'wb') as f:
74 |         f.write(r.content)
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/infer.py:
--------------------------------------------------------------------------------
1 | from functions import *
2 |
3 |
4 | audio_path = '../results'
5 | dataset_raw = 'dataset_raw'
6 | logs = 'logs'
7 |
8 | # reference: https://github.com/svc-develop-team/so-vits-svc
9 |
10 |
11 | def infer(audio_url, dataset_name, config, clean_noise=False):
12 | os.system(f"wget {audio_url}")
13 | audio_name = audio_url.split('/')[-1].split('.')[0]
14 | audio_name_with_ext = audio_url.split('/')[-1]
15 | ext = audio_name_with_ext.split('.')[-1]
16 | if ext == 'mp3':
17 | audio_name_with_ext = convert_mp3_to_wav(audio_name_with_ext)
18 | if not os.path.exists(f"{audio_path}"):
19 | os.system(f"mkdir {audio_path}")
20 | if clean_noise:
21 | audio_name_with_ext = reduce_noise(audio_name_with_ext)
22 | os.system(f"mv {audio_name_with_ext} {audio_path}")
23 | os.system(f"svc infer {audio_path}/{audio_name_with_ext} -m {logs}/{dataset_name}/ -c {logs}/{config}.json")
24 | os.system(f"rm {audio_path}/{audio_name_with_ext}")
25 | #os.system(f"mv {audio_name}.out.wav {audio_path}")
26 | return f"{audio_path}/{audio_name}.out.wav"
27 |
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/infer_api.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File, HTTPException
2 | import os
3 | from fastapi.responses import FileResponse
4 | import argparse
5 | import logging
6 | import uvicorn
7 | from infer import *
8 |
9 | app = FastAPI()
10 | dataset_raw = 'dataset_raw'
11 | logs = 'logs'
12 | config = "config_1000"
13 |
14 |
15 | @app.post("/generate_voice")
16 | async def generate_audio_file(data: dict):
17 | print('Received audio generation request data: ', data)
18 | dataset = data['voice']
19 | audio_url = data['audio_url']
20 | clean_noise = data['clean_noise']
21 |
22 | try:
23 | generated_audio = infer(audio_url, dataset, config, clean_noise=clean_noise)
24 | except Exception as e:
25 |         raise HTTPException(status_code=500, detail=str(e))  # detail must be JSON-serializable
26 |
27 | response = FileResponse(generated_audio, filename=generated_audio)
28 | return response
29 |
30 |
31 | # python3 infer_api.py --port 5000
32 | if __name__ == "__main__":
33 | # Setting up argument parsing
34 | parser = argparse.ArgumentParser()
35 |     parser.add_argument("--port", "-p", type=int, default=5000, help="Port to run the server on, default is 5000.")
36 | args = parser.parse_args()
37 |
38 | logging.basicConfig(level=logging.INFO)
39 | uvicorn.run(app, host="0.0.0.0", port=args.port)
40 |
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/test_api/run_make_prompt.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 |
4 | # Specify the API endpoint (replace IP with the server address and port)
5 | url = "http://IP/voice_clone/"
6 |
7 | # Specify the data payload
8 | data = {
9 | "audio_url": "https://function-stable-diffusion.s3.ap-northeast-1.amazonaws.com/transfer/tony_stark.wav",
10 | "character_name": "xxx"
11 | }
12 |
13 | headers = {
14 | 'Content-Type': 'application/json'
15 | }
16 |
17 | response = requests.post(url, data=json.dumps(data), headers=headers)
18 |
19 | if response.status_code == 200:
20 | print(f"Response from server: {response.json()}")
21 | else:
22 | print(f"Failed to get response. Status code: {response.status_code}")
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/test_api/run_voice_clone.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import os
3 | import json
4 |
5 | data = {
6 | 'character_name': 'elon_musk',
7 | 'text': 'SpaceX aims to make humanity a multiplanetary species.'
8 | }
9 |
10 | response = requests.post('http://IP/generate_audio/', json=data)
11 |
12 | # the response body contains the generated audio bytes;
13 | # save it only when the request succeeded (see below)
14 | 
15 | if response.status_code == 200:
16 | filename = 'result.wav'
17 | folder_name = 'results'
18 | if not os.path.exists(folder_name):
19 | os.makedirs(folder_name)
20 | file_path = os.path.join(folder_name, filename)
21 |
22 | # Save the file to the directory
23 | with open(file_path, 'wb') as file:
24 | file.write(response.content)
25 | print(f'File saved at {file_path}')
26 | else:
27 |     print(f'Failed to get response from the server. Status code: {response.status_code}')
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pydub import AudioSegment
3 | from functions import split_audio, reduce_noise, download_file
4 | import argparse
5 |
6 |
7 | dataset_raw = 'dataset_raw'
8 | logs = 'logs'
9 |
10 | # reference: https://github.com/svc-develop-team/so-vits-svc
11 |
12 |
13 | def preprocess(dataset_path: str, split_threshold: int = 12, f0_method='dio', clean_noise=False):
14 |     # dataset_path is the URL of a zip archive containing the training audio
15 | if not os.path.exists(dataset_raw):
16 | os.mkdir(dataset_raw)
17 | os.system(f"wget {dataset_path}")
18 | dataset_name = dataset_path.split('/')[-1].split('.')[0]
19 | print('Dataset name:', dataset_name)
20 | dataset_dir = os.path.join(dataset_raw, dataset_name)
21 | if not os.path.exists(dataset_dir):
22 | os.mkdir(dataset_dir)
23 | print('Created dir:', dataset_dir)
24 | os.system(f"mv {dataset_name}.zip {dataset_raw}")
25 | os.system(f"unzip {dataset_raw}/{dataset_name}.zip -d {dataset_raw}")
26 | # after unzip, if the file contains a single file, create folder in {dataset_raw} with the name of the file and
27 | # move the wav file to that folder
28 | files = [f for f in os.listdir(dataset_raw) if f.endswith('.wav')]
29 | if len(files) == 1:
30 | single_file = files[0]
31 | file_name, file_ext = os.path.splitext(single_file)
32 | new_folder_path = f"{dataset_raw}/{file_name}"
33 | os.system(f"mkdir {new_folder_path}")
34 | os.system(f"mv {dataset_raw}/{single_file} {new_folder_path}")
35 |
36 | os.system(f"rm {dataset_raw}/{dataset_name}.zip")
37 |
38 | if os.path.exists(os.path.join(dataset_raw, '__MACOSX')):
39 | os.system(f"rm -rf {dataset_raw}/__MACOSX")
40 | if os.path.exists(os.path.join('dataset/44k', '__MACOSX')):
41 | os.system(f"rm -rf dataset/44k/__MACOSX")
42 |
43 | for root, _, files in os.walk(dataset_dir):
44 | for name in files:
45 | filename = os.path.join(root, name)
46 | if filename.endswith((".mp3", ".wav")):
47 |
48 | if clean_noise:
49 | filename = reduce_noise(filename)
50 |
51 | # Split long audio file into smaller chunks
52 | audio = AudioSegment.from_file(filename)
53 | duration_seconds = len(audio) / 1000 # duration in seconds
54 | if duration_seconds > split_threshold:
55 | split_audio(filename, split_threshold)
56 | os.remove(filename)
57 | print(f'Removed {filename}')
58 |
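    # so-vits-svc-fork preprocessing: resample the dataset, generate the training
    # config and file lists, then extract HuBERT and f0 features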
59 | os.system("svc pre-resample")
60 | os.system("svc pre-config")
61 | os.system(f"svc pre-hubert -fm {f0_method}")
62 | return dataset_name
63 |
64 |
65 | def train(dataset_name, config):
66 | if not os.path.exists(logs):
67 | os.mkdir(logs)
68 | if 'http' in config:
69 | download_file(config, f"{logs}/custom_config.json")
70 | config = f"{logs}/custom_config.json"
71 | else:
72 | if '.json' in config:
73 | config = config.split('.json')[0]
74 | config = f'{logs}/{config}.json'
75 |     # start from a clean model directory for this dataset
76 |     if os.path.exists(f"{logs}/{dataset_name}"):
77 |         os.system(f"rm -rf {logs}/{dataset_name}")
78 |     os.mkdir(f"{logs}/{dataset_name}")
79 | os.system(f"svc train --model-path {logs}/{dataset_name} --config-path {config}")
80 | # svc train --model-path logs/davide_en --config-path logs/config_100.json
81 |
82 |
83 | def clean():
84 | os.system(f"rm -rf {dataset_raw}")
85 | os.system(f"rm -rf dataset")
86 | os.system(f"rm -rf filelists")
87 |
88 |
--------------------------------------------------------------------------------
/voice_clone/voice_clone_api/train_api.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File, HTTPException
2 | import argparse, logging  # argparse is used by the CLI entry point below
3 | import uvicorn
4 | from train import *
5 |
6 | app = FastAPI()
7 | dataset_raw = 'dataset_raw'
8 | logs = 'logs'
9 |
10 |
11 | @app.post("/voice_clone")
12 | async def clone_voice(data: dict):
13 |     print('Received voice clone request data: ', data)
14 | dataset = data['dataset_url']
15 | split = data['split']
16 | config = data['config']
17 | clean_noise = data['clean_noise']
18 | try:
19 | dataset_name = preprocess(dataset, split_threshold=split, clean_noise=clean_noise)
20 | train(dataset_name, config)
21 | clean()
22 | except Exception as e:
23 |         raise HTTPException(status_code=500, detail=str(e))  # detail must be JSON-serializable
24 |
25 | return {
26 | "succeeded": True,
27 | "voice": dataset_name,
28 | }
29 |
30 |
31 | # python3 train_api.py --port 5000
32 | if __name__ == "__main__":
33 | # Setting up argument parsing
34 | parser = argparse.ArgumentParser()
35 |     parser.add_argument("--port", "-p", type=int, default=5000, help="Port to run the server on, default is 5000.")
36 | args = parser.parse_args()
37 |
38 | logging.basicConfig(level=logging.INFO)
39 | uvicorn.run(app, host="0.0.0.0", port=args.port)
40 |
--------------------------------------------------------------------------------