├── .gitignore ├── README.md ├── character_code.json ├── get_dataset.py └── preprocessing.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | tts_dataset/ 133 | metadata* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ProsekaTTS -------------------------------------------------------------------------------- /character_code.json: -------------------------------------------------------------------------------- 1 | { 2 | "code" : [ 3 | { 4 | "name":"hoshino_ichika", 5 | "character":"一歌", 6 | "char_code":"chr_ts_1.", 7 | "speaker_id": 0 8 | }, 9 | { 10 | "name":"tenma_saki", 11 | "character":"咲希", 12 | "char_code":"chr_ts_2.", 13 | "speaker_id": 1 14 | 15 | }, 16 | { 17 | "name":"mochizuki_honami", 18 | "character":"穂波", 19 | "char_code":"chr_ts_3.", 20 | "speaker_id": 2 21 | 22 | }, 23 | { 24 | "name":"hinomori_shiho", 25 | "character":"志歩", 26 | "char_code":"chr_ts_4.", 27 | "speaker_id": 3 28 | }, 29 | 30 | 31 | 32 | { 33 | "name":"hanasato_minori", 34 | "character":"みのり", 35 | "char_code":"chr_ts_5.", 36 | "speaker_id": 4 37 | }, 38 | { 39 | "name":"kiritani_haruka", 40 | "character":"遥", 41 | "char_code":"chr_ts_6.", 42 | "speaker_id": 5 43 | }, 44 | { 45 | "name":"momoi_airi", 46 | "character":"愛莉", 47 | "char_code":"chr_ts_7.", 48 | "speaker_id": 6 49 | }, 50 | { 51 | "name":"hinomori_shizuku", 52 | "character":"雫", 53 | 
"char_code":"chr_ts_8.", 54 | "speaker_id": 7 55 | }, 56 | 57 | 58 | 59 | { 60 | "name":"azusawa_kohane", 61 | "character":"こはね", 62 | "char_code":"chr_ts_9.", 63 | "speaker_id": 8 64 | }, 65 | { 66 | "name":"shiraishi_an", 67 | "character":"杏", 68 | "char_code":"chr_ts_10.", 69 | "speaker_id": 9 70 | }, 71 | { 72 | "name":"shinonome_akito", 73 | "character":"彰人", 74 | "char_code":"chr_ts_11.", 75 | "speaker_id": 10 76 | }, 77 | { 78 | "name":"aoyagi_toya", 79 | "character":"冬弥", 80 | "char_code":"chr_ts_12.", 81 | "speaker_id": 11 82 | }, 83 | 84 | 85 | 86 | { 87 | "name":"tenma_tsukasa", 88 | "character":"司", 89 | "char_code":"chr_ts_13.", 90 | "speaker_id": 12 91 | }, 92 | { 93 | "name":"ootori_emu", 94 | "character":"えむ", 95 | "char_code":"chr_ts_14.", 96 | "speaker_id": 13 97 | }, 98 | { 99 | "name":"kusanagi_nene", 100 | "character":"寧々", 101 | "char_code":"chr_ts_15.", 102 | "speaker_id": 14 103 | }, 104 | { 105 | "name":"kamishiro_rui", 106 | "character":"類", 107 | "char_code":"chr_ts_16.", 108 | "speaker_id": 15 109 | }, 110 | 111 | 112 | 113 | { 114 | "name":"yoasaki_kanade", 115 | "character":"奏", 116 | "char_code":"chr_ts_17.", 117 | "speaker_id": 16 118 | }, 119 | { 120 | "name":"asahina_mafuyu", 121 | "character":"まふゆ", 122 | "char_code":"chr_ts_18.", 123 | "speaker_id": 17 124 | }, 125 | { 126 | "name":"shinonome_ena", 127 | "character":"絵名", 128 | "char_code":"chr_ts_19.", 129 | "speaker_id": 18 130 | }, 131 | { 132 | "name":"akiyama_mizuki", 133 | "character":"瑞希", 134 | "char_code":"chr_ts_20.", 135 | "speaker_id": 19 136 | } 137 | ] 138 | } -------------------------------------------------------------------------------- /get_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pprint import pprint 3 | import json 4 | from pySmartDL import SmartDL 5 | from time import sleep 6 | from selenium.common import exceptions 7 | from pprint import pprint 8 | import chromedriver_autoinstaller 9 
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from pySmartDL import SmartDL


# Every sekai.best story-reader list page renders its entries under this node.
STORY_LIST_XPATH = '//*[@id="root"]/div/div[3]/div[2]/div'


class GetDataURL:
    """Scrape sekai.best story-reader pages for episode URLs featuring one character.

    NOTE(review): the methods read the module-level globals ``character``
    (display name matched against list text) and ``char_code`` (asset-filename
    prefix matched against speaker icons) that are assigned in the ``__main__``
    block — this class only works when the file is run as a script; confirm
    before importing it elsewhere.
    """

    def __init__(self, driver) -> None:
        # driver: an already-constructed selenium WebDriver.
        self.driver = driver

    def get_all_data(self):
        """Return the combined URL list from every enabled story category."""
        urls = []
        urls.extend(self.character_stories())
        urls.extend(self.card_stories())
        urls.extend(self.area_talk())
        urls.extend(self.unit_story())
        # Disabled by the original author (presumably too large/slow) — kept
        # for manual use.
        # urls.extend(self.special_story())
        # urls.extend(self.event_story())
        return urls

    def character_stories(self):
        """URLs of character-story episodes whose list entry mentions ``character``."""
        url_lst = []
        self.driver.get('https://sekai.best/storyreader/charaStory')
        sleep(5)  # crude wait for the client-side render; WebDriverWait would be sturdier
        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        for entry in entries:
            if character in entry.text:
                url_lst.append(entry.find_element(By.TAG_NAME, 'a').get_attribute('href'))
        pprint(url_lst)
        return url_lst

    def card_stories(self):
        """URLs of every episode of the selected character's card stories."""
        url_lst = []
        self.driver.get('https://sekai.best/storyreader/cardStory')
        sleep(10)
        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        # Click into the character's own card list first.
        for entry in entries:
            if character in entry.text:
                entry.click()
                break

        # Collect the hrefs BEFORE navigating away: driver.get() makes the
        # previously-found elements stale.
        cards = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        card_urls = [c.find_element(By.TAG_NAME, 'a').get_attribute('href') for c in cards]
        for card_url in card_urls:
            self.driver.get(card_url)
            sleep(5)
            episodes = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
            url_lst.extend(e.find_element(By.TAG_NAME, 'a').get_attribute('href') for e in episodes)
        pprint(url_lst)
        return url_lst

    def area_talk(self):
        """URLs of area-talk entries whose speaker icons include ``char_code``."""
        url_lst = []
        # Area 5 = the "classroom SEKAI" map in MapTalk.
        self.driver.get('https://sekai.best/storyreader/areaTalk/5')
        sleep(5)
        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        for entry in entries:
            for icon in entry.find_elements(By.TAG_NAME, 'img'):
                # Icon filenames embed the character code (e.g. "chr_ts_1.").
                if char_code in icon.get_attribute('src'):
                    url_lst.append(entry.find_elements(By.TAG_NAME, 'a')[-1].get_attribute('href'))
                    break
        pprint(url_lst)
        return url_lst

    def unit_story(self):
        """URLs of every episode of the Leo/need unit story."""
        url_lst = []
        self.driver.get('https://sekai.best/storyreader/unitStory/light_sound')  # Leo/need story
        sleep(5)

        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        chapter_urls = [c.find_element(By.TAG_NAME, 'a').get_attribute('href') for c in entries]
        for chapter_url in chapter_urls:
            self.driver.get(chapter_url)
            sleep(5)
            episodes = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
            url_lst.extend(e.find_element(By.TAG_NAME, 'a').get_attribute('href') for e in episodes)
        pprint(url_lst)
        return url_lst

    def special_story(self):
        """URLs of every special-story episode (no character filtering)."""
        url_lst = []
        self.driver.get('https://sekai.best/storyreader/specialStory')
        sleep(5)

        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        chapter_urls = [c.find_element(By.TAG_NAME, 'a').get_attribute('href') for c in entries]
        for chapter_url in chapter_urls:
            self.driver.get(chapter_url)
            sleep(5)
            episodes = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
            url_lst.extend(e.find_element(By.TAG_NAME, 'a').get_attribute('href') for e in episodes)
        pprint(url_lst)
        return url_lst

    def event_story(self):
        """URLs of every event-story episode (no character filtering)."""
        url_lst = []
        self.driver.get('https://sekai.best/storyreader/eventStory')
        sleep(10)

        entries = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
        chapter_urls = [c.find_element(By.TAG_NAME, 'a').get_attribute('href') for c in entries]
        for chapter_url in chapter_urls:
            self.driver.get(chapter_url)
            sleep(5)
            try:
                episodes = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
                url_lst.extend(e.find_element(By.TAG_NAME, 'a').get_attribute('href') for e in episodes)
            except exceptions.StaleElementReferenceException:
                print("StaleElementReferenceException")

        pprint(url_lst)
        return url_lst


class GetProsekaDataset:
    """Download the selected character's voice clips and write a metadata file.

    NOTE(review): reads the module-level globals ``name`` and ``character``
    assigned in the ``__main__`` block — script use only.
    """

    def __init__(self, driver) -> None:
        # makedirs(exist_ok=True) also creates ./tts_dataset/ itself, which
        # the original os.mkdir could not (it raised when the parent — which
        # is gitignored and usually absent — was missing).
        os.makedirs(f'./tts_dataset/{name}/mp3s/', exist_ok=True)
        os.makedirs(f'./tts_dataset/{name}/wavs/', exist_ok=True)

        # Append mode so repeated runs accumulate; kept open for the object's
        # lifetime and flushed after every line so a crash loses nothing.
        self.metadata = open(f'./tts_dataset/{name}/metadata.txt', 'a', encoding='utf-8')
        self.driver = driver

    def get_data(self, talk_urls):
        """Visit every story URL and save each of the character's lines.

        For each dialogue block spoken by the character, downloads the mp3 and
        appends ``<wav path>|<transcript>`` to metadata.txt.
        """
        for url in talk_urls:
            self.driver.get(url)
            sleep(10)
            containers = self.driver.find_elements(By.XPATH, STORY_LIST_XPATH)
            for block in containers:
                if not self.is_character(block):
                    continue
                try:
                    transcript = block.find_element(By.TAG_NAME, 'p').text.strip()
                    mp3_link = block.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    fname = mp3_link.split('/')[-1].replace('.mp3', '.wav')
                    print(f'{transcript}\n{mp3_link}')
                    self.download_mp3(mp3_link)
                    print('\n')
                    self.metadata.write(f'../tts_dataset/{name}/wavs/{fname}|{transcript}\n')
                    self.metadata.flush()
                except exceptions.NoSuchElementException:
                    print("요소가 없습니다!")

    def is_character(self, element):
        """Return True when the block's speaker label starts with ``character``."""
        try:
            return element.text.strip().startswith(character)
        except Exception:  # stale element, detached node, … — treat as "not ours"
            return False

    def download_mp3(self, mp3Url):
        """Download one mp3, retrying forever at 1-second intervals on failure."""
        while True:
            try:
                SmartDL(mp3Url, f'./tts_dataset/{name}/mp3s/').start()
                break
            except Exception:
                print("1초 후 다시 요청")
                sleep(1)
187 | if __name__ == '__main__': 188 | import sys 189 | char_json = json.loads( 190 | open('character_code.json', 'r', encoding='utf-8').read() 191 | )['code'] [int(sys.argv[1])] # Speaker ID 192 | 193 | name = char_json['name'] 194 | character = char_json['character'] 195 | char_code = char_json['char_code'] 196 | 197 | print(name, character, char_code) 198 | 199 | options = Options() 200 | path = chromedriver_autoinstaller.install() 201 | driver = Chrome() 202 | 203 | g = GetDataURL(driver) 204 | data = g.get_all_data() 205 | 206 | p = GetProsekaDataset(driver) 207 | p.get_data(data) -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import contextlib 3 | import os 4 | 5 | import jaconv 6 | 7 | 8 | 9 | def set_sample_rate(name): 10 | os.chdir(f'tts_dataset/{name}/mp3s/') 11 | os.system('FOR /F "tokens=*" %G IN (\'dir /b *.mp3\') DO ffmpeg -i "%G" -ac 1 -ar 22050 "../wavs/%~nG.wav" ') 12 | 13 | 14 | def text_replacing(text): 15 | text = text.strip() 16 | text = text.lower() 17 | repl_lst = list('♪『』()/') 18 | for i in repl_lst: 19 | text = text.replace(i, '') 20 | text = jaconv.alphabet2kata(text) 21 | # text = jaconv.normalize(text) 22 | return text 23 | 24 | 25 | 26 | def determining_dataset(name): 27 | metadata = open(f'tts_dataset/{name}/metadata2.txt', 'w', encoding='utf-8') 28 | 29 | with open(f'tts_dataset/{name}/metadata.txt', 'r', encoding='utf-8') as f: 30 | for i in f.read().split('\n'): 31 | fname, transcript = i.split('|') 32 | 33 | with contextlib.closing(wave.open(f'{fname[3:]}', 'r')) as f: 34 | frames = f.getnframes() 35 | rate = f.getframerate() 36 | duration = frames / float(rate) 37 | 38 | transcript = text_replacing(transcript) 39 | 40 | if (2.0 < duration < 10.0) and (10 <= len(transcript) <= 40): 41 | metadata.writelines(f'{fname}|{transcript}\n') 42 | 43 | 44 | 45 | if __name__ == '__main__': 
46 | import sys 47 | import json 48 | 49 | char_json = json.loads( 50 | open('character_code.json', 'r', encoding='utf-8').read() 51 | )['code'] [int(sys.argv[1])] # Speaker ID 52 | 53 | name = char_json['name'] 54 | character = char_json['character'] 55 | char_code = char_json['char_code'] 56 | 57 | print(name, character, char_code) 58 | 59 | # set_sample_rate(name) 60 | # determining_dataset(name) 61 | --------------------------------------------------------------------------------