├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── analyze.py ├── analyze_dataset.py ├── benchmarks.py ├── datasets.py ├── datasets ├── WiLI-2018 │ └── download ├── open-subtitles-v2018-100k-per-lang │ ├── download │ └── stats.md ├── tatoeba-sentences-2021-06-05-common-48 │ ├── download │ └── stats.md └── tatoeba-sentences-2021-06-05 │ ├── download │ └── stats.md ├── get_memory_usage.py ├── models ├── fasttext.py ├── gcld3.py ├── langdetect.py ├── langid.py └── pycld2.py ├── requirements.txt ├── results ├── open-subtitles-v2018-100k-per-lang │ ├── fasttext-compressed │ │ └── classification_performance.md │ ├── fasttext │ │ └── classification_performance.md │ ├── gcld3 │ │ └── classification_performance.md │ ├── langdetect │ │ └── classification_performance.md │ ├── langid │ │ └── classification_performance.md │ ├── pycld2 │ │ └── classification_performance.md │ └── results.md ├── tatoeba-sentences-2021-06-05-common-48 │ ├── fasttext-compressed │ │ └── classification_performance.md │ ├── fasttext │ │ └── classification_performance.md │ ├── gcld3 │ │ └── classification_performance.md │ ├── langdetect │ │ └── classification_performance.md │ ├── langid │ │ └── classification_performance.md │ ├── pycld2 │ │ └── classification_performance.md │ └── results.md └── tatoeba-sentences-2021-06-05 │ ├── fasttext-compressed │ ├── c5.xlarge_speed_performance.md │ ├── classification_performance.md │ └── mbp_m1_speed_performance.md │ ├── fasttext │ ├── c5.xlarge_speed_performance.md │ ├── classification_performance.md │ └── mbp_m1_speed_performance.md │ ├── gcld3 │ ├── c5.xlarge_speed_performance.md │ ├── classification_performance.md │ └── mbp_m1_speed_performance.md │ ├── langdetect │ ├── c5.xlarge_speed_performance.md │ ├── classification_performance.md │ └── mbp_m1_speed_performance.md │ ├── langid │ ├── c5.xlarge_speed_performance.md │ ├── classification_performance.md │ └── mbp_m1_speed_performance.md │ ├── pycld2 │ ├── c5.xlarge_speed_performance.md │ ├── 
classification_performance.md │ └── mbp_m1_speed_performance.md │ └── results.md ├── run.py └── templates ├── classification_performance.md ├── dataset_results.md └── speed_performance.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Dataset files 2 | datasets/**/**.bz2 3 | datasets/**/**.gzip 4 | datasets/**/**.csv 5 | datasets/**/**.txt 6 | results/**/**.npy 7 | results/**/results.csv 8 | 9 | .DS_Store 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | pip-wheel-metadata/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.9-buster 2 | 3 | RUN apt update 4 | RUN apt install -y protobuf-compiler 5 | 6 | WORKDIR /src 7 | 8 | # other libraries 9 | COPY requirements.txt ./ 10 | RUN pip install -r requirements.txt 11 | 12 | COPY run.py ./ 13 | COPY datasets ./datasets 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 modelpredict 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # language-identification-survey 2 | Live survey of off-the-shelf language identification tools for Python 3 | 4 | ## Reproducing benchmark 5 | 6 | ### 1. Download the dataset 7 | ```bash 8 | ./datasets/tatoeba-sentences-2021-06-05/download 9 | ``` 10 | 11 | ### 2. Run the language inference for benchmarks 12 | 13 | Available benchmarks: 14 | - fasttext 15 | - fasttext-compressed 16 | - gcld3 17 | - langdetect 18 | - langid 19 | - pycld2 20 | 21 | Available datasets: 22 | - tatoeba-sentences-2021-06-05 23 | - tatoeba-sentences-2021-06-05-common-48 24 | - open-subtitles-v2018-100k-per-lang 25 | 26 | On the host machine: 27 | ```bash 28 | python run.py 29 | ``` 30 | 31 | In docker: 32 | ```bash 33 | docker build -t bench . 34 | docker run -v `pwd`:/src -t -i bench python /src/run.py 35 | ``` 36 | 37 | ### 3. Run analysis 38 | ```bash 39 | python analyze.py --dataset tatoeba-sentences-2021-06-05 --correctness 40 | python analyze.py --dataset tatoeba-sentences-2021-06-05 --timings 41 | ``` 42 | 43 | ### 4. Get memory usage for different models 44 | ```bash 45 | python get_memory_usage.py 46 | # e.g. python get_memory_usage.py fasttext 47 | # e.g. python get_memory_usage.py fasttext-compressed 48 | ``` 49 | 50 | It will print memory usage in MB (bytes/1024/1024). 
def get_alpha3(l):
    """Return the alpha-3 code for language tag *l*, or ``'unk'`` on failure.

    The tag is parsed with ``Language.get`` and converted with
    ``Language.to_alpha3``. Any error raised by either step (unparseable
    tag, no alpha-3 equivalent, ...) is treated as "unknown language" so
    that a single odd label cannot abort the whole analysis.

    Args:
        l: Language tag as emitted by a benchmark.

    Returns:
        The alpha-3 code as a string, or ``'unk'`` if it cannot be resolved.
    """
    try:
        return Language.get(l).to_alpha3()
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 'unk'
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float('nan')


def get_stats_per_language(results):
    """Compute one-vs-rest classification metrics for every true language.

    Args:
        results: DataFrame with an ``'alpha3'`` column (true language) and a
            ``'detected_lang_alpha3'`` column (predicted language).

    Returns:
        DataFrame with one row per distinct value of ``results['alpha3']``,
        sorted by ``sentences_count`` descending, indexed from 1, with the
        columns ``language_alpha3, language, sentences_count, precision,
        recall, f1, tp, fp, tn, fn``. ``precision`` is NaN for a language
        that was never predicted.
    """
    columns = ['language_alpha3', 'language', 'sentences_count',
               'precision', 'recall', 'f1', 'tp', 'fp', 'tn', 'fn']

    actual = results['alpha3']
    detected = results['detected_lang_alpha3']
    total = len(results)

    records = []
    for lang in actual.unique().tolist():
        is_actual = actual == lang
        is_detected = detected == lang

        # Two masks are enough: the remaining confusion-matrix cells follow
        # from the marginal counts.
        tp = int((is_actual & is_detected).sum())
        fp = int(is_detected.sum()) - tp
        fn = int(is_actual.sum()) - tp
        tn = total - tp - fp - fn

        records.append({
            'language_alpha3': lang,
            'sentences_count': tp + fn,
            'precision': _ratio(tp, tp + fp),
            'recall': _ratio(tp, tp + fn),
            # F1 = 2*TP / (2*TP + FP + FN). The previous expression divided
            # only FN by two (operator precedence), which understated F1.
            'f1': _ratio(2 * tp, 2 * tp + fp + fn),
            'tp': tp,
            'fp': fp,
            'tn': tn,
            'fn': fn,
        })

    # Passing the column list keeps an empty input from failing on the sort.
    stats_per_language = pd.DataFrame(
        records, columns=[c for c in columns if c != 'language'])

    # assign the human-readable language name
    stats_per_language['language'] = stats_per_language['language_alpha3'].apply(
        analyze_dataset.get_language_name)

    # sort by sentences count and make the index the 1-based row number
    stats_per_language = stats_per_language.sort_values(
        ['sentences_count'], ascending=False).reset_index(drop=True)
    stats_per_language.index += 1

    return stats_per_language[columns]
parser.add_argument("--correctness", type=bool, nargs='?', const=True, default=False, help='Analyze correctness') 112 | args = parser.parse_args() 113 | 114 | dataset_name = args.dataset 115 | timings_prefix = args.timings_prefix 116 | dataset = datasets.get(dataset_name) 117 | 118 | metrics_per_benchmark = {} 119 | 120 | for benchmark_name in BENCHMARKS.keys(): 121 | benchmark_results_path = pathlib.Path('results') / dataset_name / benchmark_name 122 | print() 123 | if not benchmark_results_path.exists(): 124 | print(f"Skipping {benchmark_name}. Results files not found on {benchmark_results_path}") 125 | continue 126 | 127 | if args.correctness: 128 | print(f"Analyzing {benchmark_name} results on {dataset_name}...") 129 | 130 | supported_languages = [Language.get(lang) for lang in BENCHMARKS[benchmark_name]['supported_languages_alpha3']] 131 | supported_languages_list_str = ", ".join(f"{lang.to_alpha3()} ({lang.display_name()})" for lang in supported_languages) 132 | 133 | print(f"Reading results...") 134 | results = read_results(dataset_name, benchmark_name, lang_dtype=dataset.dtypes['alpha3']) 135 | supported_langs = BENCHMARKS[benchmark_name]['supported_languages_alpha3'] 136 | dataset_subset = datasets.get_supported_dataset_subset(dataset, supported_languages=supported_langs) 137 | print(f"Merging with dataset...") 138 | joined_results = pd.merge(dataset_subset, results, left_index=True, right_index=True, how="left", validate="one_to_one") 139 | 140 | print(f"Calculating accuracy...") 141 | aggregated_accuracy = accuracy(joined_results) 142 | print(f"Calculating metrics per language...") 143 | stats_per_language = get_stats_per_language(joined_results) 144 | dataset_supported_pct = "{:.2f}%".format(100. * len(dataset_subset) / len(dataset)) 145 | 146 | metrics_per_benchmark[benchmark_name] = { 147 | 'supported_languages': len(supported_langs), 148 | 'supported_dataset': f"{len(dataset):,} ({dataset_supported_pct})", 149 | 'agg_accuracy': "{:.2f}%".format(100. 
* aggregated_accuracy), 150 | } 151 | 152 | # assemble the md file and write it 153 | template_ctx = dict( 154 | benchmark_name=benchmark_name, 155 | dataset_name=dataset_name, 156 | dataset_len=len(dataset_subset), 157 | dataset_supported_pct=dataset_supported_pct, 158 | supported_languages_count=len(supported_languages), 159 | supported_languages_list_str=supported_languages_list_str, 160 | accuracy="{:.2f}%".format(100. * aggregated_accuracy), 161 | stats_per_language=stats_per_language.to_markdown(floatfmt=".3f"), 162 | ) 163 | results_path = os.path.join('results', dataset_name, benchmark_name, 'classification_performance.md') 164 | print(f"Dumping classification performance analysis to {results_path}") 165 | write_md('classification_performance', template_ctx=template_ctx, path=results_path) 166 | 167 | if args.timings: 168 | times = np.load(os.path.join('results', dataset_name, benchmark_name, f'{timings_prefix}times.npy')) 169 | template_ctx = dict( 170 | benchmark_name=benchmark_name, 171 | dataset_name=dataset_name, 172 | latency_avg=np.mean(times) / 10**6, 173 | latency_std=np.std(times) / 10**6, 174 | latency_p50=np.quantile(times, [0.5])[0] / 10**6, 175 | latency_p90=np.quantile(times, [0.9])[0] / 10**6, 176 | latency_p95=np.quantile(times, [0.95])[0] / 10**6, 177 | latency_p99=np.quantile(times, [0.99])[0] / 10**6, 178 | throughput=10**9/np.mean(times), 179 | ) 180 | 181 | results_path = os.path.join('results', dataset_name, benchmark_name, f'{timings_prefix}speed_performance.md') 182 | print(f"Dumping latency/throughput analysis to {results_path}") 183 | write_md('speed_performance', template_ctx=template_ctx, path=results_path) 184 | 185 | print(f"Creating aggregated table for {dataset_name}") 186 | agg_table_path = os.path.join('results', dataset_name, 'results.md') 187 | df = create_dataset_results_table(dataset_name, metrics_per_benchmark) 188 | template_ctx = dict( 189 | dataset_name=dataset_name, 190 | 
def get_language_name(alpha3):
    """Return the display name for a language code, or ``"--"`` on failure.

    The code is parsed with ``Language.get`` and rendered with
    ``Language.display_name``. If either step raises, a diagnostic line is
    printed and the placeholder ``"--"`` is returned so that table
    generation can continue.

    Args:
        alpha3: Language code to look up.

    Returns:
        The language's display name, or ``"--"`` if it cannot be resolved.
    """
    try:
        return Language.get(alpha3).display_name()
    except Exception:
        # Best-effort lookup, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print(f"Failed to get name for language '{alpha3}'")
        return "--"
def _benchmark(module, **model_kwargs):
    """Build the registry entry for one model wrapper module.

    ``module`` must expose ``run``, ``measure_memory`` and
    ``SUPPORTED_LANGUAGES``. When keyword arguments are given they are bound
    to both callables with ``functools.partial``.
    """
    run_fn = module.run
    memory_fn = module.measure_memory
    if model_kwargs:
        run_fn = partial(run_fn, **model_kwargs)
        memory_fn = partial(memory_fn, **model_kwargs)
    return {
        'run': run_fn,
        'measure_memory': memory_fn,
        'supported_languages_alpha3': [
            Language.get(tag).to_alpha3() for tag in module.SUPPORTED_LANGUAGES
        ],
    }


# Benchmark name -> callables and supported languages (as alpha-3 codes).
BENCHMARKS = {
    'fasttext': _benchmark(fasttext, model_path=fasttext.MODEL_BIN),
    'fasttext-compressed': _benchmark(fasttext, model_path=fasttext.MODEL_COMPRESSED),
    'gcld3': _benchmark(gcld3),
    'langdetect': _benchmark(langdetect),
    'langid': _benchmark(langid),
    'pycld2': _benchmark(pycld2),
}
# Registry of dataset loaders, keyed by the loader function's name
# (underscore-separated). Populated by the ``@dataset`` decorator.
__DATASETS = {}


def get_alpha3(lang):
    """Return the alpha-3 code for language tag *lang*, or ``None``.

    The tag is parsed with ``Language.get`` outside the ``try`` block, so a
    tag that cannot be parsed still raises; only a failing ``to_alpha3()``
    conversion is mapped to ``None``.
    """
    language = Language.get(lang)
    try:
        return language.to_alpha3()
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt / SystemExit
        # are not swallowed.
        return None


def dataset(load_fn):
    """Decorator: register *load_fn* as a dataset loader under its own name.

    Returns *load_fn* unchanged so the decorated function stays callable.
    """
    __DATASETS[load_fn.__name__] = load_fn
    return load_fn


def get(name):
    """Load and return the dataset registered under *name*.

    Args:
        name: Dataset name; dashes and underscores are interchangeable
            (``'my-set'`` and ``'my_set'`` resolve to the same loader).

    Returns:
        Whatever the registered loader returns.

    Raises:
        ArgumentError: If no loader is registered under *name*.
    """
    key = name.replace('-', '_')
    if key in __DATASETS:
        return __DATASETS[key]()
    # ArgumentError takes (argument, message). The old single-argument call
    # raised TypeError instead of the intended error.
    raise ArgumentError(None, f"Unknown dataset {name}")


def names():
    """Return the registered dataset names in their dash-separated form."""
    return [key.replace('_', '-') for key in __DATASETS]
def get_supported_dataset_subset(dataset, supported_languages):
    """Keep only the rows whose 'alpha3' value is in *supported_languages*.

    Args:
        dataset: DataFrame with an ``'alpha3'`` column.
        supported_languages: Collection of alpha-3 codes to keep.

    Returns:
        The matching rows of *dataset*, with their original index.
    """
    is_supported = dataset['alpha3'].isin(supported_languages)
    return dataset[is_supported]
#!/usr/bin/env bash
#
# Download the first 100k raw sentences per language from the OPUS
# OpenSubtitles v2018 monolingual corpus, one <lang>.txt file per language.

# Write the files next to this script (same approach as the WiLI-2018
# download script), so they end up where datasets.py globs for them
# regardless of the caller's working directory.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd "$SCRIPT_DIR" || exit 1

BASE_URL="http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono"
SENTENCES_PER_LANG=100000

# Languages left out because they are not supported by some benchmarks:
#   br bs eo eu gl hy is ka kk ms pt_br si sr ze_en ze_zh zh_tw
LANGS=(
    en
    af ar bg bn ca cs da de el es et fa fi fr he hi hr hu id it ja
    ko lt lv mk ml nl no pl pt ro ru sk sl sq sv ta te th tl tr uk ur vi
    zh_cn
)

for lang in "${LANGS[@]}"; do
    # Each language goes to its own file. Previously the "af" corpus was
    # written to ar.txt and then overwritten by the "ar" download.
    curl -s --location-trusted --max-redirs 10 -J "${BASE_URL}/OpenSubtitles.raw.${lang}.gz" \
        | gzip -d -c \
        | head -n "${SENTENCES_PER_LANG}" > "${lang}.txt"
done
4 | | 2 | kor | Korean | 100000 | 2.36% | 15.2265 | 5 | | 3 | lav | Latvian | 100000 | 2.36% | 28.2011 | 6 | | 4 | mkd | Macedonian | 100000 | 2.36% | 26.6284 | 7 | | 5 | mal | Malayalam | 100000 | 2.36% | 31.8701 | 8 | | 6 | nld | Dutch | 100000 | 2.36% | 31.3679 | 9 | | 7 | nob | Norwegian Bokmål | 100000 | 2.36% | 28.3095 | 10 | | 8 | pol | Polish | 100000 | 2.36% | 28.6661 | 11 | | 9 | por | Portuguese | 100000 | 2.36% | 31.2119 | 12 | | 10 | ron | Romanian | 100000 | 2.36% | 30.8544 | 13 | | 11 | rus | Russian | 100000 | 2.36% | 28.5364 | 14 | | 12 | slk | Slovak | 100000 | 2.36% | 28.1906 | 15 | | 13 | slv | Slovenian | 100000 | 2.36% | 27.4234 | 16 | | 14 | sqi | Albanian | 100000 | 2.36% | 28.1297 | 17 | | 15 | swe | Swedish | 100000 | 2.36% | 29.2429 | 18 | | 16 | tha | Thai | 100000 | 2.36% | 25.2803 | 19 | | 17 | tur | Turkish | 100000 | 2.36% | 30.4281 | 20 | | 18 | ukr | Ukrainian | 100000 | 2.36% | 26.5965 | 21 | | 19 | vie | Vietnamese | 100000 | 2.36% | 28.6742 | 22 | | 20 | bul | Bulgarian | 100000 | 2.36% | 28.3935 | 23 | | 21 | lit | Lithuanian | 100000 | 2.36% | 26.8903 | 24 | | 22 | jpn | Japanese | 100000 | 2.36% | 12.6401 | 25 | | 23 | est | Estonian | 100000 | 2.36% | 30.0332 | 26 | | 24 | ben | Bangla | 100000 | 2.36% | 24.8633 | 27 | | 25 | cat | Catalan | 100000 | 2.36% | 30.6194 | 28 | | 26 | ces | Czech | 100000 | 2.36% | 27.2082 | 29 | | 27 | dan | Danish | 100000 | 2.36% | 27.9852 | 30 | | 28 | deu | German | 100000 | 2.36% | 31.7739 | 31 | | 29 | ell | Greek | 100000 | 2.36% | 30.6192 | 32 | | 30 | ita | Italian | 100000 | 2.36% | 31.1652 | 33 | | 31 | spa | Spanish | 100000 | 2.36% | 32.271 | 34 | | 32 | eng | English | 100000 | 2.36% | 30.3055 | 35 | | 33 | fas | Persian | 100000 | 2.36% | 25.1484 | 36 | | 34 | fin | Finnish | 100000 | 2.36% | 29.2801 | 37 | | 35 | fra | French | 100000 | 2.36% | 30.8136 | 38 | | 36 | heb | Hebrew | 100000 | 2.36% | 25.2828 | 39 | | 37 | hin | Hindi | 100000 | 2.36% | 26.7333 | 40 | | 38 | hrv | 
Croatian | 100000 | 2.36% | 28.8623 | 41 | | 39 | hun | Hungarian | 100000 | 2.36% | 30.0959 | 42 | | 40 | ind | Indonesian | 100000 | 2.36% | 29.4894 | 43 | | 41 | zho | Chinese | 100000 | 2.36% | 12.5245 | 44 | | 42 | urd | Urdu | 46523 | 1.10% | 27.2766 | 45 | | 43 | tam | Tamil | 40165 | 0.95% | 29.4984 | 46 | | 44 | tel | Telugu | 30416 | 0.72% | 26.5348 | 47 | | 45 | fil | Filipino | 19314 | 0.46% | 31.8587 | -------------------------------------------------------------------------------- /datasets/tatoeba-sentences-2021-06-05-common-48/download: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | cd $SCRIPT_DIR 5 | 6 | FILENAME="tatoeba-sentences-2021-06-05-common-48.tar.bz2" 7 | if [[ -f $FILENAME ]]; then 8 | echo "Dataset already downloaded" 9 | else 10 | echo "Downloading the dataset..." 11 | wget https://modelpredict.s3.amazonaws.com/datasets/${FILENAME} 12 | fi 13 | 14 | if [[ ! -f "sentences.csv" ]]; then 15 | echo "Extracting the dataset..." 
16 | tar xvfj $FILENAME 17 | fi 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /datasets/tatoeba-sentences-2021-06-05-common-48/stats.md: -------------------------------------------------------------------------------- 1 | | | alpha3 | language | sentences | dataset_percentage | mean_len | 2 | |---:|:---------|:-----------------|------------:|:---------------------|-----------:| 3 | | 1 | eng | English | 1479733 | 19.83% | 39.3277 | 4 | | 2 | rus | Russian | 849653 | 11.39% | 33.4655 | 5 | | 3 | ita | Italian | 787053 | 10.55% | 33.4897 | 6 | | 4 | tur | Turkish | 709573 | 9.51% | 34.7355 | 7 | | 5 | deu | German | 553727 | 7.42% | 47.4774 | 8 | | 6 | fra | French | 466192 | 6.25% | 41.3866 | 9 | | 7 | por | Portuguese | 385737 | 5.17% | 38.2929 | 10 | | 8 | spa | Spanish | 338781 | 4.54% | 38.8894 | 11 | | 9 | hun | Hungarian | 323048 | 4.33% | 34.0299 | 12 | | 10 | jpn | Japanese | 208761 | 2.80% | 18.2659 | 13 | | 11 | heb | Hebrew | 197226 | 2.64% | 25.5678 | 14 | | 12 | ukr | Ukrainian | 171674 | 2.30% | 27.8153 | 15 | | 13 | nld | Dutch | 144340 | 1.93% | 34.7853 | 16 | | 14 | fin | Finnish | 128011 | 1.72% | 35.7946 | 17 | | 15 | pol | Polish | 109662 | 1.47% | 33.2333 | 18 | | 16 | mkd | Macedonian | 77938 | 1.04% | 27.3793 | 19 | | 17 | mar | Marathi | 64126 | 0.86% | 27.587 | 20 | | 18 | lit | Lithuanian | 59659 | 0.80% | 30.1439 | 21 | | 19 | ces | Czech | 57030 | 0.76% | 28.3683 | 22 | | 20 | dan | Danish | 49399 | 0.66% | 33.7159 | 23 | | 21 | swe | Swedish | 41677 | 0.56% | 30.1428 | 24 | | 22 | ara | Arabic | 35991 | 0.48% | 26.7817 | 25 | | 23 | ell | Greek | 34071 | 0.46% | 30.3915 | 26 | | 24 | ron | Romanian | 24943 | 0.33% | 34.4097 | 27 | | 25 | bul | Bulgarian | 24503 | 0.33% | 31.7201 | 28 | | 26 | vie | Vietnamese | 19234 | 0.26% | 38.7891 | 29 | | 27 | fil | Filipino | 16649 | 0.22% | 36.8098 | 30 | | 28 | slk | Slovak | 14660 | 0.20% | 25.7422 | 31 | | 29 | ind | Indonesian | 14542 | 
0.19% | 37.4785 | 32 | | 30 | hin | Hindi | 14230 | 0.19% | 27.6058 | 33 | | 31 | nob | Norwegian Bokmål | 14223 | 0.19% | 37.4732 | 34 | | 32 | cat | Catalan | 7971 | 0.11% | 37.334 | 35 | | 33 | kor | Korean | 7570 | 0.10% | 16.8085 | 36 | | 34 | hrv | Croatian | 5204 | 0.07% | 30.058 | 37 | | 35 | ben | Bangla | 4714 | 0.06% | 23.7809 | 38 | | 36 | afr | Afrikaans | 4031 | 0.05% | 29.676 | 39 | | 37 | est | Estonian | 3637 | 0.05% | 27.6646 | 40 | | 38 | tha | Thai | 3528 | 0.05% | 20.5697 | 41 | | 39 | sqi | Albanian | 2526 | 0.03% | 32.2743 | 42 | | 40 | urd | Urdu | 2008 | 0.03% | 30.7495 | 43 | | 41 | cym | Welsh | 1344 | 0.02% | 29.3058 | 44 | | 42 | slv | Slovenian | 1093 | 0.01% | 28.4282 | 45 | | 43 | mal | Malayalam | 827 | 0.01% | 36.8222 | 46 | | 44 | tam | Tamil | 334 | 0.00% | 35.2784 | 47 | | 45 | tel | Telugu | 254 | 0.00% | 28.0157 | 48 | | 46 | pan | Punjabi | 196 | 0.00% | 32.8622 | 49 | | 47 | kan | Kannada | 176 | 0.00% | 35.3636 | 50 | | 48 | guj | Gujarati | 168 | 0.00% | 24.244 | -------------------------------------------------------------------------------- /datasets/tatoeba-sentences-2021-06-05/download: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | cd $SCRIPT_DIR 5 | 6 | FILENAME="tatoeba-sentences-2021-06-05.tar.bz2" 7 | if [[ -f $FILENAME ]]; then 8 | echo "Dataset already downloaded" 9 | else 10 | echo "Downloading the dataset..." 11 | wget https://modelpredict.s3.amazonaws.com/datasets/${FILENAME} 12 | fi 13 | 14 | if [[ ! -f "sentences.csv" ]]; then 15 | echo "Extracting the dataset..." 
import argparse

from benchmarks import BENCHMARKS


# Number of bytes in one mebibyte; used to report memory usage in MB.
MB = 1024 * 1024


def main():
    """Print the memory usage, in MB, of each benchmark named on the command line.

    For every requested benchmark the ``measure_memory`` callable registered
    in ``BENCHMARKS`` is invoked and its result (bytes) is divided by ``MB``.
    The resulting ``{benchmark_name: megabytes}`` dict is printed.
    """
    parser = argparse.ArgumentParser(
        description='Calculates memory usage for loading the model and running one inference request.')
    # Restricting the values to the registered benchmark names makes argparse
    # reject a typo with a usage message instead of a KeyError traceback.
    parser.add_argument('benchmarks', nargs='+', choices=sorted(BENCHMARKS))

    args = parser.parse_args()

    mem_usage = {
        benchmark_name: BENCHMARKS[benchmark_name]['measure_memory']() / MB
        for benchmark_name in args.benchmarks
    }
    print(mem_usage)


if __name__ == "__main__":
    main()
def run(dataset, elapsed, model_path):
    """Classify every text in *dataset* with a fastText language-ID model.

    The model is first downloaded from *model_path* (see ``download_model``)
    and then loaded from ``/tmp/fasttext.model``.

    Args:
        dataset: Sized iterable of strings; ``len(dataset)`` sizes the outputs.
        elapsed: Indexable buffer; ``elapsed[i]`` receives the nanoseconds
            spent inside ``model.predict`` for text ``i``.
        model_path: URL of the model file to download.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 15, holding the label
        returned by the model) and ``prob`` (float array with the matching
        score).
    """
    import fasttext

    n_texts = len(dataset)
    lang = np.chararray(n_texts, itemsize=15)
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24.
    prob = np.zeros((n_texts,), dtype=float)

    download_model(model_path)
    model = fasttext.load_model('/tmp/fasttext.model')

    for i, text in enumerate(dataset):
        # For some reason fasttext likes one line at a time.
        text = text.replace('\n', ' ')

        iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
        result = model.predict(text)
        elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

        # Exactly one (label, probability) pair is expected per prediction.
        assert len(result[0]) == 1
        lang[i] = result[0][0]
        prob[i] = result[1][0]

    return dict(lang=lang, prob=prob)


def download_model(path):
    """Download the model at URL *path* to ``/tmp/fasttext.model`` with wget.

    Raises:
        subprocess.CalledProcessError: if wget exits with a non-zero status.
        FileNotFoundError: if wget is not installed.
    """
    import subprocess

    # Argument list instead of a shell string: URLs containing '&', '?' or
    # spaces are passed through verbatim, and a failed download is reported
    # here rather than later as an unreadable model file.
    subprocess.run(["wget", "-O", "/tmp/fasttext.model", path], check=True)
def run(dataset, elapsed):
    """Detect the language of every text in *dataset* with langdetect.

    Args:
        dataset: Sized iterable of texts; ``len(dataset)`` sizes the outputs.
        elapsed: Indexable buffer; ``elapsed[i]`` receives the nanoseconds
            spent inside ``langdetect.detect_langs`` for text ``i``. It is
            left untouched for texts on which detection raised.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 10) and ``prob``
        (float array). For each text the first candidate returned by
        ``detect_langs`` is recorded; when detection raises or returns
        nothing, ``lang`` is ``'n/a'`` and ``prob`` is NaN.
    """
    import langdetect

    n_texts = len(dataset)
    lang = np.chararray(n_texts, itemsize=10)
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24.
    prob = np.zeros((n_texts,), dtype=float)

    for i, text in enumerate(dataset):
        try:
            iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
            result = langdetect.detect_langs(text)
            elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time
        except Exception:
            # Detection failures are recorded as 'n/a' below. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop a long benchmark run.
            result = None

        if result:
            lang[i] = result[0].lang
            prob[i] = result[0].prob
        else:
            lang[i] = 'n/a'
            prob[i] = float('nan')

    return dict(lang=lang, prob=prob)
def run(dataset, elapsed):
    """Detect the language of every text in *dataset* with pycld2.

    Args:
        dataset: Sized iterable of texts; ``len(dataset)`` sizes the outputs.
        elapsed: Indexable buffer; ``elapsed[i]`` receives the nanoseconds
            spent inside ``pycld2.detect`` for text ``i``. It is left
            untouched for texts on which ``pycld2.error`` was raised.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 10) and ``prob``
        (float array). ``prob`` is NaN for every text. ``lang`` holds the
        code of the first detection result, or ``None`` when that code is
        ``'un'`` or when pycld2 raised.
    """
    import pycld2

    n_texts = len(dataset)
    lang = np.chararray(n_texts, itemsize=10)
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24.
    prob = np.zeros((n_texts,), dtype=float)

    errored = 0

    for i, text in enumerate(dataset):
        try:
            iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
            result = pycld2.detect(text)
            elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

            # result[2] is the sequence of detections; field 1 of the first
            # entry is its language code.
            lang_label = result[2][0][1]
            # NOTE(review): None is assigned into a byte-string array here;
            # confirm how downstream code recognises these entries.
            lang[i] = lang_label if lang_label != 'un' else None
            prob[i] = float('nan')
        except pycld2.error:
            # Unfortunately, pycld2 errors on "invalid utf-8" sequence for some texts,
            # even though python successfully loads, encodes and decodes them as utf-8.
            errored += 1
            lang[i] = None
            prob[i] = float('nan')

    print(f"pycld2 errored on {errored} texts")

    return dict(lang=lang, prob=prob)

Supported languages (176)

7 | 8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 0.997 | 0.982 | 0.988 | 98172 | 253 | 4136165 | 1828 | 15 | | 2 | swe | Swedish | 100000 | 0.831 | 0.793 | 0.749 | 79263 | 16147 | 4120271 | 20737 | 16 | | 3 | mkd | Macedonian | 100000 | 0.870 | 0.761 | 0.765 | 76086 | 11387 | 4125031 | 23914 | 17 | | 4 | tha | Thai | 100000 | 0.999 | 0.957 | 0.977 | 95713 | 124 | 4136294 | 4287 | 18 | | 5 | cat | Catalan | 100000 | 0.918 | 0.508 | 0.635 | 50786 | 4527 | 4131891 | 49214 | 19 | | 6 | bul | Bulgarian | 100000 | 0.951 | 0.654 | 0.760 | 65408 | 3357 | 4133061 | 34592 | 20 | | 7 | fin | Finnish | 100000 | 0.818 | 0.865 | 0.768 | 86466 | 19296 | 4117122 | 13534 | 21 | | 8 | dan | Danish | 100000 | 0.601 | 0.700 | 0.533 | 69996 | 46449 | 4089969 | 30004 | 22 | | 9 | hun | Hungarian | 100000 | 0.831 | 0.880 | 0.787 | 88007 | 17864 | 4118554 | 11993 | 23 | | 10 | kor | Korean | 100000 | 0.995 | 0.891 | 0.938 | 89072 | 435 | 4135983 | 10928 | 24 | | 11 | spa | Spanish | 100000 | 0.625 | 0.908 | 0.606 | 90765 | 54449 | 4081969 | 9235 | 25 | | 12 | zho | Chinese | 100000 | 0.934 | 0.757 | 0.812 | 75651 | 5310 | 4131108 | 24349 | 26 | | 13 | slk | Slovak | 100000 | 0.902 | 0.400 | 0.538 | 39987 | 4356 | 4132062 | 60013 | 27 | | 14 | ron | Romanian | 100000 | 0.971 | 0.631 | 0.756 | 63068 | 1876 | 4134542 | 36932 | 28 | | 15 | ind | Indonesian | 100000 | 0.949 | 0.671 | 0.770 | 67114 | 3624 | 4132794 | 32886 | 29 | | 16 | est | Estonian | 100000 | 0.952 | 0.551 | 0.686 | 55128 | 2808 | 4133610 | 44872 | 30 | | 17 | por | Portuguese | 100000 | 0.790 | 0.810 | 0.722 | 80970 | 21587 | 4114831 | 19030 | 31 | | 18 | hrv | Croatian | 100000 | 0.728 | 0.264 | 0.361 | 26401 | 9843 | 4126575 | 73599 | 32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.969 | 0.984 | 
96866 | 44 | 4136374 | 3134 | 33 | | 20 | ita | Italian | 100000 | 0.514 | 0.890 | 0.498 | 88985 | 84250 | 4052168 | 11015 | 34 | | 21 | slv | Slovenian | 100000 | 0.749 | 0.334 | 0.429 | 33416 | 11213 | 4125205 | 66584 | 35 | | 22 | ces | Czech | 100000 | 0.713 | 0.737 | 0.632 | 73683 | 29706 | 4106712 | 26317 | 36 | | 23 | mal | Malayalam | 100000 | 0.999 | 0.964 | 0.980 | 96373 | 125 | 4136293 | 3627 | 37 | | 24 | lit | Lithuanian | 100000 | 0.899 | 0.686 | 0.746 | 68646 | 7737 | 4128681 | 31354 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.899 | 0.591 | 0.686 | 59075 | 6613 | 4129805 | 40925 | 39 | | 26 | pol | Polish | 100000 | 0.826 | 0.848 | 0.769 | 84830 | 17928 | 4118490 | 15170 | 40 | | 27 | fas | Persian | 100000 | 0.881 | 0.442 | 0.566 | 44234 | 5991 | 4130427 | 55766 | 41 | | 28 | jpn | Japanese | 100000 | 0.926 | 0.911 | 0.886 | 91080 | 7300 | 4129118 | 8920 | 42 | | 29 | hin | Hindi | 100000 | 0.998 | 0.797 | 0.886 | 79742 | 176 | 4136242 | 20258 | 43 | | 30 | eng | English | 100000 | 0.272 | 0.931 | 0.269 | 93052 | 249357 | 3887061 | 6948 | 44 | | 31 | sqi | Albanian | 100000 | 0.994 | 0.679 | 0.805 | 67935 | 395 | 4136023 | 32065 | 45 | | 32 | rus | Russian | 100000 | 0.563 | 0.956 | 0.556 | 95628 | 74265 | 4062153 | 4372 | 46 | | 33 | fra | French | 100000 | 0.727 | 0.864 | 0.688 | 86424 | 32405 | 4104013 | 13576 | 47 | | 34 | lav | Latvian | 100000 | 0.985 | 0.621 | 0.758 | 62101 | 931 | 4135487 | 37899 | 48 | | 35 | deu | German | 100000 | 0.745 | 0.849 | 0.699 | 84936 | 29124 | 4107294 | 15064 | 49 | | 36 | tur | Turkish | 100000 | 0.863 | 0.896 | 0.822 | 89562 | 14158 | 4122260 | 10438 | 50 | | 37 | ara | Arabic | 100000 | 0.661 | 0.916 | 0.642 | 91590 | 46947 | 4089471 | 8410 | 51 | | 38 | vie | Vietnamese | 100000 | 0.986 | 0.842 | 0.903 | 84224 | 1188 | 4135230 | 15776 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.755 | 0.303 | 0.404 | 30259 | 9843 | 4126575 | 69741 | 53 | | 40 | ben | Bangla | 100000 | 0.999 | 0.938 | 0.967 | 93803 | 93 | 
4136325 | 6197 | 54 | | 41 | nld | Dutch | 100000 | 0.897 | 0.798 | 0.806 | 79835 | 9125 | 4127293 | 20165 | 55 | | 42 | urd | Urdu | 46523 | 0.984 | 0.751 | 0.846 | 34930 | 564 | 4189331 | 11593 | 56 | | 43 | tam | Tamil | 40165 | 0.998 | 0.943 | 0.968 | 37857 | 84 | 4196169 | 2308 | 57 | | 44 | tel | Telugu | 30416 | 0.998 | 0.927 | 0.961 | 28199 | 51 | 4205951 | 2217 | 58 | | 45 | fil | Filipino | 19314 | 0.836 | 0.566 | 0.633 | 10925 | 2136 | 4214968 | 8389 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for fasttext on open-subtitles-v2018-100k-per-lang 2 | 3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%) 4 | - **Aggregated accuracy: 80.16%** 5 | 6 |

Supported languages (176)

7 | 8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 0.993 | 0.993 | 0.990 | 99294 | 693 | 4135725 | 706 | 15 | | 2 | swe | Swedish | 100000 | 0.861 | 0.820 | 0.787 | 81969 | 13210 | 4123208 | 18031 | 16 | | 3 | mkd | Macedonian | 100000 | 0.910 | 0.848 | 0.841 | 84828 | 8412 | 4128006 | 15172 | 17 | | 4 | tha | Thai | 100000 | 0.998 | 0.958 | 0.977 | 95843 | 150 | 4136268 | 4157 | 18 | | 5 | cat | Catalan | 100000 | 0.950 | 0.598 | 0.720 | 59808 | 3148 | 4133270 | 40192 | 19 | | 6 | bul | Bulgarian | 100000 | 0.947 | 0.787 | 0.839 | 78701 | 4413 | 4132005 | 21299 | 20 | | 7 | fin | Finnish | 100000 | 0.865 | 0.895 | 0.824 | 89517 | 13918 | 4122500 | 10483 | 21 | | 8 | dan | Danish | 100000 | 0.619 | 0.747 | 0.560 | 74699 | 45945 | 4090473 | 25301 | 22 | | 9 | hun | Hungarian | 100000 | 0.844 | 0.932 | 0.818 | 93172 | 17283 | 4119135 | 6828 | 23 | | 10 | kor | Korean | 100000 | 0.987 | 0.930 | 0.952 | 92967 | 1182 | 4135236 | 7033 | 24 | | 11 | spa | Spanish | 100000 | 0.706 | 0.936 | 0.689 | 93648 | 39086 | 4097332 | 6352 | 25 | | 12 | zho | Chinese | 100000 | 0.928 | 0.850 | 0.858 | 85002 | 6573 | 4129845 | 14998 | 26 | | 13 | slk | Slovak | 100000 | 0.918 | 0.512 | 0.639 | 51250 | 4573 | 4131845 | 48750 | 27 | | 14 | ron | Romanian | 100000 | 0.961 | 0.723 | 0.812 | 72276 | 2908 | 4133510 | 27724 | 28 | | 15 | ind | Indonesian | 100000 | 0.920 | 0.725 | 0.783 | 72503 | 6296 | 4130122 | 27497 | 29 | | 16 | est | Estonian | 100000 | 0.941 | 0.678 | 0.769 | 67801 | 4256 | 4132162 | 32199 | 30 | | 17 | por | Portuguese | 100000 | 0.735 | 0.883 | 0.701 | 88308 | 31776 | 4104642 | 11692 | 31 | | 18 | hrv | Croatian | 100000 | 0.732 | 0.305 | 0.399 | 30462 | 11167 | 4125251 | 69538 | 32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.980 | 0.990 | 
98046 | 41 | 4136377 | 1954 | 33 | | 20 | ita | Italian | 100000 | 0.639 | 0.925 | 0.622 | 92461 | 52332 | 4084086 | 7539 | 34 | | 21 | slv | Slovenian | 100000 | 0.858 | 0.386 | 0.510 | 38621 | 6372 | 4130046 | 61379 | 35 | | 22 | ces | Czech | 100000 | 0.781 | 0.793 | 0.709 | 79295 | 22186 | 4114232 | 20705 | 36 | | 23 | mal | Malayalam | 100000 | 0.999 | 0.967 | 0.982 | 96741 | 99 | 4136319 | 3259 | 37 | | 24 | lit | Lithuanian | 100000 | 0.909 | 0.765 | 0.797 | 76457 | 7678 | 4128740 | 23543 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.932 | 0.676 | 0.762 | 67632 | 4955 | 4131463 | 32368 | 39 | | 26 | pol | Polish | 100000 | 0.837 | 0.894 | 0.798 | 89392 | 17360 | 4119058 | 10608 | 40 | | 27 | fas | Persian | 100000 | 0.948 | 0.577 | 0.703 | 57657 | 3136 | 4133282 | 42343 | 41 | | 28 | jpn | Japanese | 100000 | 0.942 | 0.934 | 0.912 | 93404 | 5734 | 4130684 | 6596 | 42 | | 29 | hin | Hindi | 100000 | 0.998 | 0.850 | 0.917 | 84951 | 138 | 4136280 | 15049 | 43 | | 30 | eng | English | 100000 | 0.433 | 0.939 | 0.427 | 93908 | 123135 | 4013283 | 6092 | 44 | | 31 | sqi | Albanian | 100000 | 0.987 | 0.700 | 0.814 | 69957 | 933 | 4135485 | 30043 | 45 | | 32 | rus | Russian | 100000 | 0.679 | 0.962 | 0.670 | 96174 | 45549 | 4090869 | 3826 | 46 | | 33 | fra | French | 100000 | 0.709 | 0.919 | 0.687 | 91892 | 37802 | 4098616 | 8108 | 47 | | 34 | lav | Latvian | 100000 | 0.989 | 0.668 | 0.794 | 66756 | 724 | 4135694 | 33244 | 48 | | 35 | deu | German | 100000 | 0.813 | 0.891 | 0.775 | 89068 | 20443 | 4115975 | 10932 | 49 | | 36 | tur | Turkish | 100000 | 0.881 | 0.933 | 0.854 | 93281 | 12574 | 4123844 | 6719 | 50 | | 37 | ara | Arabic | 100000 | 0.783 | 0.924 | 0.759 | 92437 | 25579 | 4110839 | 7563 | 51 | | 38 | vie | Vietnamese | 100000 | 0.991 | 0.870 | 0.922 | 86978 | 798 | 4135620 | 13022 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.770 | 0.342 | 0.443 | 34206 | 10192 | 4126226 | 65794 | 53 | | 40 | ben | Bangla | 100000 | 0.999 | 0.947 | 0.971 | 94676 | 141 | 
4136277 | 5324 | 54 | | 41 | nld | Dutch | 100000 | 0.877 | 0.855 | 0.816 | 85450 | 12031 | 4124387 | 14550 | 55 | | 42 | urd | Urdu | 46523 | 0.970 | 0.761 | 0.842 | 35426 | 1100 | 4188795 | 11097 | 56 | | 43 | tam | Tamil | 40165 | 0.995 | 0.944 | 0.967 | 37924 | 177 | 4196076 | 2241 | 57 | | 44 | tel | Telugu | 30416 | 0.994 | 0.931 | 0.959 | 28330 | 167 | 4205835 | 2086 | 58 | | 45 | fil | Filipino | 19314 | 0.838 | 0.651 | 0.684 | 12579 | 2435 | 4214669 | 6735 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for gcld3 on open-subtitles-v2018-100k-per-lang 2 | 3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%) 4 | - **Aggregated accuracy: 73.08%** 5 | 6 |

Supported languages (107)

7 | 8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 0.930 | 0.991 | 0.927 | 99142 | 7411 | 4129007 | 858 | 15 | | 2 | swe | Swedish | 100000 | 0.835 | 0.742 | 0.729 | 74175 | 14708 | 4121710 | 25825 | 16 | | 3 | mkd | Macedonian | 100000 | 0.891 | 0.647 | 0.717 | 64667 | 7898 | 4128520 | 35333 | 17 | | 4 | tha | Thai | 100000 | 0.997 | 0.942 | 0.968 | 94203 | 263 | 4136155 | 5797 | 18 | | 5 | cat | Catalan | 100000 | 0.819 | 0.644 | 0.668 | 64430 | 14233 | 4122185 | 35570 | 19 | | 6 | bul | Bulgarian | 100000 | 0.707 | 0.707 | 0.617 | 70659 | 29255 | 4107163 | 29341 | 20 | | 7 | fin | Finnish | 100000 | 0.807 | 0.784 | 0.726 | 78440 | 18791 | 4117627 | 21560 | 21 | | 8 | dan | Danish | 100000 | 0.761 | 0.614 | 0.614 | 61404 | 19263 | 4117155 | 38596 | 22 | | 9 | hun | Hungarian | 100000 | 0.864 | 0.757 | 0.759 | 75722 | 11924 | 4124494 | 24278 | 23 | | 10 | kor | Korean | 100000 | 0.969 | 0.955 | 0.947 | 95514 | 3081 | 4133337 | 4486 | 24 | | 11 | spa | Spanish | 100000 | 0.795 | 0.666 | 0.663 | 66646 | 17195 | 4119223 | 33354 | 25 | | 12 | zho | Chinese | 100000 | 0.919 | 0.858 | 0.854 | 85780 | 7585 | 4128833 | 14220 | 26 | | 13 | slk | Slovak | 100000 | 0.789 | 0.608 | 0.629 | 60799 | 16304 | 4120114 | 39201 | 27 | | 14 | ron | Romanian | 100000 | 0.898 | 0.611 | 0.698 | 61108 | 6976 | 4129442 | 38892 | 28 | | 15 | ind | Indonesian | 100000 | 0.858 | 0.470 | 0.578 | 47018 | 7801 | 4128617 | 52982 | 29 | | 16 | est | Estonian | 100000 | 0.819 | 0.710 | 0.702 | 71021 | 15723 | 4120695 | 28979 | 30 | | 17 | por | Portuguese | 100000 | 0.814 | 0.677 | 0.681 | 67704 | 15509 | 4120909 | 32296 | 31 | | 18 | hrv | Croatian | 100000 | 0.743 | 0.357 | 0.445 | 35706 | 12370 | 4124048 | 64294 | 32 | | 19 | heb | Hebrew | 100000 | 0.999 | 0.929 | 0.962 
| 92883 | 66 | 4136352 | 7117 | 33 | | 20 | ita | Italian | 100000 | 0.815 | 0.695 | 0.691 | 69451 | 15732 | 4120686 | 30549 | 34 | | 21 | slv | Slovenian | 100000 | 0.778 | 0.627 | 0.632 | 62668 | 17892 | 4118526 | 37332 | 35 | | 22 | ces | Czech | 100000 | 0.828 | 0.659 | 0.682 | 65903 | 13732 | 4122686 | 34097 | 36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.958 | 0.979 | 95809 | 0 | 4136418 | 4191 | 37 | | 24 | lit | Lithuanian | 100000 | 0.839 | 0.679 | 0.700 | 67868 | 13048 | 4123370 | 32132 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.756 | 0.576 | 0.592 | 57647 | 18557 | 4117861 | 42353 | 39 | | 26 | pol | Polish | 100000 | 0.883 | 0.774 | 0.782 | 77363 | 10245 | 4126173 | 22637 | 40 | | 27 | fas | Persian | 100000 | 0.869 | 0.552 | 0.643 | 55250 | 8360 | 4128058 | 44750 | 41 | | 28 | jpn | Japanese | 100000 | 0.703 | 0.967 | 0.695 | 96684 | 40843 | 4095575 | 3316 | 42 | | 29 | hin | Hindi | 100000 | 0.816 | 0.735 | 0.711 | 73451 | 16519 | 4119899 | 26549 | 43 | | 30 | eng | English | 100000 | 0.691 | 0.689 | 0.597 | 68859 | 30848 | 4105570 | 31141 | 44 | | 31 | sqi | Albanian | 100000 | 0.945 | 0.692 | 0.781 | 69231 | 3995 | 4132423 | 30769 | 45 | | 32 | rus | Russian | 100000 | 0.637 | 0.696 | 0.560 | 69635 | 39608 | 4096810 | 30365 | 46 | | 33 | fra | French | 100000 | 0.863 | 0.681 | 0.718 | 68140 | 10854 | 4125564 | 31860 | 47 | | 34 | lav | Latvian | 100000 | 0.830 | 0.690 | 0.699 | 68956 | 14159 | 4122259 | 31044 | 48 | | 35 | deu | German | 100000 | 0.838 | 0.760 | 0.740 | 75994 | 14742 | 4121676 | 24006 | 49 | | 36 | tur | Turkish | 100000 | 0.883 | 0.743 | 0.766 | 74271 | 9841 | 4126577 | 25729 | 50 | | 37 | ara | Arabic | 100000 | 0.856 | 0.840 | 0.792 | 83981 | 14088 | 4122330 | 16019 | 51 | | 38 | vie | Vietnamese | 100000 | 0.906 | 0.803 | 0.815 | 80261 | 8292 | 4128126 | 19739 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.683 | 0.676 | 0.587 | 67633 | 31322 | 4105096 | 32367 | 53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.955 | 0.977 | 
95545 | 0 | 4136418 | 4455 | 54 | | 41 | nld | Dutch | 100000 | 0.841 | 0.731 | 0.728 | 73077 | 13823 | 4122595 | 26923 | 55 | | 42 | urd | Urdu | 46523 | 0.923 | 0.766 | 0.809 | 35638 | 2984 | 4186911 | 10885 | 56 | | 43 | tam | Tamil | 40165 | 0.994 | 0.929 | 0.958 | 37303 | 210 | 4196043 | 2862 | 57 | | 44 | tel | Telugu | 30416 | 0.993 | 0.898 | 0.940 | 27303 | 195 | 4205807 | 3113 | 58 | | 45 | fil | Filipino | 19314 | 0.681 | 0.561 | 0.538 | 10843 | 5069 | 4212035 | 8471 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langdetect on open-subtitles-v2018-100k-per-lang 2 | 3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%) 4 | - **Aggregated accuracy: 79.48%** 5 | 6 |

Supported languages (55)

7 | 8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 1.000 | 0.996 | 0.998 | 99615 | 0 | 4136418 | 385 | 15 | | 2 | swe | Swedish | 100000 | 0.853 | 0.734 | 0.739 | 73427 | 12668 | 4123750 | 26573 | 16 | | 3 | mkd | Macedonian | 100000 | 0.732 | 0.831 | 0.682 | 83087 | 30346 | 4106072 | 16913 | 17 | | 4 | tha | Thai | 100000 | 1.000 | 0.959 | 0.979 | 95853 | 0 | 4136418 | 4147 | 18 | | 5 | cat | Catalan | 100000 | 0.790 | 0.649 | 0.651 | 64878 | 17269 | 4119149 | 35122 | 19 | | 6 | bul | Bulgarian | 100000 | 0.718 | 0.694 | 0.620 | 69446 | 27311 | 4109107 | 30554 | 20 | | 7 | fin | Finnish | 100000 | 0.828 | 0.883 | 0.785 | 88258 | 18356 | 4118062 | 11742 | 21 | | 8 | dan | Danish | 100000 | 0.681 | 0.589 | 0.550 | 58899 | 27614 | 4108804 | 41101 | 22 | | 9 | hun | Hungarian | 100000 | 0.915 | 0.823 | 0.833 | 82309 | 7622 | 4128796 | 17691 | 23 | | 10 | kor | Korean | 100000 | 0.774 | 0.962 | 0.763 | 96150 | 28018 | 4108400 | 3850 | 24 | | 11 | spa | Spanish | 100000 | 0.791 | 0.707 | 0.680 | 70674 | 18621 | 4117797 | 29326 | 25 | | 12 | zho | Chinese | 100000 | 0.983 | 0.593 | 0.734 | 59251 | 1047 | 4135371 | 40749 | 26 | | 13 | slk | Slovak | 100000 | 0.766 | 0.628 | 0.624 | 62796 | 19226 | 4117192 | 37204 | 27 | | 14 | ron | Romanian | 100000 | 0.862 | 0.774 | 0.766 | 77434 | 12373 | 4124045 | 22566 | 28 | | 15 | ind | Indonesian | 100000 | 0.705 | 0.783 | 0.643 | 78346 | 32709 | 4103709 | 21654 | 29 | | 16 | est | Estonian | 100000 | 0.814 | 0.777 | 0.729 | 77744 | 17757 | 4118661 | 22256 | 30 | | 17 | por | Portuguese | 100000 | 0.676 | 0.751 | 0.608 | 75080 | 35971 | 4100447 | 24920 | 31 | | 18 | hrv | Croatian | 100000 | 0.670 | 0.652 | 0.568 | 65173 | 32078 | 4104340 | 34827 | 32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.990 | 0.995 | 
98997 | 0 | 4136418 | 1003 | 33 | | 20 | ita | Italian | 100000 | 0.734 | 0.780 | 0.666 | 78012 | 28203 | 4108215 | 21988 | 34 | | 21 | slv | Slovenian | 100000 | 0.657 | 0.653 | 0.559 | 65297 | 34095 | 4102323 | 34703 | 35 | | 22 | ces | Czech | 100000 | 0.825 | 0.706 | 0.704 | 70563 | 14998 | 4121420 | 29437 | 36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.976 | 0.988 | 97615 | 0 | 4136418 | 2385 | 37 | | 24 | lit | Lithuanian | 100000 | 0.873 | 0.770 | 0.772 | 77002 | 11212 | 4125206 | 22998 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.851 | 0.545 | 0.628 | 54530 | 9511 | 4126907 | 45470 | 39 | | 26 | pol | Polish | 100000 | 0.902 | 0.867 | 0.844 | 86670 | 9390 | 4127028 | 13330 | 40 | | 27 | fas | Persian | 100000 | 0.959 | 0.829 | 0.872 | 82872 | 3550 | 4132868 | 17128 | 41 | | 28 | jpn | Japanese | 100000 | 0.999 | 0.944 | 0.971 | 94447 | 59 | 4136359 | 5553 | 42 | | 29 | hin | Hindi | 100000 | 1.000 | 0.819 | 0.901 | 81904 | 0 | 4136418 | 18096 | 43 | | 30 | eng | English | 100000 | 0.593 | 0.749 | 0.539 | 74888 | 51451 | 4084967 | 25112 | 44 | | 31 | sqi | Albanian | 100000 | 0.932 | 0.808 | 0.839 | 80757 | 5914 | 4130504 | 19243 | 45 | | 32 | rus | Russian | 100000 | 0.648 | 0.793 | 0.597 | 79285 | 43117 | 4093301 | 20715 | 46 | | 33 | fra | French | 100000 | 0.757 | 0.776 | 0.682 | 77581 | 24894 | 4111524 | 22419 | 47 | | 34 | lav | Latvian | 100000 | 0.920 | 0.811 | 0.831 | 81127 | 7057 | 4129361 | 18873 | 48 | | 35 | deu | German | 100000 | 0.670 | 0.841 | 0.630 | 84127 | 41422 | 4094996 | 15873 | 49 | | 36 | tur | Turkish | 100000 | 0.895 | 0.851 | 0.829 | 85053 | 10012 | 4126406 | 14947 | 50 | | 37 | ara | Arabic | 100000 | 0.895 | 0.936 | 0.868 | 93567 | 11010 | 4125408 | 6433 | 51 | | 38 | vie | Vietnamese | 100000 | 0.927 | 0.917 | 0.890 | 91653 | 7171 | 4129247 | 8347 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.635 | 0.647 | 0.541 | 64662 | 37243 | 4099175 | 35338 | 53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.971 | 0.985 | 97127 | 0 | 
4136418 | 2873 | 54 | | 41 | nld | Dutch | 100000 | 0.774 | 0.687 | 0.658 | 68682 | 20042 | 4116376 | 31318 | 55 | | 42 | urd | Urdu | 46523 | 0.880 | 0.884 | 0.832 | 41104 | 5590 | 4184305 | 5419 | 56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.952 | 0.975 | 38236 | 2 | 4196251 | 1929 | 57 | | 44 | tel | Telugu | 30416 | 1.000 | 0.941 | 0.970 | 28630 | 0 | 4206002 | 1786 | 58 | | 45 | fil | Filipino | 19314 | 0.378 | 0.746 | 0.356 | 14401 | 23647 | 4193457 | 4913 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langid on open-subtitles-v2018-100k-per-lang 2 | 3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%) 4 | - **Aggregated accuracy: 74.19%** 5 | 6 |

Supported languages (97)

7 | 8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 0.999 | 0.996 | 0.998 | 99645 | 69 | 4136349 | 355 | 15 | | 2 | swe | Swedish | 100000 | 0.856 | 0.710 | 0.728 | 70956 | 11937 | 4124481 | 29044 | 16 | | 3 | mkd | Macedonian | 100000 | 0.782 | 0.433 | 0.517 | 43309 | 12082 | 4124336 | 56691 | 17 | | 4 | tha | Thai | 100000 | 1.000 | 0.961 | 0.980 | 96095 | 20 | 4136398 | 3905 | 18 | | 5 | cat | Catalan | 100000 | 0.930 | 0.505 | 0.639 | 50550 | 3830 | 4132588 | 49450 | 19 | | 6 | bul | Bulgarian | 100000 | 0.689 | 0.570 | 0.547 | 57031 | 25798 | 4110620 | 42969 | 20 | | 7 | fin | Finnish | 100000 | 0.813 | 0.826 | 0.749 | 82642 | 18954 | 4117464 | 17358 | 21 | | 8 | dan | Danish | 100000 | 0.674 | 0.521 | 0.514 | 52077 | 25239 | 4111179 | 47923 | 22 | | 9 | hun | Hungarian | 100000 | 0.884 | 0.795 | 0.794 | 79534 | 10437 | 4125981 | 20466 | 23 | | 10 | kor | Korean | 100000 | 0.999 | 0.962 | 0.980 | 96165 | 68 | 4136350 | 3835 | 24 | | 11 | spa | Spanish | 100000 | 0.615 | 0.719 | 0.549 | 71937 | 44996 | 4091422 | 28063 | 25 | | 12 | zho | Chinese | 100000 | 0.890 | 0.916 | 0.855 | 91584 | 11311 | 4125107 | 8416 | 26 | | 13 | slk | Slovak | 100000 | 0.796 | 0.575 | 0.615 | 57535 | 14775 | 4121643 | 42465 | 27 | | 14 | ron | Romanian | 100000 | 0.934 | 0.682 | 0.767 | 68239 | 4830 | 4131588 | 31761 | 28 | | 15 | ind | Indonesian | 100000 | 0.875 | 0.529 | 0.630 | 52912 | 7540 | 4128878 | 47088 | 29 | | 16 | est | Estonian | 100000 | 0.867 | 0.632 | 0.692 | 63160 | 9726 | 4126692 | 36840 | 30 | | 17 | por | Portuguese | 100000 | 0.833 | 0.646 | 0.678 | 64621 | 12978 | 4123440 | 35379 | 31 | | 18 | hrv | Croatian | 100000 | 0.803 | 0.505 | 0.576 | 50470 | 12351 | 4124067 | 49530 | 32 | | 19 | heb | Hebrew | 100000 | 0.999 | 0.974 | 0.985 | 
97363 | 121 | 4136297 | 2637 | 33 | | 20 | ita | Italian | 100000 | 0.729 | 0.711 | 0.635 | 71086 | 26488 | 4109930 | 28914 | 34 | | 21 | slv | Slovenian | 100000 | 0.631 | 0.569 | 0.509 | 56891 | 33249 | 4103169 | 43109 | 35 | | 22 | ces | Czech | 100000 | 0.768 | 0.694 | 0.657 | 69366 | 20953 | 4115465 | 30634 | 36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.977 | 0.988 | 97661 | 2 | 4136416 | 2339 | 37 | | 24 | lit | Lithuanian | 100000 | 0.821 | 0.710 | 0.703 | 71019 | 15497 | 4120921 | 28981 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.805 | 0.500 | 0.574 | 49995 | 12080 | 4124338 | 50005 | 39 | | 26 | pol | Polish | 100000 | 0.841 | 0.862 | 0.787 | 86150 | 16323 | 4120095 | 13850 | 40 | | 27 | fas | Persian | 100000 | 0.830 | 0.526 | 0.604 | 52589 | 10748 | 4125670 | 47411 | 41 | | 28 | jpn | Japanese | 100000 | 0.974 | 0.966 | 0.957 | 96556 | 2623 | 4133795 | 3444 | 42 | | 29 | hin | Hindi | 100000 | 0.999 | 0.788 | 0.881 | 78804 | 65 | 4136353 | 21196 | 43 | | 30 | eng | English | 100000 | 0.239 | 0.912 | 0.236 | 91242 | 291069 | 3845349 | 8758 | 44 | | 31 | sqi | Albanian | 100000 | 0.987 | 0.747 | 0.846 | 74695 | 995 | 4135423 | 25305 | 45 | | 32 | rus | Russian | 100000 | 0.592 | 0.710 | 0.528 | 71033 | 48977 | 4087441 | 28967 | 46 | | 33 | fra | French | 100000 | 0.675 | 0.766 | 0.612 | 76607 | 36932 | 4099486 | 23393 | 47 | | 34 | lav | Latvian | 100000 | 0.930 | 0.749 | 0.804 | 74883 | 5659 | 4130759 | 25117 | 48 | | 35 | deu | German | 100000 | 0.639 | 0.827 | 0.599 | 82719 | 46756 | 4089662 | 17281 | 49 | | 36 | tur | Turkish | 100000 | 0.939 | 0.778 | 0.828 | 77837 | 5089 | 4131329 | 22163 | 50 | | 37 | ara | Arabic | 100000 | 0.847 | 0.888 | 0.804 | 88771 | 15980 | 4120438 | 11229 | 51 | | 38 | vie | Vietnamese | 100000 | 0.962 | 0.899 | 0.912 | 89889 | 3570 | 4132848 | 10111 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.618 | 0.565 | 0.499 | 56473 | 34838 | 4101580 | 43527 | 53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.929 | 0.963 | 92897 | 
10 | 4136408 | 7103 | 54 | | 41 | nld | Dutch | 100000 | 0.831 | 0.780 | 0.744 | 77955 | 15815 | 4120603 | 22045 | 55 | | 42 | urd | Urdu | 46523 | 0.710 | 0.763 | 0.640 | 35497 | 14494 | 4175401 | 11026 | 56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.950 | 0.974 | 38148 | 5 | 4196248 | 2017 | 57 | | 44 | tel | Telugu | 30416 | 0.999 | 0.945 | 0.971 | 28735 | 23 | 4205979 | 1681 | 58 | | 45 | fil | Filipino | 19314 | 0.783 | 0.503 | 0.564 | 9714 | 2695 | 4214409 | 9600 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for pycld2 on open-subtitles-v2018-100k-per-lang 2 | 3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%) 4 | - **Aggregated accuracy: 68.41%** 5 | 6 |

Supported languages (83)

7 | 8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:| 14 | | 1 | ell | Greek | 100000 | 1.000 | 0.997 | 0.998 | 99664 | 26 | 4136392 | 336 | 15 | | 2 | swe | Swedish | 100000 | 0.995 | 0.632 | 0.771 | 63168 | 297 | 4136121 | 36832 | 16 | | 3 | mkd | Macedonian | 100000 | 0.973 | 0.452 | 0.612 | 45227 | 1246 | 4135172 | 54773 | 17 | | 4 | tha | Thai | 100000 | 1.000 | 0.961 | 0.980 | 96100 | 0 | 4136418 | 3900 | 18 | | 5 | cat | Catalan | 100000 | 0.991 | 0.484 | 0.648 | 48417 | 464 | 4135954 | 51583 | 19 | | 6 | bul | Bulgarian | 100000 | 0.969 | 0.570 | 0.710 | 57021 | 1805 | 4134613 | 42979 | 20 | | 7 | fin | Finnish | 100000 | 0.996 | 0.747 | 0.852 | 74729 | 300 | 4136118 | 25271 | 21 | | 8 | dan | Danish | 100000 | 0.873 | 0.573 | 0.659 | 57328 | 8331 | 4128087 | 42672 | 22 | | 9 | hun | Hungarian | 100000 | 0.999 | 0.764 | 0.865 | 76416 | 109 | 4136309 | 23584 | 23 | | 10 | kor | Korean | 100000 | 1.000 | 0.891 | 0.942 | 89101 | 0 | 4136418 | 10899 | 24 | | 11 | spa | Spanish | 100000 | 0.971 | 0.648 | 0.769 | 64824 | 1918 | 4134500 | 35176 | 25 | | 12 | zho | Chinese | 100000 | 0.999 | 0.684 | 0.812 | 68435 | 35 | 4136383 | 31565 | 26 | | 13 | slk | Slovak | 100000 | 0.904 | 0.658 | 0.732 | 65838 | 7024 | 4129394 | 34162 | 27 | | 14 | ron | Romanian | 100000 | 0.995 | 0.666 | 0.796 | 66604 | 367 | 4136051 | 33396 | 28 | | 15 | ind | Indonesian | 100000 | 0.970 | 0.626 | 0.752 | 62612 | 1912 | 4134506 | 37388 | 29 | | 16 | est | Estonian | 100000 | 0.993 | 0.668 | 0.797 | 66835 | 448 | 4135970 | 33165 | 30 | | 17 | por | Portuguese | 100000 | 0.938 | 0.667 | 0.760 | 66656 | 4394 | 4132024 | 33344 | 31 | | 18 | hrv | Croatian | 100000 | 0.968 | 0.468 | 0.625 | 46827 | 1560 | 4134858 | 53173 | 32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.619 | 0.765 | 61882 | 1 | 4136417 | 
38118 | 33 | | 20 | ita | Italian | 100000 | 0.992 | 0.504 | 0.667 | 50426 | 396 | 4136022 | 49574 | 34 | | 21 | slv | Slovenian | 100000 | 0.991 | 0.444 | 0.612 | 44391 | 384 | 4136034 | 55609 | 35 | | 22 | ces | Czech | 100000 | 0.895 | 0.716 | 0.760 | 71587 | 8361 | 4128057 | 28413 | 36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.977 | 0.988 | 97651 | 0 | 4136418 | 2349 | 37 | | 24 | lit | Lithuanian | 100000 | 0.995 | 0.659 | 0.791 | 65892 | 340 | 4136078 | 34108 | 38 | | 25 | ukr | Ukrainian | 100000 | 0.989 | 0.456 | 0.622 | 45595 | 484 | 4135934 | 54405 | 39 | | 26 | pol | Polish | 100000 | 0.997 | 0.729 | 0.841 | 72937 | 221 | 4136197 | 27063 | 40 | | 27 | fas | Persian | 100000 | 0.998 | 0.604 | 0.752 | 60362 | 119 | 4136299 | 39638 | 41 | | 28 | jpn | Japanese | 100000 | 0.988 | 0.911 | 0.942 | 91104 | 1114 | 4135304 | 8896 | 42 | | 29 | hin | Hindi | 100000 | 1.000 | 0.847 | 0.917 | 84721 | 0 | 4136418 | 15279 | 43 | | 30 | eng | English | 100000 | 0.620 | 0.744 | 0.560 | 74428 | 45610 | 4090808 | 25572 | 44 | | 31 | sqi | Albanian | 100000 | 0.999 | 0.708 | 0.828 | 70830 | 97 | 4136321 | 29170 | 45 | | 32 | rus | Russian | 100000 | 0.821 | 0.638 | 0.666 | 63789 | 13928 | 4122490 | 36211 | 46 | | 33 | fra | French | 100000 | 0.990 | 0.644 | 0.777 | 64405 | 654 | 4135764 | 35595 | 47 | | 34 | lav | Latvian | 100000 | 0.995 | 0.674 | 0.802 | 67436 | 321 | 4136097 | 32564 | 48 | | 35 | deu | German | 100000 | 0.991 | 0.733 | 0.840 | 73344 | 634 | 4135784 | 26656 | 49 | | 36 | tur | Turkish | 100000 | 0.999 | 0.747 | 0.854 | 74678 | 92 | 4136326 | 25322 | 50 | | 37 | ara | Arabic | 100000 | 0.986 | 0.653 | 0.781 | 65309 | 935 | 4135483 | 34691 | 51 | | 38 | vie | Vietnamese | 100000 | 0.999 | 0.773 | 0.871 | 77288 | 59 | 4136359 | 22712 | 52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.802 | 0.634 | 0.652 | 63433 | 15634 | 4120784 | 36567 | 53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.639 | 0.780 | 63868 | 0 | 4136418 | 36132 | 54 | | 41 | nld | Dutch | 
100000 | 0.996 | 0.676 | 0.804 | 67614 | 251 | 4136167 | 32386 | 55 | | 42 | urd | Urdu | 46523 | 0.998 | 0.677 | 0.806 | 31508 | 57 | 4189838 | 15015 | 56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.954 | 0.976 | 38307 | 2 | 4196251 | 1858 | 57 | | 44 | tel | Telugu | 30416 | 1.000 | 0.945 | 0.972 | 28732 | 0 | 4206002 | 1684 | 58 | | 45 | fil | Filipino | 19314 | 0.996 | 0.550 | 0.708 | 10626 | 43 | 4217061 | 8688 | -------------------------------------------------------------------------------- /results/open-subtitles-v2018-100k-per-lang/results.md: -------------------------------------------------------------------------------- 1 | # Aggregated results for open-subtitles-v2018-100k-per-lang 2 | 3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics | 4 | |:--------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [80.16%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md) | [See 
metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md#metrics-per-language) | 6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [75.21%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md#metrics-per-language) | 7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [73.08%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md#metrics-per-language) | 8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [79.48%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md#metrics-per-language) | 9 | | langid | 
[97](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [74.19%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md#metrics-per-language) | 10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [68.41%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md#metrics-per-language) | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for fasttext-compressed on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 97.90%** 5 | 6 |

Supported languages (176)

7 | 8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:| 14 | | 1 | eng | English | 1479733 | 0.985 | 0.997 | 0.984 | 1475222 | 22413 | 5959481 | 4511 | 15 | | 2 | rus | Russian | 849653 | 0.979 | 0.996 | 0.977 | 846620 | 18559 | 6593415 | 3033 | 16 | | 3 | ita | Italian | 787053 | 0.984 | 0.987 | 0.978 | 776749 | 12310 | 6662264 | 10304 | 17 | | 4 | tur | Turkish | 709573 | 0.998 | 0.994 | 0.995 | 705300 | 1312 | 6750742 | 4273 | 18 | | 5 | deu | German | 553727 | 0.990 | 0.993 | 0.987 | 549727 | 5449 | 6902451 | 4000 | 19 | | 6 | fra | French | 466192 | 0.984 | 0.988 | 0.978 | 460626 | 7670 | 6987765 | 5566 | 20 | | 7 | por | Portuguese | 385737 | 0.986 | 0.965 | 0.968 | 372107 | 5326 | 7070564 | 13630 | 21 | | 8 | spa | Spanish | 338781 | 0.952 | 0.977 | 0.942 | 331141 | 16754 | 7106092 | 7640 | 22 | | 9 | hun | Hungarian | 323048 | 0.992 | 0.981 | 0.982 | 316851 | 2660 | 7135919 | 6197 | 23 | | 10 | jpn | Japanese | 208761 | 0.999 | 0.997 | 0.998 | 208113 | 109 | 7252757 | 648 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.999 | 0.999 | 197021 | 1 | 7264400 | 205 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.991 | 0.916 | 0.948 | 157290 | 1385 | 7288568 | 14384 | 26 | | 13 | nld | Dutch | 144340 | 0.983 | 0.927 | 0.946 | 133805 | 2324 | 7314963 | 10535 | 27 | | 14 | fin | Finnish | 128011 | 0.986 | 0.964 | 0.968 | 123392 | 1776 | 7331840 | 4619 | 28 | | 15 | pol | Polish | 109662 | 0.979 | 0.983 | 0.970 | 107820 | 2367 | 7349598 | 1842 | 29 | | 16 | mkd | Macedonian | 77938 | 0.954 | 0.936 | 0.924 | 72941 | 3508 | 7380181 | 4997 | 30 | | 17 | mar | Marathi | 64126 | 0.993 | 0.984 | 0.985 | 63072 | 434 | 7397067 | 1054 | 31 | | 18 | lit | Lithuanian | 59659 | 0.993 | 0.916 | 0.950 | 54676 | 402 | 7401566 | 4983 | 32 | | 19 | ces | Czech | 57030 | 0.923 | 0.908 | 0.881 | 51757 | 
4326 | 7400271 | 5273 | 33 | | 20 | dan | Danish | 49399 | 0.837 | 0.871 | 0.788 | 43048 | 8398 | 7403830 | 6351 | 34 | | 21 | swe | Swedish | 41677 | 0.929 | 0.918 | 0.892 | 38263 | 2925 | 7417025 | 3414 | 35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.984 | 0.991 | 35415 | 17 | 7425619 | 576 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 0.999 | 0.999 | 34042 | 10 | 7427546 | 29 | 37 | | 24 | ron | Romanian | 24943 | 0.978 | 0.891 | 0.922 | 22222 | 509 | 7436175 | 2721 | 38 | | 25 | bul | Bulgarian | 24503 | 0.900 | 0.815 | 0.817 | 19969 | 2215 | 7434909 | 4534 | 39 | | 26 | vie | Vietnamese | 19234 | 0.987 | 0.988 | 0.981 | 19008 | 250 | 7442143 | 226 | 40 | | 27 | fil | Filipino | 16649 | 0.989 | 0.878 | 0.925 | 14619 | 165 | 7444813 | 2030 | 41 | | 28 | slk | Slovak | 14660 | 0.884 | 0.478 | 0.596 | 7005 | 919 | 7446048 | 7655 | 42 | | 29 | ind | Indonesian | 14542 | 0.955 | 0.880 | 0.897 | 12793 | 602 | 7446483 | 1749 | 43 | | 30 | hin | Hindi | 14230 | 0.933 | 0.962 | 0.916 | 13687 | 978 | 7446419 | 543 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.630 | 0.381 | 0.417 | 5421 | 3190 | 7444214 | 8802 | 45 | | 32 | cat | Catalan | 7971 | 0.821 | 0.686 | 0.691 | 5469 | 1196 | 7452460 | 2502 | 46 | | 33 | kor | Korean | 7570 | 0.996 | 0.970 | 0.981 | 7344 | 32 | 7454025 | 226 | 47 | | 34 | hrv | Croatian | 5204 | 0.702 | 0.353 | 0.427 | 1839 | 781 | 7455642 | 3365 | 48 | | 35 | ben | Bangla | 4714 | 0.998 | 0.996 | 0.996 | 4693 | 8 | 7456905 | 21 | 49 | | 36 | afr | Afrikaans | 4031 | 0.844 | 0.627 | 0.675 | 2528 | 468 | 7457128 | 1503 | 50 | | 37 | est | Estonian | 3637 | 0.745 | 0.654 | 0.622 | 2380 | 816 | 7457174 | 1257 | 51 | | 38 | tha | Thai | 3528 | 0.996 | 0.999 | 0.996 | 3524 | 13 | 7458086 | 4 | 52 | | 39 | sqi | Albanian | 2526 | 0.967 | 0.835 | 0.883 | 2109 | 72 | 7459029 | 417 | 53 | | 40 | urd | Urdu | 2008 | 0.987 | 0.954 | 0.964 | 1915 | 26 | 7459593 | 93 | 54 | | 41 | cym | Welsh | 1344 | 0.892 | 0.541 | 0.647 | 727 | 88 | 7460195 | 617 | 55 | | 
42 | slv | Slovenian | 1093 | 0.226 | 0.431 | 0.197 | 471 | 1611 | 7458923 | 622 | 56 | | 43 | mal | Malayalam | 827 | 0.981 | 0.999 | 0.980 | 826 | 16 | 7460784 | 1 | 57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 | 58 | | 45 | tel | Telugu | 254 | 0.973 | 1.000 | 0.973 | 254 | 7 | 7461366 | 0 | 59 | | 46 | pan | Punjabi | 196 | 0.933 | 1.000 | 0.933 | 196 | 14 | 7461417 | 0 | 60 | | 47 | kan | Kannada | 176 | 0.967 | 1.000 | 0.967 | 176 | 6 | 7461445 | 0 | 61 | | 48 | guj | Gujarati | 168 | 0.994 | 1.000 | 0.994 | 168 | 1 | 7461458 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for fasttext on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 98.94%** 5 | 6 |

Supported languages (176)

7 | 8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|-----:|--------:|-----:| 14 | | 1 | eng | English | 1479733 | 0.994 | 0.999 | 0.993 | 1478395 | 9216 | 5972678 | 1338 | 15 | | 2 | rus | Russian | 849653 | 0.995 | 0.998 | 0.994 | 848237 | 4129 | 6607845 | 1416 | 16 | | 3 | ita | Italian | 787053 | 0.994 | 0.996 | 0.992 | 783712 | 4984 | 6669590 | 3341 | 17 | | 4 | tur | Turkish | 709573 | 0.999 | 0.998 | 0.998 | 707878 | 812 | 6751242 | 1695 | 18 | | 5 | deu | German | 553727 | 0.996 | 0.997 | 0.994 | 552153 | 2391 | 6905509 | 1574 | 19 | | 6 | fra | French | 466192 | 0.994 | 0.994 | 0.991 | 463322 | 2814 | 6992621 | 2870 | 20 | | 7 | por | Portuguese | 385737 | 0.991 | 0.983 | 0.982 | 379214 | 3624 | 7072266 | 6523 | 21 | | 8 | spa | Spanish | 338781 | 0.979 | 0.987 | 0.972 | 334473 | 7311 | 7115535 | 4308 | 22 | | 9 | hun | Hungarian | 323048 | 0.996 | 0.993 | 0.993 | 320862 | 1315 | 7137264 | 2186 | 23 | | 10 | jpn | Japanese | 208761 | 1.000 | 1.000 | 1.000 | 208682 | 28 | 7252838 | 79 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197205 | 0 | 7264401 | 21 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.996 | 0.978 | 0.985 | 167950 | 738 | 7289215 | 3724 | 26 | | 13 | nld | Dutch | 144340 | 0.988 | 0.955 | 0.966 | 137902 | 1685 | 7315602 | 6438 | 27 | | 14 | fin | Finnish | 128011 | 0.994 | 0.981 | 0.984 | 125521 | 797 | 7332819 | 2490 | 28 | | 15 | pol | Polish | 109662 | 0.988 | 0.992 | 0.984 | 108795 | 1366 | 7350599 | 867 | 29 | | 16 | mkd | Macedonian | 77938 | 0.982 | 0.986 | 0.976 | 76873 | 1395 | 7382294 | 1065 | 30 | | 17 | mar | Marathi | 64126 | 0.999 | 0.998 | 0.998 | 64016 | 96 | 7397405 | 110 | 31 | | 18 | lit | Lithuanian | 59659 | 0.996 | 0.960 | 0.976 | 57282 | 212 | 7401756 | 2377 | 32 | | 19 | ces | Czech | 57030 | 0.945 | 0.951 | 0.923 | 54248 | 3141 | 7401456 | 
2782 | 33 | | 20 | dan | Danish | 49399 | 0.856 | 0.907 | 0.820 | 44828 | 7523 | 7404705 | 4571 | 34 | | 21 | swe | Swedish | 41677 | 0.953 | 0.948 | 0.929 | 39504 | 1944 | 7418006 | 2173 | 35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.993 | 0.997 | 35750 | 5 | 7425631 | 241 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34069 | 10 | 7427546 | 2 | 37 | | 24 | ron | Romanian | 24943 | 0.983 | 0.937 | 0.951 | 23375 | 416 | 7436268 | 1568 | 38 | | 25 | bul | Bulgarian | 24503 | 0.966 | 0.944 | 0.939 | 23127 | 802 | 7436322 | 1376 | 39 | | 26 | vie | Vietnamese | 19234 | 0.993 | 0.997 | 0.992 | 19178 | 132 | 7442261 | 56 | 40 | | 27 | fil | Filipino | 16649 | 0.992 | 0.938 | 0.961 | 15621 | 122 | 7444856 | 1028 | 41 | | 28 | slk | Slovak | 14660 | 0.934 | 0.627 | 0.731 | 9190 | 651 | 7446316 | 5470 | 42 | | 29 | ind | Indonesian | 14542 | 0.963 | 0.921 | 0.925 | 13398 | 510 | 7446575 | 1144 | 43 | | 30 | hin | Hindi | 14230 | 0.994 | 0.989 | 0.989 | 14076 | 82 | 7447315 | 154 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.676 | 0.422 | 0.462 | 5997 | 2872 | 7444532 | 8226 | 45 | | 32 | cat | Catalan | 7971 | 0.914 | 0.800 | 0.820 | 6377 | 601 | 7453055 | 1594 | 46 | | 33 | kor | Korean | 7570 | 0.997 | 0.994 | 0.994 | 7525 | 20 | 7454037 | 45 | 47 | | 34 | hrv | Croatian | 5204 | 0.786 | 0.451 | 0.532 | 2349 | 638 | 7455785 | 2855 | 48 | | 35 | ben | Bangla | 4714 | 0.998 | 0.999 | 0.998 | 4709 | 8 | 7456905 | 5 | 49 | | 36 | afr | Afrikaans | 4031 | 0.859 | 0.700 | 0.725 | 2823 | 465 | 7457131 | 1208 | 50 | | 37 | est | Estonian | 3637 | 0.842 | 0.818 | 0.770 | 2975 | 559 | 7457431 | 662 | 51 | | 38 | tha | Thai | 3528 | 0.999 | 1.000 | 0.999 | 3527 | 2 | 7458097 | 1 | 52 | | 39 | sqi | Albanian | 2526 | 0.967 | 0.865 | 0.899 | 2184 | 75 | 7459026 | 342 | 53 | | 40 | urd | Urdu | 2008 | 0.974 | 0.982 | 0.965 | 1972 | 53 | 7459566 | 36 | 54 | | 41 | cym | Welsh | 1344 | 0.965 | 0.721 | 0.813 | 969 | 35 | 7460248 | 375 | 55 | | 42 | slv | Slovenian | 1093 
| 0.436 | 0.510 | 0.360 | 557 | 721 | 7459813 | 536 | 56 | | 43 | mal | Malayalam | 827 | 0.992 | 1.000 | 0.992 | 827 | 7 | 7460793 | 0 | 57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 | 58 | | 45 | tel | Telugu | 254 | 0.981 | 1.000 | 0.981 | 254 | 5 | 7461368 | 0 | 59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 | 60 | | 47 | kan | Kannada | 176 | 0.926 | 1.000 | 0.926 | 176 | 14 | 7461437 | 0 | 61 | | 48 | guj | Gujarati | 168 | 0.988 | 1.000 | 0.988 | 168 | 2 | 7461457 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for gcld3 on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 86.98%** 5 | 6 |

Supported languages (107)

7 | 8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.995 | 0.851 | 0.915 | 1258584 | 6421 | 5975473 | 221149 | 15 | | 2 | rus | Russian | 849653 | 0.975 | 0.858 | 0.903 | 729212 | 18499 | 6593475 | 120441 | 16 | | 3 | ita | Italian | 787053 | 0.976 | 0.821 | 0.882 | 646316 | 16123 | 6658451 | 140737 | 17 | | 4 | tur | Turkish | 709573 | 0.994 | 0.894 | 0.939 | 634612 | 3790 | 6748264 | 74961 | 18 | | 5 | deu | German | 553727 | 0.978 | 0.933 | 0.945 | 516822 | 11764 | 6896136 | 36905 | 19 | | 6 | fra | French | 466192 | 0.975 | 0.860 | 0.903 | 400941 | 10202 | 6985233 | 65251 | 20 | | 7 | por | Portuguese | 385737 | 0.916 | 0.885 | 0.864 | 341427 | 31415 | 7044475 | 44310 | 21 | | 8 | spa | Spanish | 338781 | 0.924 | 0.782 | 0.819 | 264941 | 21705 | 7101141 | 73840 | 22 | | 9 | hun | Hungarian | 323048 | 0.969 | 0.895 | 0.917 | 289189 | 9311 | 7129268 | 33859 | 23 | | 10 | jpn | Japanese | 208761 | 0.983 | 0.999 | 0.983 | 208552 | 3592 | 7249274 | 209 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.991 | 0.995 | 195374 | 48 | 7264353 | 1852 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.802 | 0.889 | 0.764 | 152579 | 37666 | 7252287 | 19095 | 26 | | 13 | nld | Dutch | 144340 | 0.877 | 0.854 | 0.816 | 123199 | 17204 | 7300083 | 21141 | 27 | | 14 | fin | Finnish | 128011 | 0.954 | 0.902 | 0.907 | 115404 | 5554 | 7328062 | 12607 | 28 | | 15 | pol | Polish | 109662 | 0.918 | 0.931 | 0.888 | 102084 | 9094 | 7342871 | 7578 | 29 | | 16 | mkd | Macedonian | 77938 | 0.875 | 0.741 | 0.759 | 57730 | 8238 | 7375451 | 20208 | 30 | | 17 | mar | Marathi | 64126 | 0.989 | 0.911 | 0.944 | 58406 | 632 | 7396869 | 5720 | 31 | | 18 | lit | Lithuanian | 59659 | 0.908 | 0.883 | 0.856 | 52661 | 5352 | 7396616 | 6998 | 32 | | 19 | ces | Czech | 57030 | 0.885 | 
0.813 | 0.803 | 46341 | 6049 | 7398548 | 10689 | 33 | | 20 | dan | Danish | 49399 | 0.656 | 0.746 | 0.590 | 36848 | 19288 | 7392940 | 12551 | 34 | | 21 | swe | Swedish | 41677 | 0.794 | 0.861 | 0.746 | 35870 | 9310 | 7410640 | 5807 | 35 | | 22 | ara | Arabic | 35991 | 0.999 | 0.911 | 0.952 | 32774 | 30 | 7425606 | 3217 | 36 | | 23 | ell | Greek | 34071 | 0.806 | 1.000 | 0.806 | 34062 | 8213 | 7419343 | 9 | 37 | | 24 | ron | Romanian | 24943 | 0.623 | 0.808 | 0.580 | 20164 | 12187 | 7424497 | 4779 | 38 | | 25 | bul | Bulgarian | 24503 | 0.327 | 0.844 | 0.318 | 20688 | 42483 | 7394641 | 3815 | 39 | | 26 | vie | Vietnamese | 19234 | 0.890 | 0.981 | 0.882 | 18870 | 2332 | 7440061 | 364 | 40 | | 27 | fil | Filipino | 16649 | 0.746 | 0.780 | 0.675 | 12994 | 4426 | 7440552 | 3655 | 41 | | 28 | slk | Slovak | 14660 | 0.425 | 0.727 | 0.394 | 10664 | 14404 | 7432563 | 3996 | 42 | | 29 | ind | Indonesian | 14542 | 0.688 | 0.640 | 0.577 | 9304 | 4210 | 7442875 | 5238 | 43 | | 30 | hin | Hindi | 14230 | 0.527 | 0.880 | 0.509 | 12527 | 11232 | 7436165 | 1703 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.288 | 0.829 | 0.280 | 11791 | 29106 | 7418298 | 2432 | 45 | | 32 | cat | Catalan | 7971 | 0.181 | 0.818 | 0.177 | 6520 | 29565 | 7424091 | 1451 | 46 | | 33 | kor | Korean | 7570 | 0.917 | 0.996 | 0.915 | 7536 | 685 | 7453372 | 34 | 47 | | 34 | hrv | Croatian | 5204 | 0.270 | 0.447 | 0.231 | 2324 | 6278 | 7450145 | 2880 | 48 | | 35 | ben | Bangla | 4714 | 1.000 | 0.998 | 0.999 | 4704 | 0 | 7456913 | 10 | 49 | | 36 | afr | Afrikaans | 4031 | 0.147 | 0.865 | 0.145 | 3485 | 20233 | 7437363 | 546 | 50 | | 37 | est | Estonian | 3637 | 0.202 | 0.796 | 0.197 | 2894 | 11430 | 7446560 | 743 | 51 | | 38 | tha | Thai | 3528 | 0.995 | 0.998 | 0.994 | 3522 | 18 | 7458081 | 6 | 52 | | 39 | sqi | Albanian | 2526 | 0.332 | 0.865 | 0.323 | 2184 | 4404 | 7454697 | 342 | 53 | | 40 | urd | Urdu | 2008 | 0.882 | 0.961 | 0.867 | 1930 | 257 | 7459362 | 78 | 54 | | 41 | cym | Welsh | 1344 | 0.105 | 
0.824 | 0.104 | 1108 | 9469 | 7450814 | 236 | 55 | | 42 | slv | Slovenian | 1093 | 0.060 | 0.724 | 0.059 | 791 | 12479 | 7448055 | 302 | 56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 | 57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 | 58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 | 59 | | 46 | pan | Punjabi | 196 | 1.000 | 0.995 | 0.997 | 195 | 0 | 7461431 | 1 | 60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 | 61 | | 48 | guj | Gujarati | 168 | 1.000 | 0.982 | 0.991 | 165 | 0 | 7461459 | 3 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langdetect on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 92.47%** 5 | 6 |

Supported languages (55)

7 | 8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:| 14 | | 1 | eng | English | 1479733 | 0.988 | 0.933 | 0.954 | 1381214 | 16940 | 5964954 | 98519 | 15 | | 2 | rus | Russian | 849653 | 0.970 | 0.916 | 0.928 | 778060 | 24120 | 6587854 | 71593 | 16 | | 3 | ita | Italian | 787053 | 0.974 | 0.897 | 0.922 | 705735 | 19187 | 6655387 | 81318 | 17 | | 4 | tur | Turkish | 709573 | 0.996 | 0.971 | 0.982 | 689314 | 2543 | 6749511 | 20259 | 18 | | 5 | deu | German | 553727 | 0.985 | 0.967 | 0.969 | 535515 | 8255 | 6899645 | 18212 | 19 | | 6 | fra | French | 466192 | 0.944 | 0.947 | 0.920 | 441289 | 25946 | 6969489 | 24903 | 20 | | 7 | por | Portuguese | 385737 | 0.877 | 0.900 | 0.836 | 346971 | 48693 | 7027197 | 38766 | 21 | | 8 | spa | Spanish | 338781 | 0.916 | 0.831 | 0.838 | 281382 | 25647 | 7097199 | 57399 | 22 | | 9 | hun | Hungarian | 323048 | 0.991 | 0.950 | 0.965 | 306757 | 2935 | 7135644 | 16291 | 23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208592 | 0 | 7252866 | 169 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197226 | 0 | 7264401 | 0 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.895 | 0.796 | 0.803 | 136621 | 15996 | 7273957 | 35053 | 26 | | 13 | nld | Dutch | 144340 | 0.872 | 0.815 | 0.793 | 117650 | 17344 | 7299943 | 26690 | 27 | | 14 | fin | Finnish | 128011 | 0.943 | 0.971 | 0.930 | 124354 | 7511 | 7326105 | 3657 | 28 | | 15 | pol | Polish | 109662 | 0.985 | 0.972 | 0.971 | 106609 | 1641 | 7350324 | 3053 | 29 | | 16 | mkd | Macedonian | 77938 | 0.684 | 0.889 | 0.656 | 69298 | 32058 | 7351631 | 8640 | 30 | | 17 | mar | Marathi | 64126 | 0.997 | 0.932 | 0.962 | 59755 | 190 | 7397311 | 4371 | 31 | | 18 | lit | Lithuanian | 59659 | 0.934 | 0.944 | 0.908 | 56302 | 4006 | 7397962 | 3357 | 32 | | 19 | ces | Czech | 57030 | 0.937 | 0.848 | 0.865 
| 48335 | 3225 | 7401372 | 8695 | 33 | | 20 | dan | Danish | 49399 | 0.697 | 0.697 | 0.606 | 34438 | 14948 | 7397280 | 14961 | 34 | | 21 | swe | Swedish | 41677 | 0.815 | 0.852 | 0.761 | 35494 | 8046 | 7411904 | 6183 | 35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.979 | 0.989 | 35231 | 4 | 7425632 | 760 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 2 | 7427554 | 0 | 37 | | 24 | ron | Romanian | 24943 | 0.541 | 0.942 | 0.532 | 23508 | 19948 | 7416736 | 1435 | 38 | | 25 | bul | Bulgarian | 24503 | 0.284 | 0.783 | 0.273 | 19182 | 48422 | 7388702 | 5321 | 39 | | 26 | vie | Vietnamese | 19234 | 0.971 | 0.999 | 0.970 | 19220 | 580 | 7441813 | 14 | 40 | | 27 | fil | Filipino | 16649 | 0.579 | 0.943 | 0.569 | 15707 | 11441 | 7433537 | 942 | 41 | | 28 | slk | Slovak | 14660 | 0.520 | 0.762 | 0.481 | 11175 | 10312 | 7436655 | 3485 | 42 | | 29 | ind | Indonesian | 14542 | 0.496 | 0.943 | 0.488 | 13717 | 13953 | 7433132 | 825 | 43 | | 30 | hin | Hindi | 14230 | 0.785 | 0.957 | 0.772 | 13622 | 3722 | 7443675 | 608 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.250 | 0.816 | 0.243 | 11613 | 34844 | 7412560 | 2610 | 45 | | 32 | cat | Catalan | 7971 | 0.143 | 0.839 | 0.141 | 6686 | 39974 | 7413682 | 1285 | 46 | | 33 | kor | Korean | 7570 | 0.986 | 0.999 | 0.985 | 7560 | 108 | 7453949 | 10 | 47 | | 34 | hrv | Croatian | 5204 | 0.333 | 0.803 | 0.320 | 4181 | 8360 | 7448063 | 1023 | 48 | | 35 | ben | Bangla | 4714 | 1.000 | 1.000 | 1.000 | 4714 | 0 | 7456913 | 0 | 49 | | 36 | afr | Afrikaans | 4031 | 0.072 | 0.855 | 0.072 | 3447 | 44438 | 7413158 | 584 | 50 | | 37 | est | Estonian | 3637 | 0.195 | 0.859 | 0.192 | 3124 | 12874 | 7445116 | 513 | 51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458099 | 0 | 52 | | 39 | sqi | Albanian | 2526 | 0.571 | 0.947 | 0.562 | 2391 | 1794 | 7457307 | 135 | 53 | | 40 | urd | Urdu | 2008 | 0.921 | 0.992 | 0.918 | 1992 | 170 | 7459449 | 16 | 54 | | 41 | cym | Welsh | 1344 | 0.143 | 0.940 | 0.142 | 1263 | 7564 
| 7452719 | 81 | 55 | | 42 | slv | Slovenian | 1093 | 0.076 | 0.767 | 0.075 | 838 | 10179 | 7450355 | 255 | 56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 | 57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 | 58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 | 59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 | 60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 | 61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461459 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langid on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 90.15%** 5 | 6 |

Supported languages (97)

7 | 8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.965 | 0.973 | 0.952 | 1439789 | 52447 | 5929447 | 39944 | 15 | | 2 | rus | Russian | 849653 | 0.968 | 0.823 | 0.876 | 699008 | 23301 | 6588673 | 150645 | 16 | | 3 | ita | Italian | 787053 | 0.973 | 0.885 | 0.915 | 696690 | 19395 | 6655179 | 90363 | 17 | | 4 | tur | Turkish | 709573 | 0.997 | 0.919 | 0.955 | 652061 | 1817 | 6750237 | 57512 | 18 | | 5 | deu | German | 553727 | 0.958 | 0.974 | 0.946 | 539237 | 23461 | 6884439 | 14490 | 19 | | 6 | fra | French | 466192 | 0.921 | 0.935 | 0.893 | 436032 | 37228 | 6958207 | 30160 | 20 | | 7 | por | Portuguese | 385737 | 0.936 | 0.822 | 0.850 | 317068 | 21734 | 7054156 | 68669 | 21 | | 8 | spa | Spanish | 338781 | 0.863 | 0.837 | 0.796 | 283448 | 45158 | 7077688 | 55333 | 22 | | 9 | hun | Hungarian | 323048 | 0.980 | 0.929 | 0.944 | 300114 | 6226 | 7132353 | 22934 | 23 | | 10 | jpn | Japanese | 208761 | 0.999 | 1.000 | 0.999 | 208702 | 109 | 7252757 | 59 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.999 | 1.000 | 197110 | 10 | 7264391 | 116 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.751 | 0.774 | 0.677 | 132961 | 44090 | 7245863 | 38713 | 26 | | 13 | nld | Dutch | 144340 | 0.885 | 0.901 | 0.844 | 130115 | 16989 | 7300298 | 14225 | 27 | | 14 | fin | Finnish | 128011 | 0.953 | 0.931 | 0.920 | 119175 | 5915 | 7327701 | 8836 | 28 | | 15 | pol | Polish | 109662 | 0.960 | 0.977 | 0.950 | 107088 | 4405 | 7347560 | 2574 | 29 | | 16 | mkd | Macedonian | 77938 | 0.619 | 0.482 | 0.464 | 37554 | 23137 | 7360552 | 40384 | 30 | | 17 | mar | Marathi | 64126 | 0.988 | 0.700 | 0.815 | 44902 | 563 | 7396938 | 19224 | 31 | | 18 | lit | Lithuanian | 59659 | 0.909 | 0.915 | 0.872 | 54565 | 5495 | 7396473 | 5094 | 32 | | 19 | ces | Czech | 57030 | 0.893 | 0.837 
| 0.822 | 47732 | 5701 | 7398896 | 9298 | 33 | | 20 | dan | Danish | 49399 | 0.767 | 0.602 | 0.612 | 29753 | 9059 | 7403169 | 19646 | 34 | | 21 | swe | Swedish | 41677 | 0.808 | 0.802 | 0.735 | 33438 | 7931 | 7412019 | 8239 | 35 | | 22 | ara | Arabic | 35991 | 0.999 | 0.950 | 0.973 | 34184 | 30 | 7425606 | 1807 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7427542 | 0 | 37 | | 24 | ron | Romanian | 24943 | 0.696 | 0.906 | 0.671 | 22605 | 9895 | 7426789 | 2338 | 38 | | 25 | bul | Bulgarian | 24503 | 0.211 | 0.624 | 0.199 | 15278 | 57018 | 7380106 | 9225 | 39 | | 26 | vie | Vietnamese | 19234 | 0.959 | 0.998 | 0.958 | 19192 | 819 | 7441574 | 42 | 40 | | 27 | fil | Filipino | 16649 | 0.907 | 0.792 | 0.810 | 13181 | 1357 | 7443621 | 3468 | 41 | | 28 | slk | Slovak | 14660 | 0.545 | 0.690 | 0.486 | 10119 | 8443 | 7438524 | 4541 | 42 | | 29 | ind | Indonesian | 14542 | 0.615 | 0.731 | 0.553 | 10632 | 6650 | 7440435 | 3910 | 43 | | 30 | hin | Hindi | 14230 | 0.420 | 0.901 | 0.410 | 12825 | 17724 | 7429673 | 1405 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.315 | 0.770 | 0.301 | 10958 | 23798 | 7423606 | 3265 | 45 | | 32 | cat | Catalan | 7971 | 0.218 | 0.720 | 0.209 | 5739 | 20640 | 7433016 | 2232 | 46 | | 33 | kor | Korean | 7570 | 0.990 | 1.000 | 0.990 | 7568 | 77 | 7453980 | 2 | 47 | | 34 | hrv | Croatian | 5204 | 0.482 | 0.650 | 0.427 | 3384 | 3631 | 7452792 | 1820 | 48 | | 35 | ben | Bangla | 4714 | 0.999 | 0.977 | 0.988 | 4605 | 3 | 7456910 | 109 | 49 | | 36 | afr | Afrikaans | 4031 | 0.322 | 0.462 | 0.271 | 1861 | 3919 | 7453677 | 2170 | 50 | | 37 | est | Estonian | 3637 | 0.306 | 0.689 | 0.286 | 2505 | 5688 | 7452302 | 1132 | 51 | | 38 | tha | Thai | 3528 | 0.999 | 1.000 | 0.999 | 3528 | 3 | 7458096 | 0 | 52 | | 39 | sqi | Albanian | 2526 | 0.805 | 0.905 | 0.772 | 2285 | 555 | 7458546 | 241 | 53 | | 40 | urd | Urdu | 2008 | 0.855 | 0.947 | 0.835 | 1901 | 322 | 7459297 | 107 | 54 | | 41 | cym | Welsh | 1344 | 0.330 | 0.682 | 0.306 | 
916 | 1860 | 7458423 | 428 | 55 | | 42 | slv | Slovenian | 1093 | 0.079 | 0.681 | 0.078 | 744 | 8640 | 7451894 | 349 | 56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 | 57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 | 58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 | 59 | | 46 | pan | Punjabi | 196 | 0.990 | 1.000 | 0.990 | 196 | 2 | 7461429 | 0 | 60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 | 61 | | 48 | guj | Gujarati | 168 | 0.971 | 1.000 | 0.971 | 168 | 5 | 7461454 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for pycld2 on tatoeba-sentences-2021-06-05-common-48 2 | 3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%) 4 | - **Aggregated accuracy: 87.12%** 5 | 6 |

Supported languages (83)

7 | 8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.955 | 0.970 | 0.941 | 1435203 | 68178 | 5913716 | 44530 | 15 | | 2 | rus | Russian | 849653 | 0.998 | 0.830 | 0.905 | 705549 | 1619 | 6610355 | 144104 | 16 | | 3 | ita | Italian | 787053 | 0.999 | 0.689 | 0.816 | 542549 | 441 | 6674133 | 244504 | 17 | | 4 | tur | Turkish | 709573 | 1.000 | 0.923 | 0.960 | 654731 | 110 | 6751944 | 54842 | 18 | | 5 | deu | German | 553727 | 1.000 | 0.954 | 0.976 | 528244 | 180 | 6907720 | 25483 | 19 | | 6 | fra | French | 466192 | 0.999 | 0.845 | 0.915 | 394106 | 372 | 6995063 | 72086 | 20 | | 7 | por | Portuguese | 385737 | 0.982 | 0.865 | 0.912 | 333763 | 6080 | 7069810 | 51974 | 21 | | 8 | spa | Spanish | 338781 | 0.995 | 0.798 | 0.883 | 270207 | 1476 | 7121370 | 68574 | 22 | | 9 | hun | Hungarian | 323048 | 1.000 | 0.935 | 0.966 | 302013 | 131 | 7138448 | 21035 | 23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208635 | 0 | 7252866 | 126 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.841 | 0.914 | 165882 | 4 | 7264397 | 31344 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.992 | 0.791 | 0.877 | 135799 | 1148 | 7288805 | 35875 | 26 | | 13 | nld | Dutch | 144340 | 0.994 | 0.820 | 0.897 | 118356 | 664 | 7316623 | 25984 | 27 | | 14 | fin | Finnish | 128011 | 0.999 | 0.909 | 0.951 | 116372 | 159 | 7333457 | 11639 | 28 | | 15 | pol | Polish | 109662 | 0.999 | 0.926 | 0.961 | 101512 | 68 | 7351897 | 8150 | 29 | | 16 | mkd | Macedonian | 77938 | 0.973 | 0.477 | 0.635 | 37213 | 1038 | 7382651 | 40725 | 30 | | 17 | mar | Marathi | 64126 | 1.000 | 0.967 | 0.983 | 62024 | 24 | 7397477 | 2102 | 31 | | 18 | lit | Lithuanian | 59659 | 0.997 | 0.914 | 0.952 | 54501 | 144 | 7401824 | 5158 | 32 | | 19 | ces | Czech | 57030 | 0.971 | 0.891 | 0.917 | 50816 | 
1511 | 7403086 | 6214 | 33 | | 20 | dan | Danish | 49399 | 0.866 | 0.698 | 0.730 | 34494 | 5317 | 7406911 | 14905 | 34 | | 21 | swe | Swedish | 41677 | 0.995 | 0.761 | 0.861 | 31709 | 145 | 7419805 | 9968 | 35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.776 | 0.874 | 27916 | 1 | 7425635 | 8075 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7427542 | 0 | 37 | | 24 | ron | Romanian | 24943 | 0.963 | 0.811 | 0.866 | 20227 | 780 | 7435904 | 4716 | 38 | | 25 | bul | Bulgarian | 24503 | 0.854 | 0.700 | 0.722 | 17140 | 2925 | 7434199 | 7363 | 39 | | 26 | vie | Vietnamese | 19234 | 0.995 | 0.991 | 0.990 | 19062 | 103 | 7442290 | 172 | 40 | | 27 | fil | Filipino | 16649 | 0.994 | 0.789 | 0.878 | 13136 | 75 | 7444903 | 3513 | 41 | | 28 | slk | Slovak | 14660 | 0.704 | 0.788 | 0.643 | 11559 | 4854 | 7442113 | 3101 | 42 | | 29 | ind | Indonesian | 14542 | 0.867 | 0.775 | 0.770 | 11270 | 1727 | 7445358 | 3272 | 43 | | 30 | hin | Hindi | 14230 | 0.918 | 0.973 | 0.907 | 13848 | 1230 | 7446167 | 382 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.566 | 0.796 | 0.528 | 11327 | 8676 | 7438728 | 2896 | 45 | | 32 | cat | Catalan | 7971 | 0.807 | 0.685 | 0.681 | 5464 | 1305 | 7452351 | 2507 | 46 | | 33 | kor | Korean | 7570 | 1.000 | 0.991 | 0.995 | 7500 | 0 | 7454057 | 70 | 47 | | 34 | hrv | Croatian | 5204 | 0.940 | 0.565 | 0.690 | 2942 | 189 | 7456234 | 2262 | 48 | | 35 | ben | Bangla | 4714 | 1.000 | 0.777 | 0.874 | 3662 | 0 | 7456913 | 1052 | 49 | | 36 | afr | Afrikaans | 4031 | 0.446 | 0.826 | 0.426 | 3330 | 4129 | 7453467 | 701 | 50 | | 37 | est | Estonian | 3637 | 0.908 | 0.752 | 0.790 | 2734 | 276 | 7457714 | 903 | 51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458099 | 0 | 52 | | 39 | sqi | Albanian | 2526 | 0.962 | 0.909 | 0.918 | 2295 | 90 | 7459011 | 231 | 53 | | 40 | urd | Urdu | 2008 | 0.997 | 0.948 | 0.971 | 1903 | 5 | 7459614 | 105 | 54 | | 41 | cym | Welsh | 1344 | 0.968 | 0.845 | 0.890 | 1136 | 37 | 7460246 | 208 | 55 | | 
42 | slv | Slovenian | 1093 | 0.802 | 0.550 | 0.604 | 601 | 148 | 7460386 | 492 | 56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 | 57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 | 58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 | 59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 | 60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 | 61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461459 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05-common-48/results.md: -------------------------------------------------------------------------------- 1 | # Aggregated results for tatoeba-sentences-2021-06-05-common-48 2 | 3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics | 4 | |:--------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | 
[98.94%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md#metrics-per-language) | 6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [97.90%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md#metrics-per-language) | 7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [86.98%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md#metrics-per-language) | 8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [92.47%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md) | [See 
metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md#metrics-per-language) | 9 | | langid | [97](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [90.15%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md#metrics-per-language) | 10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [87.12%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md#metrics-per-language) | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/fasttext-compressed/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for fasttext-compressed on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 76041.91094156951 examples/s. 
5 | 6 | ## Latency 7 | - Average: 0.01315064268661526 ms/example 8 | - Standard deviation: 0.009660729486966587 9 | - Median: 0.011923 ms/example 10 | - 90th percentile: 0.019079 ms/example 11 | - 95th percentile: 0.022503 ms/example 12 | - 99th percentile: 0.033758 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/fasttext-compressed/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for fasttext-compressed on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 93406.46830679392 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.010705896691388609 ms/example 8 | - Standard deviation: 0.006436589155683585 9 | - Median: 0.01 ms/example 10 | - 90th percentile: 0.014 ms/example 11 | - 95th percentile: 0.016 ms/example 12 | - 99th percentile: 0.023 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/fasttext/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for fasttext on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 105253.30924161937 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.009500888924113552 ms/example 8 | - Standard deviation: 0.00581991143719852 9 | - Median: 0.008891 ms/example 10 | - 90th percentile: 0.013018 ms/example 11 | - 95th percentile: 0.014964 ms/example 12 | - 99th percentile: 0.021675540000000038 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/fasttext/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for fasttext on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 112223.08654995833 examples/s. 
5 | 6 | ## Latency 7 | - Average: 0.008910822458575225 ms/example 8 | - Standard deviation: 0.004349694022231488 9 | - Median: 0.008 ms/example 10 | - 90th percentile: 0.011 ms/example 11 | - 95th percentile: 0.013 ms/example 12 | - 99th percentile: 0.017 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/gcld3/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for gcld3 on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 13371.689002229674 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.07478486822668805 ms/example 8 | - Standard deviation: 0.03576524584839307 9 | - Median: 0.068239 ms/example 10 | - 90th percentile: 0.104779 ms/example 11 | - 95th percentile: 0.123671 ms/example 12 | - 99th percentile: 0.20031665000000037 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for gcld3 on tatoeba-sentences-2021-06-05 2 | 3 | - Dataset coverage (sentences in supported languages): 8261834 (85.70%) 4 | - **Aggregated accuracy: 87.11%** 5 | 6 |

Supported languages (107)

7 | 8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.995 | 0.851 | 0.915 | 1258584 | 6819 | 6775282 | 221149 | 15 | | 2 | rus | Russian | 849653 | 0.974 | 0.858 | 0.902 | 729212 | 19236 | 7392945 | 120441 | 16 | | 3 | ita | Italian | 787053 | 0.970 | 0.821 | 0.877 | 646316 | 20005 | 7454776 | 140737 | 17 | | 4 | tur | Turkish | 709573 | 0.993 | 0.894 | 0.938 | 634612 | 4617 | 7547644 | 74961 | 18 | | 5 | epo | Esperanto | 659632 | 0.963 | 0.922 | 0.925 | 608152 | 23690 | 7578512 | 51480 | 19 | | 6 | deu | German | 553727 | 0.977 | 0.933 | 0.944 | 516822 | 12347 | 7695760 | 36905 | 20 | | 7 | fra | French | 466192 | 0.973 | 0.860 | 0.902 | 400941 | 11073 | 7784569 | 65251 | 21 | | 8 | por | Portuguese | 385737 | 0.910 | 0.885 | 0.859 | 341427 | 33858 | 7842239 | 44310 | 22 | | 9 | spa | Spanish | 338781 | 0.908 | 0.782 | 0.806 | 264941 | 26864 | 7896189 | 73840 | 23 | | 10 | hun | Hungarian | 323048 | 0.968 | 0.895 | 0.916 | 289189 | 9667 | 7929119 | 33859 | 24 | | 11 | jpn | Japanese | 208761 | 0.978 | 0.999 | 0.977 | 208552 | 4759 | 8048314 | 209 | 25 | | 12 | heb | Hebrew | 197226 | 0.998 | 0.991 | 0.993 | 195374 | 409 | 8064199 | 1852 | 26 | | 13 | ukr | Ukrainian | 171674 | 0.800 | 0.889 | 0.762 | 152579 | 38095 | 8052065 | 19095 | 27 | | 14 | nld | Dutch | 144340 | 0.876 | 0.854 | 0.814 | 123199 | 17516 | 8099978 | 21141 | 28 | | 15 | fin | Finnish | 128011 | 0.941 | 0.902 | 0.895 | 115404 | 7235 | 8126588 | 12607 | 29 | | 16 | pol | Polish | 109662 | 0.915 | 0.931 | 0.885 | 102084 | 9477 | 8142695 | 7578 | 30 | | 17 | mkd | Macedonian | 77938 | 0.862 | 0.741 | 0.749 | 57730 | 9232 | 8174664 | 20208 | 31 | | 18 | mar | Marathi | 64126 | 0.989 | 0.911 | 0.943 | 58406 | 654 | 8197054 | 5720 | 32 | | 19 | lit | Lithuanian | 59659 | 
0.841 | 0.883 | 0.797 | 52661 | 9950 | 8192225 | 6998 | 33 | | 20 | ces | Czech | 57030 | 0.881 | 0.813 | 0.799 | 46341 | 6282 | 8198522 | 10689 | 34 | | 21 | dan | Danish | 49399 | 0.652 | 0.746 | 0.587 | 36848 | 19626 | 8192809 | 12551 | 35 | | 22 | srp | Serbian | 45176 | 0.317 | 0.449 | 0.265 | 20304 | 43763 | 8172895 | 24872 | 36 | | 23 | swe | Swedish | 41677 | 0.786 | 0.861 | 0.739 | 35870 | 9753 | 8210404 | 5807 | 37 | | 24 | lat | Latin | 39718 | 0.500 | 0.729 | 0.457 | 28963 | 28998 | 8193118 | 10755 | 38 | | 25 | ara | Arabic | 35991 | 0.999 | 0.911 | 0.952 | 32774 | 32 | 8225811 | 3217 | 39 | | 26 | ell | Greek | 34071 | 0.730 | 1.000 | 0.730 | 34062 | 12617 | 8215146 | 9 | 40 | | 27 | ron | Romanian | 24943 | 0.592 | 0.808 | 0.553 | 20164 | 13920 | 8222971 | 4779 | 41 | | 28 | bul | Bulgarian | 24503 | 0.318 | 0.844 | 0.309 | 20688 | 44456 | 8192875 | 3815 | 42 | | 29 | vie | Vietnamese | 19234 | 0.883 | 0.981 | 0.876 | 18870 | 2495 | 8240105 | 364 | 43 | | 30 | fil | Filipino | 16649 | 0.733 | 0.780 | 0.664 | 12994 | 4735 | 8240450 | 3655 | 44 | | 31 | slk | Slovak | 14660 | 0.411 | 0.727 | 0.381 | 10664 | 15304 | 8231870 | 3996 | 45 | | 32 | ind | Indonesian | 14542 | 0.642 | 0.640 | 0.544 | 9304 | 5180 | 8242112 | 5238 | 46 | | 33 | hin | Hindi | 14230 | 0.508 | 0.880 | 0.491 | 12527 | 12125 | 8235479 | 1703 | 47 | | 34 | nob | Norwegian Bokmål | 14223 | 0.285 | 0.829 | 0.277 | 11791 | 29626 | 8217985 | 2432 | 48 | | 35 | isl | Icelandic | 11091 | 0.523 | 0.945 | 0.515 | 10484 | 9560 | 8241183 | 607 | 49 | | 36 | cat | Catalan | 7971 | 0.176 | 0.818 | 0.173 | 6520 | 30461 | 8223402 | 1451 | 50 | | 37 | kor | Korean | 7570 | 0.915 | 0.996 | 0.913 | 7536 | 703 | 8253561 | 34 | 51 | | 38 | yid | Yiddish | 6895 | 0.790 | 0.944 | 0.772 | 6512 | 1728 | 8253211 | 383 | 52 | | 39 | eus | Basque | 6166 | 0.439 | 0.861 | 0.424 | 5306 | 6789 | 8248879 | 860 | 53 | | 40 | kat | Georgian | 5732 | 1.000 | 0.998 | 0.999 | 5720 | 2 | 8256100 | 12 | 54 | | 41 | hrv 
| Croatian | 5204 | 0.139 | 0.447 | 0.128 | 2324 | 14392 | 8242238 | 2880 | 55 | | 42 | ben | Bangla | 4714 | 1.000 | 0.998 | 0.999 | 4704 | 0 | 8257120 | 10 | 56 | | 43 | glg | Galician | 4613 | 0.064 | 0.774 | 0.063 | 3569 | 52237 | 8204984 | 1044 | 57 | | 44 | afr | Afrikaans | 4031 | 0.145 | 0.865 | 0.143 | 3485 | 20607 | 8237196 | 546 | 58 | | 45 | kaz | Kazakh | 3685 | 0.404 | 0.932 | 0.398 | 3434 | 5063 | 8253086 | 251 | 59 | | 46 | est | Estonian | 3637 | 0.165 | 0.796 | 0.162 | 2894 | 14623 | 8243574 | 743 | 60 | | 47 | tha | Thai | 3528 | 0.995 | 0.998 | 0.994 | 3522 | 18 | 8258288 | 6 | 61 | | 48 | mon | Mongolian | 2757 | 0.415 | 0.955 | 0.411 | 2633 | 3712 | 8255365 | 124 | 62 | | 49 | sqi | Albanian | 2526 | 0.288 | 0.865 | 0.282 | 2184 | 5395 | 8253913 | 342 | 63 | | 50 | gle | Irish | 2389 | 0.175 | 0.911 | 0.174 | 2177 | 10238 | 8249207 | 212 | 64 | | 51 | hye | Armenian | 2248 | 0.994 | 0.998 | 0.993 | 2243 | 13 | 8259573 | 5 | 65 | | 52 | urd | Urdu | 2008 | 0.882 | 0.961 | 0.867 | 1930 | 258 | 8259568 | 78 | 66 | | 53 | khm | Khmer | 1511 | 1.000 | 0.985 | 0.993 | 1489 | 0 | 8260323 | 22 | 67 | | 54 | ceb | Cebuano | 1478 | 0.148 | 0.571 | 0.141 | 844 | 4846 | 8255510 | 634 | 68 | | 55 | cym | Welsh | 1344 | 0.103 | 0.824 | 0.102 | 1108 | 9667 | 8250823 | 236 | 69 | | 56 | slv | Slovenian | 1093 | 0.048 | 0.724 | 0.047 | 791 | 15714 | 8245027 | 302 | 70 | | 57 | gla | Scottish Gaelic | 1033 | 0.090 | 0.867 | 0.089 | 896 | 9060 | 8251741 | 137 | 71 | | 58 | uzb | Uzbek | 855 | 0.091 | 0.581 | 0.088 | 497 | 4957 | 8256022 | 358 | 72 | | 59 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 8261007 | 0 | 73 | | 60 | ltz | Luxembourgish | 805 | 0.031 | 0.867 | 0.031 | 698 | 21519 | 8239510 | 107 | 74 | | 61 | jav | Javanese | 615 | 0.030 | 0.361 | 0.029 | 222 | 7128 | 8254091 | 393 | 75 | | 62 | bos | Bosnian | 567 | 0.012 | 0.388 | 0.011 | 220 | 18770 | 8242497 | 347 | 76 | | 63 | mya | Burmese | 433 | 1.000 | 1.000 | 1.000 | 433 | 0 | 
8261401 | 0 | 77 | | 64 | mri | Maori | 388 | 0.040 | 0.711 | 0.040 | 276 | 6622 | 8254824 | 112 | 78 | | 65 | fry | Western Frisian | 355 | 0.012 | 0.738 | 0.012 | 262 | 21054 | 8240425 | 93 | 79 | | 66 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 8261500 | 0 | 80 | | 67 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 8261580 | 0 | 81 | | 68 | kir | Kyrgyz | 254 | 0.027 | 0.878 | 0.027 | 223 | 7974 | 8253606 | 31 | 82 | | 69 | xho | Xhosa | 252 | 0.039 | 0.683 | 0.039 | 172 | 4230 | 8257352 | 80 | 83 | | 70 | lao | Lao | 219 | 1.000 | 1.000 | 1.000 | 219 | 0 | 8261615 | 0 | 84 | | 71 | amh | Amharic | 211 | 1.000 | 0.991 | 0.995 | 209 | 0 | 8261623 | 2 | 85 | | 72 | mlt | Maltese | 208 | 0.013 | 0.817 | 0.013 | 170 | 12489 | 8249137 | 38 | 86 | | 73 | pan | Punjabi | 196 | 1.000 | 0.995 | 0.997 | 195 | 0 | 8261638 | 1 | 87 | | 74 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 8261658 | 0 | 88 | | 75 | guj | Gujarati | 168 | 1.000 | 0.982 | 0.991 | 165 | 0 | 8261666 | 3 | 89 | | 76 | haw | Hawaiian | 155 | 0.010 | 0.839 | 0.010 | 130 | 12567 | 8249112 | 25 | 90 | | 77 | som | Somali | 80 | 0.019 | 0.912 | 0.019 | 73 | 3674 | 8258080 | 7 | 91 | | 78 | zul | Zulu | 77 | 0.007 | 0.753 | 0.007 | 58 | 7839 | 8253918 | 19 | 92 | | 79 | smo | Samoan | 76 | 0.008 | 0.763 | 0.008 | 58 | 7277 | 8254481 | 18 | 93 | | 80 | hat | Haitian Creole | 64 | 0.004 | 0.797 | 0.004 | 51 | 13257 | 8248513 | 13 | 94 | | 81 | tgk | Tajik | 63 | 0.008 | 0.889 | 0.008 | 56 | 7343 | 8254428 | 7 | 95 | | 82 | hau | Hausa | 60 | 0.006 | 0.800 | 0.006 | 48 | 7835 | 8253939 | 12 | 96 | | 83 | mlg | Malagasy | 59 | 0.003 | 0.831 | 0.003 | 49 | 14975 | 8246800 | 10 | 97 | | 84 | sin | Sinhala | 45 | 1.000 | 1.000 | 1.000 | 45 | 0 | 8261789 | 0 | 98 | | 85 | pus | Pashto | 44 | 0.098 | 0.864 | 0.097 | 38 | 350 | 8261440 | 6 | 99 | | 86 | sna | Shona | 42 | 0.006 | 0.667 | 0.005 | 28 | 5062 | 8256730 | 14 | 100 | | 87 | yor | Yoruba | 37 | 0.001 | 0.108 | 0.001 | 4 | 
3799 | 8257998 | 33 | 101 | | 88 | ibo | Igbo | 32 | 0.002 | 0.688 | 0.002 | 22 | 10813 | 8250989 | 10 | 102 | | 89 | sun | Sundanese | 31 | 0.003 | 0.548 | 0.003 | 17 | 5505 | 8256298 | 14 | 103 | | 90 | nya | Nyanja | 24 | 0.010 | 0.833 | 0.010 | 20 | 2072 | 8259738 | 4 | 104 | | 91 | cos | Corsican | 24 | 0.000 | 0.583 | 0.000 | 14 | 35270 | 8226540 | 10 | 105 | | 92 | snd | Sindhi | 6 | 0.003 | 0.833 | 0.003 | 5 | 1440 | 8260388 | 1 | 106 | | 93 | sot | Southern Sotho | 2 | 0.001 | 1.000 | 0.001 | 2 | 3240 | 8258592 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/gcld3/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for gcld3 on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 17494.196652851704 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.05716181313401386 ms/example 8 | - Standard deviation: 0.02535098112972992 9 | - Median: 0.052 ms/example 10 | - 90th percentile: 0.08 ms/example 11 | - 95th percentile: 0.095 ms/example 12 | - 99th percentile: 0.148 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langdetect/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for langdetect on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 239.74773586805924 examples/s. 
5 | 6 | ## Latency 7 | - Average: 4.171050860519207 ms/example 8 | - Standard deviation: 4.538692148545434 9 | - Median: 2.715283 ms/example 10 | - 90th percentile: 8.73063 ms/example 11 | - 95th percentile: 12.929049399999995 ms/example 12 | - 99th percentile: 24.198700639999984 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langdetect on tatoeba-sentences-2021-06-05 2 | 3 | - Dataset coverage (sentences in supported languages): 7461707 (77.40%) 4 | - **Aggregated accuracy: 92.45%** 5 | 6 |

Supported languages (55)

7 | 8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:| 14 | | 1 | eng | English | 1479733 | 0.988 | 0.933 | 0.954 | 1380911 | 17010 | 5964964 | 98822 | 15 | | 2 | rus | Russian | 849653 | 0.970 | 0.916 | 0.928 | 777866 | 24227 | 6587827 | 71787 | 16 | | 3 | ita | Italian | 787053 | 0.973 | 0.897 | 0.922 | 705630 | 19337 | 6655317 | 81423 | 17 | | 4 | tur | Turkish | 709573 | 0.996 | 0.971 | 0.982 | 689294 | 2589 | 6749545 | 20279 | 18 | | 5 | deu | German | 553727 | 0.985 | 0.967 | 0.968 | 535363 | 8311 | 6899669 | 18364 | 19 | | 6 | fra | French | 466192 | 0.945 | 0.946 | 0.920 | 441184 | 25895 | 6969620 | 25008 | 20 | | 7 | por | Portuguese | 385737 | 0.877 | 0.899 | 0.836 | 346968 | 48757 | 7027213 | 38769 | 21 | | 8 | spa | Spanish | 338781 | 0.917 | 0.830 | 0.838 | 281211 | 25423 | 7097503 | 57570 | 22 | | 9 | hun | Hungarian | 323048 | 0.990 | 0.950 | 0.965 | 306776 | 3022 | 7135637 | 16272 | 23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208586 | 0 | 7252946 | 175 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197225 | 0 | 7264481 | 1 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.895 | 0.796 | 0.803 | 136663 | 16080 | 7273953 | 35011 | 26 | | 13 | nld | Dutch | 144340 | 0.871 | 0.815 | 0.793 | 117639 | 17432 | 7299935 | 26701 | 27 | | 14 | fin | Finnish | 128011 | 0.943 | 0.971 | 0.930 | 124344 | 7534 | 7326162 | 3667 | 28 | | 15 | pol | Polish | 109662 | 0.985 | 0.972 | 0.971 | 106595 | 1645 | 7350400 | 3067 | 29 | | 16 | mkd | Macedonian | 77938 | 0.684 | 0.889 | 0.656 | 69298 | 32067 | 7351702 | 8640 | 30 | | 17 | mar | Marathi | 64126 | 0.997 | 0.932 | 0.962 | 59783 | 194 | 7397387 | 4343 | 31 | | 18 | lit | Lithuanian | 59659 | 0.933 | 0.944 | 0.907 | 56303 | 4063 | 7397985 | 3356 | 32 | | 19 | ces | Czech | 57030 | 0.937 | 0.848 | 0.864 
| 48335 | 3243 | 7401434 | 8695 | 33 | | 20 | dan | Danish | 49399 | 0.695 | 0.697 | 0.604 | 34415 | 15080 | 7397228 | 14984 | 34 | | 21 | swe | Swedish | 41677 | 0.815 | 0.852 | 0.761 | 35494 | 8039 | 7411991 | 6183 | 35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.979 | 0.989 | 35240 | 4 | 7425712 | 751 | 36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 2 | 7427634 | 0 | 37 | | 24 | ron | Romanian | 24943 | 0.539 | 0.942 | 0.531 | 23490 | 20051 | 7416713 | 1453 | 38 | | 25 | bul | Bulgarian | 24503 | 0.284 | 0.782 | 0.273 | 19154 | 48399 | 7388805 | 5349 | 39 | | 26 | vie | Vietnamese | 19234 | 0.969 | 0.999 | 0.969 | 19220 | 607 | 7441866 | 14 | 40 | | 27 | fil | Filipino | 16649 | 0.579 | 0.941 | 0.568 | 15674 | 11413 | 7433645 | 975 | 41 | | 28 | slk | Slovak | 14660 | 0.519 | 0.762 | 0.480 | 11167 | 10366 | 7436681 | 3493 | 42 | | 29 | ind | Indonesian | 14542 | 0.495 | 0.942 | 0.488 | 13698 | 13963 | 7433202 | 844 | 43 | | 30 | hin | Hindi | 14230 | 0.786 | 0.957 | 0.773 | 13618 | 3697 | 7443780 | 612 | 44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.251 | 0.818 | 0.244 | 11639 | 34806 | 7412678 | 2584 | 45 | | 32 | cat | Catalan | 7971 | 0.143 | 0.841 | 0.141 | 6702 | 40066 | 7413670 | 1269 | 46 | | 33 | kor | Korean | 7570 | 0.985 | 0.999 | 0.984 | 7559 | 119 | 7454018 | 11 | 47 | | 34 | hrv | Croatian | 5204 | 0.334 | 0.805 | 0.321 | 4189 | 8362 | 7448141 | 1015 | 48 | | 35 | ben | Bangla | 4714 | 1.000 | 1.000 | 1.000 | 4714 | 0 | 7456993 | 0 | 49 | | 36 | afr | Afrikaans | 4031 | 0.072 | 0.854 | 0.071 | 3441 | 44440 | 7413236 | 590 | 50 | | 37 | est | Estonian | 3637 | 0.196 | 0.860 | 0.193 | 3129 | 12850 | 7445220 | 508 | 51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458179 | 0 | 52 | | 39 | sqi | Albanian | 2526 | 0.564 | 0.947 | 0.555 | 2391 | 1846 | 7457335 | 135 | 53 | | 40 | urd | Urdu | 2008 | 0.922 | 0.992 | 0.918 | 1991 | 169 | 7459530 | 17 | 54 | | 41 | cym | Welsh | 1344 | 0.142 | 0.940 | 0.142 | 1264 | 7614 
| 7452749 | 80 | 55 | | 42 | slv | Slovenian | 1093 | 0.074 | 0.752 | 0.073 | 822 | 10262 | 7450352 | 271 | 56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460880 | 0 | 57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461373 | 0 | 58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461453 | 0 | 59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461511 | 0 | 60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461531 | 0 | 61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461539 | 0 | 62 | | 49 | som | Somali | 80 | 0.014 | 0.988 | 0.014 | 79 | 5524 | 7456103 | 1 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langdetect/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for langdetect on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 234.35885994454733 examples/s. 5 | 6 | ## Latency 7 | - Average: 4.26696050764462 ms/example 8 | - Standard deviation: 7.405121715428491 9 | - Median: 2.649 ms/example 10 | - 90th percentile: 9.023 ms/example 11 | - 95th percentile: 13.456 ms/example 12 | - 99th percentile: 25.658 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langid/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for langid on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 896.817073745205 examples/s. 
5 | 6 | ## Latency 7 | - Average: 1.1150545961662974 ms/example 8 | - Standard deviation: 0.5163837711515092 9 | - Median: 1.060864 ms/example 10 | - 90th percentile: 1.34925 ms/example 11 | - 95th percentile: 1.472716 ms/example 12 | - 99th percentile: 1.7471153599999993 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langid/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for langid on tatoeba-sentences-2021-06-05 2 | 3 | - Dataset coverage (sentences in supported languages): 8298609 (86.08%) 4 | - **Aggregated accuracy: 89.00%** 5 | 6 |

Supported languages (97)

7 | 8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:------------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.953 | 0.973 | 0.941 | 1439789 | 70832 | 6748044 | 39944 | 15 | | 2 | rus | Russian | 849653 | 0.966 | 0.823 | 0.875 | 699008 | 24596 | 7424360 | 150645 | 16 | | 3 | ita | Italian | 787053 | 0.954 | 0.885 | 0.898 | 696690 | 33674 | 7477882 | 90363 | 17 | | 4 | tur | Turkish | 709573 | 0.995 | 0.919 | 0.953 | 652061 | 3202 | 7585834 | 57512 | 18 | | 5 | epo | Esperanto | 659632 | 0.983 | 0.854 | 0.907 | 563174 | 9712 | 7629265 | 96458 | 19 | | 6 | deu | German | 553727 | 0.950 | 0.974 | 0.938 | 539237 | 28349 | 7716533 | 14490 | 20 | | 7 | fra | French | 466192 | 0.897 | 0.935 | 0.870 | 436032 | 49971 | 7782446 | 30160 | 21 | | 8 | por | Portuguese | 385737 | 0.915 | 0.822 | 0.832 | 317068 | 29566 | 7883306 | 68669 | 22 | | 9 | spa | Spanish | 338781 | 0.784 | 0.837 | 0.728 | 283448 | 78313 | 7881515 | 55333 | 23 | | 10 | hun | Hungarian | 323048 | 0.977 | 0.929 | 0.942 | 300114 | 7043 | 7968518 | 22934 | 24 | | 11 | jpn | Japanese | 208761 | 0.997 | 1.000 | 0.997 | 208702 | 585 | 8089263 | 59 | 25 | | 12 | heb | Hebrew | 197226 | 1.000 | 0.999 | 1.000 | 197110 | 14 | 8101369 | 116 | 26 | | 13 | ukr | Ukrainian | 171674 | 0.748 | 0.774 | 0.675 | 132961 | 44688 | 8082247 | 38713 | 27 | | 14 | nld | Dutch | 144340 | 0.869 | 0.901 | 0.830 | 130115 | 19610 | 8134659 | 14225 | 28 | | 15 | fin | Finnish | 128011 | 0.941 | 0.931 | 0.909 | 119175 | 7527 | 8163071 | 8836 | 29 | | 16 | pol | Polish | 109662 | 0.944 | 0.977 | 0.933 | 107088 | 6349 | 8182598 | 2574 | 30 | | 17 | mkd | Macedonian | 77938 | 0.582 | 0.482 | 0.443 | 37554 | 27012 | 8193659 | 40384 | 31 | | 18 | mar | Marathi | 64126 | 0.988 | 0.700 | 0.815 | 44902 | 563 | 8233920 | 19224 | 32 | | 19 | lit | Lithuanian | 59659 | 0.774 
| 0.915 | 0.747 | 54565 | 15938 | 8223012 | 5094 | 33 | | 20 | ces | Czech | 57030 | 0.879 | 0.837 | 0.809 | 47732 | 6594 | 8234985 | 9298 | 34 | | 21 | dan | Danish | 49399 | 0.739 | 0.602 | 0.594 | 29753 | 10483 | 8238727 | 19646 | 35 | | 22 | srp | Serbian | 45176 | 0.214 | 0.392 | 0.184 | 17727 | 64947 | 8188486 | 27449 | 36 | | 23 | swe | Swedish | 41677 | 0.782 | 0.802 | 0.713 | 33438 | 9345 | 8247587 | 8239 | 37 | | 24 | lat | Latin | 39718 | 0.939 | 0.196 | 0.322 | 7803 | 509 | 8258382 | 31915 | 38 | | 25 | ara | Arabic | 35991 | 0.999 | 0.950 | 0.973 | 34184 | 46 | 8262572 | 1807 | 39 | | 26 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 15 | 8264523 | 0 | 40 | | 27 | ron | Romanian | 24943 | 0.637 | 0.906 | 0.617 | 22605 | 12867 | 8260799 | 2338 | 41 | | 28 | bul | Bulgarian | 24503 | 0.209 | 0.624 | 0.197 | 15278 | 57734 | 8216372 | 9225 | 42 | | 29 | vie | Vietnamese | 19234 | 0.956 | 0.998 | 0.955 | 19192 | 886 | 8278489 | 42 | 43 | | 30 | fil | Filipino | 16649 | 0.876 | 0.792 | 0.786 | 13181 | 1860 | 8280100 | 3468 | 44 | | 31 | slk | Slovak | 14660 | 0.525 | 0.690 | 0.470 | 10119 | 9140 | 8274809 | 4541 | 45 | | 32 | ind | Indonesian | 14542 | 0.528 | 0.731 | 0.481 | 10632 | 9494 | 8274573 | 3910 | 46 | | 33 | hin | Hindi | 14230 | 0.420 | 0.901 | 0.410 | 12825 | 17727 | 8266652 | 1405 | 47 | | 34 | nob | Norwegian Bokmål | 14223 | 0.305 | 0.770 | 0.292 | 10958 | 24918 | 8259468 | 3265 | 48 | | 35 | bel | Belarusian | 12633 | 0.437 | 0.879 | 0.424 | 11106 | 14321 | 8271655 | 1527 | 49 | | 36 | isl | Icelandic | 11091 | 0.874 | 0.929 | 0.846 | 10300 | 1484 | 8286034 | 791 | 50 | | 37 | cat | Catalan | 7971 | 0.200 | 0.720 | 0.193 | 5739 | 22950 | 8267688 | 2232 | 51 | | 38 | uig | Uyghur | 7792 | 0.961 | 0.989 | 0.956 | 7707 | 316 | 8290501 | 85 | 52 | | 39 | kor | Korean | 7570 | 0.989 | 1.000 | 0.989 | 7568 | 83 | 8290956 | 2 | 53 | | 40 | bre | Breton | 7195 | 0.423 | 0.630 | 0.377 | 4535 | 6179 | 8285235 | 2660 | 54 | | 41 | eus | Basque 
| 6166 | 0.324 | 0.866 | 0.316 | 5338 | 11150 | 8281293 | 828 | 55 | | 42 | kat | Georgian | 5732 | 0.997 | 0.996 | 0.995 | 5710 | 19 | 8292858 | 22 | 56 | | 43 | oci | Occitan | 5693 | 0.341 | 0.505 | 0.292 | 2873 | 5547 | 8287369 | 2820 | 57 | | 44 | aze | Azerbaijani | 5348 | 0.233 | 0.756 | 0.224 | 4044 | 13343 | 8279918 | 1304 | 58 | | 45 | hrv | Croatian | 5204 | 0.155 | 0.650 | 0.149 | 3384 | 18473 | 8274932 | 1820 | 59 | | 46 | ben | Bangla | 4714 | 0.659 | 0.977 | 0.654 | 4605 | 2380 | 8291515 | 109 | 60 | | 47 | glg | Galician | 4613 | 0.066 | 0.520 | 0.064 | 2400 | 33789 | 8260207 | 2213 | 61 | | 48 | vol | Volapük | 4132 | 0.554 | 0.265 | 0.313 | 1093 | 880 | 8293597 | 3039 | 62 | | 49 | afr | Afrikaans | 4031 | 0.300 | 0.462 | 0.255 | 1861 | 4338 | 8290240 | 2170 | 63 | | 50 | kaz | Kazakh | 3685 | 0.368 | 0.943 | 0.364 | 3476 | 5968 | 8288956 | 209 | 64 | | 51 | est | Estonian | 3637 | 0.226 | 0.689 | 0.215 | 2505 | 8560 | 8286412 | 1132 | 65 | | 52 | tha | Thai | 3528 | 0.998 | 1.000 | 0.998 | 3528 | 7 | 8295074 | 0 | 66 | | 53 | asm | Assamese | 2912 | 0.853 | 0.227 | 0.348 | 662 | 114 | 8295583 | 2250 | 67 | | 54 | mon | Mongolian | 2757 | 0.288 | 0.950 | 0.286 | 2618 | 6463 | 8289389 | 139 | 68 | | 55 | sqi | Albanian | 2526 | 0.764 | 0.905 | 0.735 | 2285 | 704 | 8295379 | 241 | 69 | | 56 | gle | Irish | 2389 | 0.392 | 0.840 | 0.378 | 2007 | 3108 | 8293112 | 382 | 70 | | 57 | hye | Armenian | 2248 | 0.992 | 0.911 | 0.946 | 2048 | 17 | 8296344 | 200 | 71 | | 58 | urd | Urdu | 2008 | 0.853 | 0.947 | 0.833 | 1901 | 328 | 8296273 | 107 | 72 | | 59 | nno | Norwegian Nynorsk | 1576 | 0.144 | 0.501 | 0.134 | 789 | 4696 | 8292337 | 787 | 73 | | 60 | khm | Khmer | 1511 | 0.997 | 0.975 | 0.985 | 1473 | 4 | 8297094 | 38 | 74 | | 61 | cym | Welsh | 1344 | 0.300 | 0.682 | 0.281 | 916 | 2133 | 8295132 | 428 | 75 | | 62 | slv | Slovenian | 1093 | 0.044 | 0.681 | 0.044 | 744 | 16127 | 8281389 | 349 | 76 | | 63 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 
| 0 | 8297782 | 0 | 77 | | 64 | ltz | Luxembourgish | 805 | 0.383 | 0.343 | 0.280 | 276 | 444 | 8297360 | 529 | 78 | | 65 | jav | Javanese | 615 | 0.105 | 0.486 | 0.100 | 299 | 2542 | 8295452 | 316 | 79 | | 66 | bos | Bosnian | 567 | 0.011 | 0.053 | 0.010 | 30 | 2722 | 8295320 | 537 | 80 | | 67 | que | Quechua | 422 | 0.549 | 0.382 | 0.380 | 161 | 132 | 8298055 | 261 | 81 | | 68 | fao | Faroese | 402 | 0.253 | 0.281 | 0.191 | 113 | 333 | 8297874 | 289 | 82 | | 69 | ori | Odia | 374 | 1.000 | 0.992 | 0.996 | 371 | 0 | 8298235 | 3 | 83 | | 70 | tam | Tamil | 334 | 0.988 | 1.000 | 0.988 | 334 | 4 | 8298271 | 0 | 84 | | 71 | tel | Telugu | 254 | 0.996 | 1.000 | 0.996 | 254 | 1 | 8298354 | 0 | 85 | | 72 | kir | Kyrgyz | 254 | 0.049 | 0.118 | 0.041 | 30 | 588 | 8297767 | 224 | 86 | | 73 | xho | Xhosa | 252 | 0.110 | 0.575 | 0.106 | 145 | 1174 | 8297183 | 107 | 87 | | 74 | lao | Lao | 219 | 0.952 | 1.000 | 0.952 | 219 | 11 | 8298379 | 0 | 88 | | 75 | amh | Amharic | 211 | 0.770 | 1.000 | 0.770 | 211 | 63 | 8298335 | 0 | 89 | | 76 | mlt | Maltese | 208 | 0.034 | 0.817 | 0.034 | 170 | 4791 | 8293610 | 38 | 90 | | 77 | pan | Punjabi | 196 | 0.985 | 1.000 | 0.985 | 196 | 3 | 8298410 | 0 | 91 | | 78 | sme | Northern Sami | 181 | 0.190 | 0.320 | 0.158 | 58 | 248 | 8298180 | 123 | 92 | | 79 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 8298433 | 0 | 93 | | 80 | guj | Gujarati | 168 | 0.966 | 1.000 | 0.966 | 168 | 6 | 8298435 | 0 | 94 | | 81 | arg | Aragonese | 103 | 0.002 | 0.029 | 0.002 | 3 | 1529 | 8296977 | 100 | 95 | | 82 | zul | Zulu | 77 | 0.054 | 0.286 | 0.051 | 22 | 383 | 8298149 | 55 | 96 | | 83 | hat | Haitian Creole | 64 | 0.008 | 0.406 | 0.008 | 26 | 3176 | 8295369 | 38 | 97 | | 84 | mlg | Malagasy | 59 | 0.017 | 0.559 | 0.017 | 33 | 1949 | 8296601 | 26 | 98 | | 85 | wln | Walloon | 53 | 0.015 | 0.528 | 0.015 | 28 | 1847 | 8296709 | 25 | 99 | | 86 | sin | Sinhala | 45 | 0.738 | 1.000 | 0.738 | 45 | 16 | 8298548 | 0 | 100 | | 87 | pus | Pashto | 44 | 0.178 
| 0.432 | 0.159 | 19 | 88 | 8298477 | 25 | 101 | | 88 | kin | Kinyarwanda | 28 | 0.005 | 0.214 | 0.005 | 6 | 1135 | 8297446 | 22 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/langid/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for langid on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 1268.5839534203296 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.7882805054437436 ms/example 8 | - Standard deviation: 0.47807791658786386 9 | - Median: 0.718 ms/example 10 | - 90th percentile: 1.072 ms/example 11 | - 95th percentile: 1.202 ms/example 12 | - 99th percentile: 1.59 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/pycld2/c5.xlarge_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for pycld2 on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 208037.00199835165 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.004806837199124429 ms/example 8 | - Standard deviation: 0.00467348719666187 9 | - Median: 0.004474 ms/example 10 | - 90th percentile: 0.006108 ms/example 11 | - 95th percentile: 0.007004 ms/example 12 | - 99th percentile: 0.010547 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for pycld2 on tatoeba-sentences-2021-06-05 2 | 3 | - Dataset coverage (sentences in supported languages): 7569549 (78.52%) 4 | - **Aggregated accuracy: 86.95%** 5 | 6 |

Supported languages (83)

7 | 8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese) 9 | 10 |

Stats per language

11 | 12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn | 13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:| 14 | | 1 | eng | English | 1479733 | 0.954 | 0.970 | 0.940 | 1435203 | 68766 | 6021050 | 44530 | 15 | | 2 | rus | Russian | 849653 | 0.998 | 0.830 | 0.905 | 705549 | 1667 | 6718229 | 144104 | 16 | | 3 | ita | Italian | 787053 | 0.999 | 0.689 | 0.816 | 542549 | 442 | 6782054 | 244504 | 17 | | 4 | tur | Turkish | 709573 | 1.000 | 0.923 | 0.960 | 654731 | 191 | 6859785 | 54842 | 18 | | 5 | deu | German | 553727 | 1.000 | 0.954 | 0.976 | 528244 | 184 | 7015638 | 25483 | 19 | | 6 | fra | French | 466192 | 0.999 | 0.845 | 0.915 | 394106 | 374 | 7102983 | 72086 | 20 | | 7 | por | Portuguese | 385737 | 0.981 | 0.865 | 0.912 | 333763 | 6371 | 7177441 | 51974 | 21 | | 8 | spa | Spanish | 338781 | 0.994 | 0.798 | 0.883 | 270207 | 1677 | 7229091 | 68574 | 22 | | 9 | hun | Hungarian | 323048 | 1.000 | 0.935 | 0.966 | 302013 | 133 | 7246368 | 21035 | 23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208635 | 0 | 7360788 | 126 | 24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.841 | 0.914 | 165882 | 10 | 7372313 | 31344 | 25 | | 12 | ukr | Ukrainian | 171674 | 0.991 | 0.791 | 0.877 | 135799 | 1168 | 7396707 | 35875 | 26 | | 13 | nld | Dutch | 144340 | 0.994 | 0.820 | 0.897 | 118356 | 664 | 7424545 | 25984 | 27 | | 14 | fin | Finnish | 128011 | 0.999 | 0.909 | 0.951 | 116372 | 161 | 7441377 | 11639 | 28 | | 15 | pol | Polish | 109662 | 0.999 | 0.926 | 0.961 | 101512 | 75 | 7459812 | 8150 | 29 | | 16 | mkd | Macedonian | 77938 | 0.969 | 0.477 | 0.633 | 37213 | 1178 | 7490433 | 40725 | 30 | | 17 | mar | Marathi | 64126 | 1.000 | 0.967 | 0.983 | 62024 | 24 | 7505399 | 2102 | 31 | | 18 | lit | Lithuanian | 59659 | 0.997 | 0.914 | 0.952 | 54501 | 144 | 7509746 | 5158 | 32 | | 19 | ces | Czech | 57030 | 0.970 | 0.891 | 0.916 | 50816 | 
1551 | 7510968 | 6214 | 33 | | 20 | dan | Danish | 49399 | 0.866 | 0.698 | 0.729 | 34494 | 5341 | 7514809 | 14905 | 34 | | 21 | srp | Serbian | 45176 | 0.246 | 0.564 | 0.225 | 25486 | 77950 | 7446423 | 19690 | 35 | | 22 | swe | Swedish | 41677 | 0.995 | 0.761 | 0.861 | 31709 | 145 | 7527727 | 9968 | 36 | | 23 | ara | Arabic | 35991 | 1.000 | 0.776 | 0.874 | 27916 | 1 | 7533557 | 8075 | 37 | | 24 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7535464 | 0 | 38 | | 25 | ron | Romanian | 24943 | 0.963 | 0.811 | 0.865 | 20227 | 787 | 7543819 | 4716 | 39 | | 26 | bul | Bulgarian | 24503 | 0.852 | 0.700 | 0.721 | 17140 | 2967 | 7542079 | 7363 | 40 | | 27 | vie | Vietnamese | 19234 | 0.995 | 0.991 | 0.990 | 19062 | 103 | 7550212 | 172 | 41 | | 28 | fil | Filipino | 16649 | 0.988 | 0.789 | 0.872 | 13136 | 166 | 7552734 | 3513 | 42 | | 29 | slk | Slovak | 14660 | 0.693 | 0.788 | 0.634 | 11559 | 5110 | 7549779 | 3101 | 43 | | 30 | ind | Indonesian | 14542 | 0.864 | 0.775 | 0.768 | 11270 | 1773 | 7553234 | 3272 | 44 | | 31 | hin | Hindi | 14230 | 0.918 | 0.973 | 0.907 | 13848 | 1230 | 7554089 | 382 | 45 | | 32 | nob | Norwegian Bokmål | 14223 | 0.566 | 0.796 | 0.528 | 11327 | 8682 | 7546644 | 2896 | 46 | | 33 | bel | Belarusian | 12633 | 0.929 | 0.885 | 0.876 | 11176 | 855 | 7556061 | 1457 | 47 | | 34 | isl | Icelandic | 11091 | 0.996 | 0.925 | 0.957 | 10261 | 43 | 7558415 | 830 | 48 | | 35 | cat | Catalan | 7971 | 0.806 | 0.685 | 0.680 | 5464 | 1317 | 7560261 | 2507 | 49 | | 36 | kor | Korean | 7570 | 1.000 | 0.991 | 0.995 | 7500 | 0 | 7561979 | 70 | 50 | | 37 | yid | Yiddish | 6895 | 0.991 | 0.937 | 0.959 | 6460 | 60 | 7562594 | 435 | 51 | | 38 | eus | Basque | 6166 | 0.972 | 0.893 | 0.918 | 5505 | 158 | 7563225 | 661 | 52 | | 39 | kat | Georgian | 5732 | 1.000 | 1.000 | 1.000 | 5731 | 0 | 7563817 | 1 | 53 | | 40 | aze | Azerbaijani | 5348 | 0.509 | 0.870 | 0.490 | 4651 | 4486 | 7559715 | 697 | 54 | | 41 | hrv | Croatian | 5204 | 0.270 | 0.565 | 0.244 | 2942 | 
7960 | 7556385 | 2262 | 55 | | 42 | ben | Bangla | 4714 | 1.000 | 0.777 | 0.874 | 3662 | 0 | 7564835 | 1052 | 56 | | 43 | glg | Galician | 4613 | 0.292 | 0.668 | 0.273 | 3081 | 7456 | 7557480 | 1532 | 57 | | 44 | afr | Afrikaans | 4031 | 0.446 | 0.826 | 0.426 | 3330 | 4133 | 7561385 | 701 | 58 | | 45 | est | Estonian | 3637 | 0.907 | 0.752 | 0.789 | 2734 | 279 | 7565633 | 903 | 59 | | 46 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7566021 | 0 | 60 | | 47 | sqi | Albanian | 2526 | 0.962 | 0.909 | 0.917 | 2295 | 91 | 7566932 | 231 | 61 | | 48 | gle | Irish | 2389 | 0.938 | 0.884 | 0.883 | 2112 | 140 | 7567020 | 277 | 62 | | 49 | hye | Armenian | 2248 | 1.000 | 1.000 | 1.000 | 2247 | 0 | 7567301 | 1 | 63 | | 50 | urd | Urdu | 2008 | 0.997 | 0.948 | 0.971 | 1903 | 5 | 7567536 | 105 | 64 | | 51 | khm | Khmer | 1511 | 1.000 | 0.991 | 0.996 | 1498 | 0 | 7568038 | 13 | 65 | | 52 | ceb | Cebuano | 1478 | 0.617 | 0.551 | 0.493 | 815 | 506 | 7567565 | 663 | 66 | | 53 | cym | Welsh | 1344 | 0.968 | 0.845 | 0.890 | 1136 | 37 | 7568168 | 208 | 67 | | 54 | slv | Slovenian | 1093 | 0.739 | 0.550 | 0.568 | 601 | 212 | 7568244 | 492 | 68 | | 55 | gla | Scottish Gaelic | 1033 | 0.927 | 0.909 | 0.886 | 939 | 74 | 7568442 | 94 | 69 | | 56 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7568722 | 0 | 70 | | 57 | jav | Javanese | 615 | 0.806 | 0.610 | 0.641 | 375 | 90 | 7568844 | 240 | 71 | | 58 | ori | Odia | 374 | 1.000 | 1.000 | 1.000 | 374 | 0 | 7569175 | 0 | 72 | | 59 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7569215 | 0 | 73 | | 60 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7569295 | 0 | 74 | | 61 | lao | Lao | 219 | 1.000 | 1.000 | 1.000 | 219 | 0 | 7569330 | 0 | 75 | | 62 | mlt | Maltese | 208 | 0.243 | 0.803 | 0.236 | 167 | 521 | 7568820 | 41 | 76 | | 63 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7569353 | 0 | 77 | | 64 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7569373 | 0 | 78 | | 65 | guj | 
Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7569381 | 0 | 79 | | 66 | hat | Haitian Creole | 64 | 0.114 | 0.594 | 0.110 | 38 | 294 | 7569191 | 26 | 80 | | 67 | sin | Sinhala | 45 | 1.000 | 1.000 | 1.000 | 45 | 0 | 7569504 | 0 | 81 | | 68 | chr | Cherokee | 28 | 1.000 | 0.964 | 0.982 | 27 | 0 | 7569521 | 1 | 82 | | 69 | kin | Kinyarwanda | 28 | 0.050 | 0.679 | 0.049 | 19 | 361 | 7569160 | 9 | 83 | | 70 | div | Divehi | 26 | 1.000 | 1.000 | 1.000 | 26 | 0 | 7569523 | 0 | 84 | | 71 | lug | Ganda | 2 | 0.032 | 1.000 | 0.032 | 2 | 60 | 7569487 | 0 | -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/pycld2/mbp_m1_speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for pycld2 on tatoeba-sentences-2021-06-05 2 | 3 | ## Throughput 4 | 258366.48209271693 examples/s. 5 | 6 | ## Latency 7 | - Average: 0.0038704710916841827 ms/example 8 | - Standard deviation: 0.004183726893975661 9 | - Median: 0.004 ms/example 10 | - 90th percentile: 0.005 ms/example 11 | - 95th percentile: 0.005 ms/example 12 | - 99th percentile: 0.008 ms/example -------------------------------------------------------------------------------- /results/tatoeba-sentences-2021-06-05/results.md: -------------------------------------------------------------------------------- 1 | # Aggregated results for tatoeba-sentences-2021-06-05 2 | 3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics | 4 | 
|:--------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md#supported-languages) | 9,640,185 (87.64%) | [98.27%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md#metrics-per-language) | 6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md#supported-languages) | 9,640,185 (87.64%) | [96.81%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md#metrics-per-language) | 7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md#supported-languages) | 9,640,185 (85.70%) | 
[87.11%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md#metrics-per-language) | 8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md#supported-languages) | 9,640,185 (77.40%) | [92.45%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md#metrics-per-language) | 9 | | langid | [97](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md#supported-languages) | 9,640,185 (86.08%) | [89.00%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md#metrics-per-language) | 10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md#supported-languages) | 9,640,185 (78.52%) | [86.95%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md) | [See 
metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md#metrics-per-language) | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import time 2 | import tqdm 3 | import numpy as np 4 | import os 5 | import argparse 6 | import csv 7 | 8 | import datasets 9 | 10 | from benchmarks import BENCHMARKS 11 | 12 | 13 | def report_basic_timings(elapsed_in_fn, total_elapsed): 14 | spent_in_fn = np.sum(elapsed_in_fn) 15 | overhead = total_elapsed - spent_in_fn 16 | overhead_pct = overhead / spent_in_fn * 100 17 | avg = np.mean(elapsed_in_fn) 18 | std = np.std(elapsed_in_fn) 19 | throughput = 1/avg * 10**9 20 | print(f"In fn: total_time={spent_in_fn/10**9}s avg={avg}ns stddev={std}ns throughput={throughput}/s") 21 | print(f"Benchmark: total_time={total_elapsed/10**9} overhead: {overhead_pct}%") 22 | 23 | 24 | def save_predictions(dst, results, original_dataset=None): 25 | with open(dst, mode='w') as fd: 26 | writer = csv.writer(fd, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 27 | for idx, lang, prob in zip(original_dataset.index, results['lang'], results['prob']): 28 | writer.writerow([idx, lang.decode('utf-8'), prob]) 29 | 30 | 31 | if __name__ == "__main__": 32 | # os.chdir(os.path.dirname(__file__)) 33 | 34 | parser = argparse.ArgumentParser(description='Run benchmark for given model.') 35 | parser.add_argument('benchmarks', nargs='+', choices=list(BENCHMARKS.keys())) 36 | parser.add_argument('--examples-lo', '-lo', type=int) 37 | parser.add_argument('--examples-hi', '-hi', type=int) 38 | parser.add_argument('--dataset', '-d', type=str, choices=datasets.names(), required=True) 39 | args = parser.parse_args() 40 | 41 | print(f'Loading dataset {args.dataset}...') 42 | dataset = datasets.get(args.dataset) 43 | 44 | for benchmark_name in 
args.benchmarks: 45 | benchmark = BENCHMARKS[benchmark_name] 46 | print() 47 | print(f'Loaded benchmark {benchmark_name}') 48 | 49 | supported_dataset = datasets.get_supported_dataset_subset(dataset, benchmark['supported_languages_alpha3']) 50 | lo = args.examples_lo or 0 51 | hi = args.examples_hi or len(supported_dataset) 52 | print(f'Benchmark supports {len(supported_dataset)}/{len(dataset)} ({100*len(supported_dataset)/len(dataset)}%) items') 53 | benchmark_dataset = supported_dataset[lo:hi] 54 | print(f'Getting the chosen slice of the dataset (lo={lo} hi={hi}). Size={len(benchmark_dataset)}') 55 | 56 | print(f'Running {benchmark_name}...') 57 | total_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC) 58 | elapsed = np.zeros((hi-lo,)) 59 | predictions = benchmark['run'](tqdm.tqdm(benchmark_dataset.text), elapsed) 60 | 61 | os.makedirs(f'results/{args.dataset}/{benchmark_name}/', exist_ok=True) 62 | 63 | total_elapsed = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - total_start_time 64 | report_basic_timings(elapsed_in_fn=elapsed, total_elapsed=total_elapsed) 65 | np.save(f'results/{args.dataset}/{benchmark_name}/times.npy', elapsed) 66 | 67 | save_predictions(f'results/{args.dataset}/{benchmark_name}/results.csv', predictions, original_dataset=benchmark_dataset) 68 | -------------------------------------------------------------------------------- /templates/classification_performance.md: -------------------------------------------------------------------------------- 1 | # Classification performance for {{benchmark_name}} on {{dataset_name}} 2 | 3 | - Dataset coverage (sentences in supported languages): {{dataset_len}} ({{dataset_supported_pct}}) 4 | - **Aggregated accuracy: {{accuracy}}** 5 | 6 |

Supported languages ({{supported_languages_count}})

7 | 8 | {{supported_languages_list_str}} 9 | 10 |

Stats per language

11 | 12 | {{stats_per_language}} 13 | -------------------------------------------------------------------------------- /templates/dataset_results.md: -------------------------------------------------------------------------------- 1 | # Aggregated results for {{dataset_name}} 2 | 3 | {{results_table}} 4 | -------------------------------------------------------------------------------- /templates/speed_performance.md: -------------------------------------------------------------------------------- 1 | # Speed performance for {{benchmark_name}} on {{dataset_name}} 2 | 3 | ## Throughput 4 | {{throughput}} examples/s. 5 | 6 | ## Latency 7 | - Average: {{latency_avg}} ms/example 8 | - Standard deviation: {{latency_std}} ms/example 9 | - Median: {{latency_p50}} ms/example 10 | - 90th percentile: {{latency_p90}} ms/example 11 | - 95th percentile: {{latency_p95}} ms/example 12 | - 99th percentile: {{latency_p99}} ms/example 13 | --------------------------------------------------------------------------------