├── templates
├── dataset_results.md
├── speed_performance.md
└── classification_performance.md
├── Dockerfile
├── requirements.txt
├── results
├── tatoeba-sentences-2021-06-05
│ ├── gcld3
│ │ ├── mbp_m1_speed_performance.md
│ │ ├── c5.xlarge_speed_performance.md
│ │ └── classification_performance.md
│ ├── langid
│ │ ├── mbp_m1_speed_performance.md
│ │ ├── c5.xlarge_speed_performance.md
│ │ └── classification_performance.md
│ ├── fasttext
│ │ ├── mbp_m1_speed_performance.md
│ │ └── c5.xlarge_speed_performance.md
│ ├── langdetect
│ │ ├── mbp_m1_speed_performance.md
│ │ ├── c5.xlarge_speed_performance.md
│ │ └── classification_performance.md
│ ├── pycld2
│ │ ├── mbp_m1_speed_performance.md
│ │ ├── c5.xlarge_speed_performance.md
│ │ └── classification_performance.md
│ ├── fasttext-compressed
│ │ ├── mbp_m1_speed_performance.md
│ │ └── c5.xlarge_speed_performance.md
│ └── results.md
├── open-subtitles-v2018-100k-per-lang
│ ├── results.md
│ ├── langdetect
│ │ └── classification_performance.md
│ ├── pycld2
│ │ └── classification_performance.md
│ ├── langid
│ │ └── classification_performance.md
│ ├── gcld3
│ │ └── classification_performance.md
│ ├── fasttext
│ │ └── classification_performance.md
│ └── fasttext-compressed
│ │ └── classification_performance.md
└── tatoeba-sentences-2021-06-05-common-48
│ ├── results.md
│ ├── langdetect
│ └── classification_performance.md
│ ├── pycld2
│ └── classification_performance.md
│ ├── langid
│ └── classification_performance.md
│ ├── gcld3
│ └── classification_performance.md
│ ├── fasttext
│ └── classification_performance.md
│ └── fasttext-compressed
│ └── classification_performance.md
├── datasets
├── WiLI-2018
│ └── download
├── tatoeba-sentences-2021-06-05
│ └── download
├── tatoeba-sentences-2021-06-05-common-48
│ ├── download
│ └── stats.md
└── open-subtitles-v2018-100k-per-lang
│ ├── stats.md
│ └── download
├── get_memory_usage.py
├── LICENSE
├── README.md
├── models
├── langid.py
├── langdetect.py
├── gcld3.py
├── fasttext.py
└── pycld2.py
├── analyze_dataset.py
├── benchmarks.py
├── datasets.py
├── .gitignore
├── run.py
└── analyze.py
/templates/dataset_results.md:
--------------------------------------------------------------------------------
1 | # Aggregated results for {{dataset_name}}
2 |
3 | {{results_table}}
4 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.8.9-buster

# System packages: the protobuf compiler (presumably needed to build gcld3 — confirm).
# `apt-get update` and `apt-get install` share one layer so a cached, stale
# package index can never be paired with a fresh install step; the index is
# removed afterwards because it is not needed at runtime.
RUN apt-get update \
    && apt-get install -y protobuf-compiler \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# other libraries
COPY requirements.txt ./
RUN pip install -r requirements.txt

COPY run.py ./
COPY datasets ./datasets
14 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Jinja2==3.0.1
2 | fasttext==0.9.2
3 | gcld3==3.0.13
4 | ipython
5 | langcodes==3.1.0
6 | langdetect==1.0.9
7 | langid==1.1.6
8 | language-data==1.0
9 | pandas==1.2.4
10 | psutil==5.8.0
11 | pycld2==0.41
12 | scikit-learn==0.24.2
13 | tabulate==0.8.9
14 | tqdm==4.61.0
15 | ipdb==0.13.9
16 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/gcld3/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for gcld3 on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 17494.196652851704 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.05716181313401386 ms/example
8 | - Standard deviation: 0.02535098112972992
9 | - Median: 0.052 ms/example
10 | - 90th percentile: 0.08 ms/example
11 | - 95th percentile: 0.095 ms/example
12 | - 99th percentile: 0.148 ms/example
--------------------------------------------------------------------------------
/templates/speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for {{benchmark_name}} on {{dataset_name}}
2 |
3 | ## Throughput
4 | {{throughput}} examples/s.
5 |
6 | ## Latency
7 | - Average: {{latency_avg}} ms/example
8 | - Standard deviation: {{latency_std}} ms/example
9 | - Median: {{latency_p50}} ms/example
10 | - 90th percentile: {{latency_p90}} ms/example
11 | - 95th percentile: {{latency_p95}} ms/example
12 | - 99th percentile: {{latency_p99}} ms/example
13 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langid/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for langid on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 1268.5839534203296 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.7882805054437436 ms/example
8 | - Standard deviation: 0.47807791658786386
9 | - Median: 0.718 ms/example
10 | - 90th percentile: 1.072 ms/example
11 | - 95th percentile: 1.202 ms/example
12 | - 99th percentile: 1.59 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/fasttext/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for fasttext on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 112223.08654995833 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.008910822458575225 ms/example
8 | - Standard deviation: 0.004349694022231488
9 | - Median: 0.008 ms/example
10 | - 90th percentile: 0.011 ms/example
11 | - 95th percentile: 0.013 ms/example
12 | - 99th percentile: 0.017 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langdetect/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for langdetect on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 234.35885994454733 examples/s.
5 |
6 | ## Latency
7 | - Average: 4.26696050764462 ms/example
8 | - Standard deviation: 7.405121715428491
9 | - Median: 2.649 ms/example
10 | - 90th percentile: 9.023 ms/example
11 | - 95th percentile: 13.456 ms/example
12 | - 99th percentile: 25.658 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/pycld2/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for pycld2 on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 258366.48209271693 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.0038704710916841827 ms/example
8 | - Standard deviation: 0.004183726893975661
9 | - Median: 0.004 ms/example
10 | - 90th percentile: 0.005 ms/example
11 | - 95th percentile: 0.005 ms/example
12 | - 99th percentile: 0.008 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/pycld2/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for pycld2 on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 208037.00199835165 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.004806837199124429 ms/example
8 | - Standard deviation: 0.00467348719666187
9 | - Median: 0.004474 ms/example
10 | - 90th percentile: 0.006108 ms/example
11 | - 95th percentile: 0.007004 ms/example
12 | - 99th percentile: 0.010547 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langid/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for langid on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 896.817073745205 examples/s.
5 |
6 | ## Latency
7 | - Average: 1.1150545961662974 ms/example
8 | - Standard deviation: 0.5163837711515092
9 | - Median: 1.060864 ms/example
10 | - 90th percentile: 1.34925 ms/example
11 | - 95th percentile: 1.472716 ms/example
12 | - 99th percentile: 1.7471153599999993 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/fasttext-compressed/mbp_m1_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for fasttext-compressed on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 93406.46830679392 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.010705896691388609 ms/example
8 | - Standard deviation: 0.006436589155683585
9 | - Median: 0.01 ms/example
10 | - 90th percentile: 0.014 ms/example
11 | - 95th percentile: 0.016 ms/example
12 | - 99th percentile: 0.023 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/gcld3/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for gcld3 on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 13371.689002229674 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.07478486822668805 ms/example
8 | - Standard deviation: 0.03576524584839307
9 | - Median: 0.068239 ms/example
10 | - 90th percentile: 0.104779 ms/example
11 | - 95th percentile: 0.123671 ms/example
12 | - 99th percentile: 0.20031665000000037 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/fasttext/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for fasttext on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 105253.30924161937 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.009500888924113552 ms/example
8 | - Standard deviation: 0.00581991143719852
9 | - Median: 0.008891 ms/example
10 | - 90th percentile: 0.013018 ms/example
11 | - 95th percentile: 0.014964 ms/example
12 | - 99th percentile: 0.021675540000000038 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/fasttext-compressed/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for fasttext-compressed on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 76041.91094156951 examples/s.
5 |
6 | ## Latency
7 | - Average: 0.01315064268661526 ms/example
8 | - Standard deviation: 0.009660729486966587
9 | - Median: 0.011923 ms/example
10 | - 90th percentile: 0.019079 ms/example
11 | - 95th percentile: 0.022503 ms/example
12 | - 99th percentile: 0.033758 ms/example
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langdetect/c5.xlarge_speed_performance.md:
--------------------------------------------------------------------------------
1 | # Speed performance for langdetect on tatoeba-sentences-2021-06-05
2 |
3 | ## Throughput
4 | 239.74773586805924 examples/s.
5 |
6 | ## Latency
7 | - Average: 4.171050860519207 ms/example
8 | - Standard deviation: 4.538692148545434
9 | - Median: 2.715283 ms/example
10 | - 90th percentile: 8.73063 ms/example
11 | - 95th percentile: 12.929049399999995 ms/example
12 | - 99th percentile: 24.198700639999984 ms/example
--------------------------------------------------------------------------------
/templates/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for {{benchmark_name}} on {{dataset_name}}
2 |
3 | - Dataset coverage (sentences in supported languages): {{dataset_len}} ({{dataset_supported_pct}})
4 | - **Aggregated accuracy: {{accuracy}}**
5 |
6 |
Supported languages ({{supported_languages_count}})
7 |
8 | {{supported_languages_list_str}}
9 |
10 | Stats per language
11 |
12 | {{stats_per_language}}
13 |
--------------------------------------------------------------------------------
/datasets/WiLI-2018/download:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Downloads the WiLI-2018 archive next to this script and extracts it.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so paths with spaces work; abort instead of downloading into the wrong directory.
cd "$SCRIPT_DIR" || exit 1

FILENAME="wili-2018.zip"
if [[ -f "$FILENAME" ]]; then
  echo "Dataset already downloaded"
else
  echo "Downloading the dataset..."
  # -f: treat HTTP errors as failures, -L: follow redirects.
  # The URL is quoted so the shell never treats '?' as a glob character.
  if ! curl -fL -o "$FILENAME" "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"; then
    # Remove the partial file, otherwise the next run would report
    # "Dataset already downloaded" for a broken archive.
    rm -f "$FILENAME"
    echo "Download failed" >&2
    exit 1
  fi
fi

if [[ ! -f "labels.csv" ]]; then
  echo "Extracting the dataset..."
  unzip "$FILENAME"
fi
18 |
--------------------------------------------------------------------------------
/get_memory_usage.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from benchmarks import BENCHMARKS
3 |
4 |
# Number of bytes in one MB as used by this script (bytes/1024/1024).
MB = 1024 * 1024


if __name__ == "__main__":
    # Command-line entry point: measure the memory growth of each requested
    # benchmark and print a {benchmark_name: megabytes} dict.
    parser = argparse.ArgumentParser(description='Calculates memory usage for loading the model and running one inference request.')
    # Restrict the names to the registered benchmarks so that a typo produces
    # a usage error instead of a KeyError traceback.
    parser.add_argument(
        'benchmarks',
        nargs='+',
        choices=sorted(BENCHMARKS),
        metavar='benchmark',
        help="one or more of: " + ", ".join(sorted(BENCHMARKS)),
    )

    args = parser.parse_args()

    mem_usage = {}
    for benchmark_name in args.benchmarks:
        mem_usage[benchmark_name] = BENCHMARKS[benchmark_name]['measure_memory']() / MB
    print(mem_usage)
18 |
--------------------------------------------------------------------------------
/datasets/tatoeba-sentences-2021-06-05/download:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Downloads the tatoeba-sentences-2021-06-05 archive next to this script and extracts it.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so paths with spaces work; abort instead of downloading into the wrong directory.
cd "$SCRIPT_DIR" || exit 1

FILENAME="tatoeba-sentences-2021-06-05.tar.bz2"
if [[ -f "$FILENAME" ]]; then
  echo "Dataset already downloaded"
else
  echo "Downloading the dataset..."
  if ! wget "https://modelpredict.s3.amazonaws.com/datasets/${FILENAME}"; then
    # Remove any partial file, otherwise the next run would report
    # "Dataset already downloaded" for a broken archive.
    rm -f "$FILENAME"
    echo "Download failed" >&2
    exit 1
  fi
fi

if [[ ! -f "sentences.csv" ]]; then
  echo "Extracting the dataset..."
  tar xvfj "$FILENAME"
fi
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/datasets/tatoeba-sentences-2021-06-05-common-48/download:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Downloads the tatoeba-sentences-2021-06-05-common-48 archive next to this script and extracts it.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so paths with spaces work; abort instead of downloading into the wrong directory.
cd "$SCRIPT_DIR" || exit 1

FILENAME="tatoeba-sentences-2021-06-05-common-48.tar.bz2"
if [[ -f "$FILENAME" ]]; then
  echo "Dataset already downloaded"
else
  echo "Downloading the dataset..."
  if ! wget "https://modelpredict.s3.amazonaws.com/datasets/${FILENAME}"; then
    # Remove any partial file, otherwise the next run would report
    # "Dataset already downloaded" for a broken archive.
    rm -f "$FILENAME"
    echo "Download failed" >&2
    exit 1
  fi
fi

if [[ ! -f "sentences.csv" ]]; then
  echo "Extracting the dataset..."
  tar xvfj "$FILENAME"
fi
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 modelpredict
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # language-identification-survey
2 | Live survey of off-the-shelf language identification tools for Python
3 |
4 | ## Reproducing benchmark
5 |
6 | ### 1. Download the dataset
7 | ```bash
8 | ./datasets/tatoeba-sentences-2021-06-05/download
9 | ```
10 |
11 | ### 2. Run the language inference for benchmarks
12 |
13 | Available benchmarks:
14 | - fasttext
15 | - fasttext-compressed
16 | - gcld3
17 | - langdetect
18 | - langid
19 | - pycld2
20 |
21 | Available datasets:
22 | - tatoeba-sentences-2021-06-05
23 | - tatoeba-sentences-2021-06-05-common-48
24 | - open-subtitles-v2018-100k-per-lang
25 |
26 | On the host machine:
27 | ```bash
28 | python run.py
29 | ```
30 |
31 | In docker:
32 | ```bash
33 | docker build -t bench .
34 | docker run -v `pwd`:/src -t -i bench python /src/run.py
35 | ```
36 |
37 | ### 3. Run analysis
38 | ```bash
39 | python analyze.py --correctness
40 | python analyze.py --timings
41 | ```
42 |
43 | ### 4. Get memory usage for different models
44 | ```bash
45 | python get_memory_usage.py <benchmark> [<benchmark> ...]
46 | # e.g. python get_memory_usage.py fasttext
47 | # e.g. python get_memory_usage.py fasttext-compressed
48 | ```
49 |
50 | It will print memory usage in MB (bytes/1024/1024).
51 |
--------------------------------------------------------------------------------
/models/langid.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import psutil
4 | import os
5 |
6 |
7 | SUPPORTED_LANGUAGES = "af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu".split(", ")
8 |
9 |
def measure_memory():
    """Return how many bytes the process RSS grew by importing langid and classifying one sentence."""
    process = psutil.Process(os.getpid())
    rss_at_start = process.memory_info().rss

    # Imported only after the baseline reading so the import is part of the measurement.
    import langid
    langid.classify("hello darkness my ol' fren")

    rss_at_end = process.memory_info().rss
    return rss_at_end - rss_at_start
16 |
17 |
def run(dataset, elapsed):
    """Classify every text in ``dataset`` with langid, timing each call.

    Args:
        dataset: sized iterable of texts.
        elapsed: indexable buffer; ``elapsed[i]`` receives the duration of the
            i-th ``langid.classify`` call in nanoseconds.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 10) and ``prob``
        (float64 array) holding the first and second element of each
        langid result.
    """
    import langid
    lang = np.chararray(len(dataset), itemsize=10)
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype and works on every version.
    prob = np.zeros((len(dataset),), dtype=np.float64)

    for i, text in enumerate(dataset):
        # Only the classify call itself is timed.
        iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
        result = langid.classify(text)
        elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

        lang[i] = result[0]
        prob[i] = result[1]

    return dict(lang=lang, prob=prob)
32 |
--------------------------------------------------------------------------------
/models/langdetect.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import psutil
4 | import os
5 |
6 |
7 | SUPPORTED_LANGUAGES = ("af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he, " + \
8 | "hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl, " + \
9 | "pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw").split(", ")
10 |
11 |
def measure_memory():
    """Return the RSS growth (bytes) caused by importing langdetect and running one detection."""
    pid = os.getpid()
    proc = psutil.Process(pid)
    baseline = proc.memory_info().rss

    # The import happens after the baseline reading on purpose: it is part of what is measured.
    import langdetect
    langdetect.detect_langs("hello darkness my ol' fren")

    return proc.memory_info().rss - baseline
18 |
19 |
def run(dataset, elapsed):
    """Detect the language of every text in ``dataset`` with langdetect, timing each call.

    Args:
        dataset: sized iterable of texts.
        elapsed: indexable buffer; ``elapsed[i]`` receives the duration of the
            i-th ``detect_langs`` call in nanoseconds. It is left untouched
            for texts on which langdetect raises.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 10) and ``prob``
        (float64 array). Texts without a result get ``'n/a'`` and NaN.
    """
    import langdetect
    lang = np.chararray(len(dataset), itemsize=10)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    prob = np.zeros((len(dataset),), dtype=np.float64)

    for i, text in enumerate(dataset):
        try:
            iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
            result = langdetect.detect_langs(text)
            elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time
        except Exception:
            # Best effort: a detection failure marks the text as 'n/a'.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still stop a long benchmark run.
            result = None

        if result:
            # The first entry of the returned list is the one recorded.
            lang[i] = result[0].lang
            prob[i] = result[0].prob
        else:
            lang[i] = 'n/a'
            prob[i] = float('nan')

    return dict(lang=lang, prob=prob)
41 |
--------------------------------------------------------------------------------
/models/gcld3.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import psutil
4 | import time
5 |
6 | # https://github.com/google/cld3
7 |
8 | SUPPORTED_LANGUAGES = "af am ar bg bg-Latn bn bs ca ceb co cs cy da de el el-Latn en eo es et eu fa fi fil fr fy ga gd gl gu ha haw hi hi-Latn hmn hr ht hu hy id ig is it iw ja ja-Latn jv ka kk km kn ko ku ky la lb lo lt lv mg mi mk ml mn mr ms mt my ne nl no ny pa pl ps pt ro ru ru-Latn sd si sk sl sm sn so sq sr st su sv sw ta te tg th tr uk ur uz vi xh yi yo zh zh-Latn zu".split(" ")
9 |
10 |
def measure_memory():
    """Return the RSS growth (bytes) caused by importing gcld3, building an identifier and running it once."""
    this_process = psutil.Process(os.getpid())
    rss_before = this_process.memory_info().rss

    # Import, model construction and one prediction all count towards the measurement.
    import gcld3
    identifier = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=3000)
    identifier.FindLanguage(text="hello darkness my ol' fren")

    rss_after = this_process.memory_info().rss
    return rss_after - rss_before
18 |
19 |
def run(dataset, elapsed):
    """Identify the language of every text in ``dataset`` with gcld3, timing each call.

    Args:
        dataset: sized iterable of texts.
        elapsed: indexable buffer; ``elapsed[i]`` receives the duration of the
            i-th ``FindLanguage`` call in nanoseconds.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 10) and ``prob``
        (float64 array) taken from each result's ``language`` and
        ``probability`` attributes.
    """
    import gcld3
    # The identifier is built once, outside the timed section.
    detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=3000)

    lang = np.chararray(len(dataset), itemsize=10)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    prob = np.zeros((len(dataset),), dtype=np.float64)
    for i, text in enumerate(dataset):
        iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
        result = detector.FindLanguage(text=text)
        elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

        lang[i] = result.language
        prob[i] = result.probability

    return dict(lang=lang, prob=prob)
35 |
--------------------------------------------------------------------------------
/analyze_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import datasets
5 | from langcodes import Language
6 |
7 |
def get_language_name(alpha3):
    """Return the display name of the language identified by ``alpha3``.

    Falls back to ``"--"`` (after printing a notice) when the name cannot be
    resolved.
    """
    try:
        return Language.get(alpha3).display_name()
    except Exception:
        # Best-effort lookup. Exception (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit from being swallowed.
        print(f"Failed to get name for language '{alpha3}'")
        return "--"
14 |
15 |
def get_stats_table(df):
    """Build a per-language statistics table for a dataset frame.

    ``df`` must provide ``text`` and ``alpha3`` columns and is not modified.
    The result has one row per language, ordered by sentence count
    (descending) and numbered from 1, with the columns ``alpha3``,
    ``language``, ``sentences``, ``dataset_percentage`` and ``mean_len``.
    """
    working = df.copy()
    working['text_len'] = working['text'].str.len()

    # sentence count and mean text length for every language
    per_language = (
        working.groupby('alpha3')['text_len']
        .agg(sentences='count', mean_len='mean')
        .reset_index()
    )

    # share of the whole dataset, rendered as e.g. "12.34%"
    share = per_language['sentences'] / per_language['sentences'].sum() * 100
    per_language['dataset_percentage'] = share.apply("{:.2f}%".format)

    # biggest languages first, rows renumbered starting at 1
    per_language = per_language.sort_values(['sentences'], ascending=False).reset_index(drop=True)
    per_language.index = per_language.index + 1

    # human-readable language name
    per_language['language'] = per_language['alpha3'].apply(get_language_name)

    column_order = ['alpha3', 'language', 'sentences', 'dataset_percentage', 'mean_len']
    return per_language[column_order]
31 |
32 |
if __name__ == "__main__":
    # Command-line entry point: compute per-language statistics for one
    # dataset and store them as a markdown table in datasets/<dataset>/stats.md.
    # The description now states what this script does (it previously carried
    # the text of a different script).
    parser = argparse.ArgumentParser(description='Write per-language statistics (stats.md) for a dataset.')
    parser.add_argument('--dataset', '-d', type=str, choices=datasets.names(), required=True)
    args = parser.parse_args()

    print(f"Dumping stats for dataset {args.dataset}")

    ds = datasets.get(args.dataset)
    stats_df = get_stats_table(ds)

    # Explicit UTF-8: language display names may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open(os.path.join('datasets', args.dataset, 'stats.md'), 'w', encoding='utf-8') as fd:
        stats_df.to_markdown(fd, index=True)
45 |
--------------------------------------------------------------------------------
/benchmarks.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | from langcodes import Language
4 | from models import gcld3, langid, langdetect, pycld2, fasttext
5 |
6 |
def _alpha3_codes(languages):
    """Convert every language tag in ``languages`` to its alpha-3 code via langcodes."""
    return [Language.get(tag).to_alpha3() for tag in languages]


# Registry of benchmarks, keyed by benchmark name. Every entry provides:
#   'run'                        - callable that runs the model over a dataset
#   'measure_memory'             - callable returning the memory growth in bytes
#   'supported_languages_alpha3' - alpha-3 codes of the languages the model supports
# Both fasttext variants share one implementation and differ only in the model file.
BENCHMARKS = {
    'fasttext': {
        'run': partial(fasttext.run, model_path=fasttext.MODEL_BIN),
        'measure_memory': partial(fasttext.measure_memory, model_path=fasttext.MODEL_BIN),
        'supported_languages_alpha3': _alpha3_codes(fasttext.SUPPORTED_LANGUAGES),
    },
    'fasttext-compressed': {
        'run': partial(fasttext.run, model_path=fasttext.MODEL_COMPRESSED),
        'measure_memory': partial(fasttext.measure_memory, model_path=fasttext.MODEL_COMPRESSED),
        'supported_languages_alpha3': _alpha3_codes(fasttext.SUPPORTED_LANGUAGES),
    },
    'gcld3': {
        'run': gcld3.run,
        'measure_memory': gcld3.measure_memory,
        'supported_languages_alpha3': _alpha3_codes(gcld3.SUPPORTED_LANGUAGES),
    },
    'langdetect': {
        'run': langdetect.run,
        'measure_memory': langdetect.measure_memory,
        'supported_languages_alpha3': _alpha3_codes(langdetect.SUPPORTED_LANGUAGES),
    },
    'langid': {
        'run': langid.run,
        'measure_memory': langid.measure_memory,
        'supported_languages_alpha3': _alpha3_codes(langid.SUPPORTED_LANGUAGES),
    },
    'pycld2': {
        'run': pycld2.run,
        'measure_memory': pycld2.measure_memory,
        'supported_languages_alpha3': _alpha3_codes(pycld2.SUPPORTED_LANGUAGES),
    },
}
39 |
40 |
def common_languages():
    """Return the set of alpha-3 codes supported by every registered benchmark."""
    per_benchmark = [entry['supported_languages_alpha3'] for entry in BENCHMARKS.values()]
    # Seeded with the fasttext list, then narrowed by every benchmark's list at once.
    seed = set(BENCHMARKS['fasttext']['supported_languages_alpha3'])
    return seed.intersection(*per_benchmark)
46 |
--------------------------------------------------------------------------------
/models/fasttext.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 | import psutil
5 |
6 |
7 | MODEL_BIN = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
8 | MODEL_COMPRESSED = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
9 |
10 |
11 | SUPPORTED_LANGUAGES = "af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh".split(" ")
12 |
13 |
def measure_memory(model_path):
    """Return the RSS growth (bytes) caused by importing fasttext, fetching and loading the model at ``model_path`` and predicting once."""
    process = psutil.Process(os.getpid())
    rss_before = process.memory_info().rss

    # Everything between the two RSS readings is part of the measurement.
    import fasttext
    download_model(model_path)
    classifier = fasttext.load_model('/tmp/fasttext.model')
    classifier.predict("hello darkness my ol' fren")

    rss_after = process.memory_info().rss
    return rss_after - rss_before
22 |
23 |
24 |
def run(dataset, elapsed, model_path):
    """Predict the language of every text in ``dataset`` with fastText, timing each call.

    Args:
        dataset: sized iterable of texts.
        elapsed: indexable buffer; ``elapsed[i]`` receives the duration of the
            i-th ``predict`` call in nanoseconds.
        model_path: location handed to ``download_model``; the model is then
            loaded from /tmp/fasttext.model.

    Returns:
        dict with ``lang`` (``np.chararray``, itemsize 15, holding the label
        exactly as returned by fastText) and ``prob`` (float64 array).
    """
    import fasttext

    lang = np.chararray(len(dataset), itemsize=15)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    prob = np.zeros((len(dataset),), dtype=np.float64)

    download_model(model_path)
    model = fasttext.load_model('/tmp/fasttext.model')

    for i, text in enumerate(dataset):
        # For some reason fasttext likes one line at a time.
        text = text.replace('\n', ' ')

        # Only the predict call is timed; newline replacement is excluded.
        iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
        result = model.predict(text)
        elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

        # Exactly one label is expected per prediction.
        assert len(result[0]) == 1
        lang[i] = result[0][0]
        prob[i] = result[1][0]

    return dict(lang=lang, prob=prob)
47 |
48 |
def download_model(path):
    """Download the model at URL ``path`` to /tmp/fasttext.model using wget.

    Raises:
        subprocess.CalledProcessError: if wget exits with a non-zero status.
        FileNotFoundError: if the wget executable is not available.
    """
    import subprocess

    # An argument list (no shell) means the URL is never interpreted by a
    # shell, and check=True surfaces a failed download here instead of
    # letting the caller load a missing or truncated model file.
    subprocess.run(["wget", "-O", "/tmp/fasttext.model", path], check=True)
51 |
--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
1 | import re
2 | from argparse import ArgumentError
3 | from glob import glob
4 |
5 | import pandas as pd
6 | from langcodes import Language
7 |
8 |
9 | __DATASETS = {}
10 |
11 |
def get_alpha3(lang):
    """Return the alpha-3 code for language tag ``lang``, or None when the conversion fails."""
    language = Language.get(lang)
    try:
        return language.to_alpha3()
    except Exception:
        # No alpha-3 mapping available. Exception (rather than a bare except)
        # keeps KeyboardInterrupt and SystemExit from being swallowed.
        return None
18 |
19 |
def dataset(load_fn):
    """Decorator that registers ``load_fn`` in the dataset registry under its function name and returns it unchanged."""
    registry_key = load_fn.__name__
    __DATASETS.update({registry_key: load_fn})
    return load_fn
23 |
24 |
def get(name):
    """Load the dataset registered under ``name``.

    Args:
        name: dataset name; dashes are treated as underscores, so the
            dash-separated names returned by ``names()`` are accepted.

    Returns:
        Whatever the registered loader function returns.

    Raises:
        ArgumentError: if no dataset is registered under that name.
    """
    key = name.replace('-', '_')
    if key in __DATASETS:
        return __DATASETS[key]()
    # argparse.ArgumentError takes (argument, message); calling it with a
    # single argument raised TypeError instead of the intended error.
    raise ArgumentError(None, f"Unknown dataset {name}")
30 |
31 |
def names():
    """Return the registered dataset names with underscores shown as dashes."""
    dashed = []
    for registered_name in __DATASETS:
        dashed.append(registered_name.replace('_', '-'))
    return dashed
34 |
35 |
@dataset
def tatoeba_sentences_2021_06_05():
    """Load the tab-separated tatoeba-sentences-2021-06-05 file and add an ``alpha3`` category column."""
    frame = pd.read_csv(
        'datasets/tatoeba-sentences-2021-06-05/sentences.csv',
        sep='\t',
        index_col=0,
        names=['language', 'text'],
        dtype={'language': 'category'},
    )
    # Language tags converted with get_alpha3 (None where no code exists).
    alpha3_codes = frame['language'].apply(get_alpha3)
    frame['alpha3'] = alpha3_codes.astype("category")
    return frame
42 |
43 |
@dataset
def tatoeba_sentences_2021_06_05_common_48():
    """Load the tatoeba-sentences-2021-06-05-common-48 file (default separator) and add an ``alpha3`` category column."""
    frame = pd.read_csv(
        'datasets/tatoeba-sentences-2021-06-05-common-48/sentences.csv',
        index_col=0,
        names=['language', 'text'],
        dtype={'language': 'category'},
    )
    # Language tags converted with get_alpha3 (None where no code exists).
    alpha3_codes = frame['language'].apply(get_alpha3)
    frame['alpha3'] = alpha3_codes.astype("category")
    return frame
50 |
51 |
@dataset
def open_subtitles_v2018_100k_per_lang():
    """Load every ``*.txt`` file of the open-subtitles-v2018-100k-per-lang dataset into one frame.

    Each line of a file becomes one row (line endings are kept as read).
    The language is the path component directly before the final extension.

    Returns:
        DataFrame with ``text``, ``language`` (category) and ``alpha3``
        (category) columns and a fresh 0..n-1 index.
    """
    dataset_files = 'datasets/open-subtitles-v2018-100k-per-lang/*.txt'
    dfs = []
    for f in glob(dataset_files):
        # Context manager so every file handle is closed as soon as it is read.
        with open(f, encoding='utf-8') as fd:
            sentences = fd.readlines()
        # Raw string: the previous '[\./]' relied on an invalid escape sequence.
        language = re.split(r'[./]', f)[-2]
        data = dict(
            text=sentences,
            language=language,
        )
        dfs.append(pd.DataFrame(data=data))

    big_df = pd.concat(dfs).reset_index().drop('index', axis=1)
    big_df['language'] = big_df['language'].astype("category")
    big_df['alpha3'] = big_df['language'].apply(get_alpha3).astype("category")
    return big_df
69 |
70 |
def get_supported_dataset_subset(dataset, supported_languages):
    """Return the rows of ``dataset`` whose ``alpha3`` value is contained in ``supported_languages``."""
    is_supported = dataset['alpha3'].isin(supported_languages)
    return dataset[is_supported]
73 |
--------------------------------------------------------------------------------
/models/pycld2.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import psutil
4 | import os
5 |
6 |
def get_supported_languages():
    """Return the pycld2 language codes for the languages named in the CLD2 project list below."""
    from pycld2 import LANGUAGES
    # from https://github.com/CLD2Owners/cld2
    documented_names = """
    Afrikaans Albanian Arabic Armenian Azerbaijani Basque Belarusian Bengali Bihari Bulgarian Catalan Cebuano Cherokee Croatian Czech Chinese Chinese_T Danish Dhivehi Dutch English Estonian Finnish French Galician Ganda Georgian German Greek Gujarati Haitian_Creole Hebrew Hindi Hmong Hungarian Icelandic Indonesian Inuktitut Irish Italian Javanese Japanese Kannada Khmer Kinyarwanda Korean Laothian Latvian Limbu Lithuanian Macedonian Malay Malayalam Maltese Marathi Nepali Norwegian Oriya Persian Polish Portuguese Punjabi Romanian Russian Scots_Gaelic Serbian Sinhalese Slovak Slovenian Spanish Swahili Swedish Syriac Tagalog Tamil Telugu Thai Turkish Ukrainian Urdu Vietnamese Welsh Yiddish
    """.upper().strip().split(" ")

    # "CHINESE_T" is looked up as "CHINESET"; the renamed entry goes to the end of the list.
    documented_names.remove("CHINESE_T")
    documented_names.append("CHINESET")

    # upper-cased language name -> code, built from pycld2's own table
    code_by_name = {}
    for language_name, language_code in LANGUAGES:
        code_by_name[language_name.upper()] = language_code

    return list(map(code_by_name.__getitem__, documented_names))
17 |
18 |
19 | SUPPORTED_LANGUAGES = get_supported_languages()
20 |
21 |
def measure_memory():
    """Return the RSS growth (bytes) caused by importing pycld2 and running one detection."""
    current = psutil.Process(os.getpid())
    rss_start = current.memory_info().rss

    # Import placed after the baseline reading so that it is included in the measurement.
    import pycld2
    pycld2.detect("hello darkness my ol' fren")

    rss_end = current.memory_info().rss
    return rss_end - rss_start
28 |
29 |
def run(dataset, elapsed):
    """Run pycld2 language detection over ``dataset`` and time every call.

    Args:
        dataset: sized iterable of texts (``len()`` and iteration are used).
        elapsed: writable array indexed by position; ``elapsed[i]`` receives
            the duration in nanoseconds of the ``detect`` call for text ``i``.
            Entries for texts on which pycld2 raises are left untouched.

    Returns:
        dict with two arrays of ``len(dataset)`` entries:
            ``lang``: fixed-width (10 byte) character array of detected codes.
            ``prob``: float64 array, always NaN (no probability is recorded).
    """
    import pycld2

    n_texts = len(dataset)
    lang = np.chararray(n_texts, itemsize=10)
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype and works on every version.
    prob = np.zeros((n_texts,), dtype=np.float64)

    errored = 0

    for i, text in enumerate(dataset):
        try:
            # Time only the detect() call itself, not the bookkeeping around it.
            iter_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
            result = pycld2.detect(text)
            elapsed[i] = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - iter_start_time

            # Code of the first entry in the details tuple of the result
            # (presumably the top-ranked language - see pycld2's README).
            lang_label = result[2][0][1]
            # 'un' is treated as "no prediction".
            # NOTE(review): assigning None into a byte-string array appears to
            # store its text form ('None') rather than a missing value - confirm
            # that downstream consumers expect this.
            lang[i] = lang_label if lang_label != 'un' else None
            prob[i] = float('nan')
        except pycld2.error:
            # Unfortunately, pycld2 errors on "invalid utf-8" sequence for some texts,
            # even though python successfully loads, encodes and decodes them as utf-8.
            errored += 1
            lang[i] = None
            prob[i] = float('nan')

    print(f"pycld2 errored on {errored} texts")

    return dict(lang=lang, prob=prob)
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dataset files
2 | datasets/**/**.bz2
3 | datasets/**/**.gzip
4 | datasets/**/**.csv
5 | datasets/**/**.txt
6 | results/**/**.npy
7 | results/**/results.csv
8 |
9 | .DS_Store
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | pip-wheel-metadata/
34 | share/python-wheels/
35 | *.egg-info/
36 | .installed.cfg
37 | *.egg
38 | MANIFEST
39 |
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 |
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 |
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .nox/
54 | .coverage
55 | .coverage.*
56 | .cache
57 | nosetests.xml
58 | coverage.xml
59 | *.cover
60 | *.py,cover
61 | .hypothesis/
62 | .pytest_cache/
63 |
64 | # Translations
65 | *.mo
66 | *.pot
67 |
68 | # Django stuff:
69 | *.log
70 | local_settings.py
71 | db.sqlite3
72 | db.sqlite3-journal
73 |
74 | # Flask stuff:
75 | instance/
76 | .webassets-cache
77 |
78 | # Scrapy stuff:
79 | .scrapy
80 |
81 | # Sphinx documentation
82 | docs/_build/
83 |
84 | # PyBuilder
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | .python-version
96 |
97 | # pipenv
98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | # install all needed dependencies.
102 | #Pipfile.lock
103 |
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 |
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 |
111 | # SageMath parsed files
112 | *.sage.py
113 |
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 |
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 |
127 | # Rope project settings
128 | .ropeproject
129 |
130 | # mkdocs documentation
131 | /site
132 |
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 |
138 | # Pyre type checker
139 | .pyre/
140 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import time
2 | import tqdm
3 | import numpy as np
4 | import os
5 | import argparse
6 | import csv
7 |
8 | import datasets
9 |
10 | from benchmarks import BENCHMARKS
11 |
12 |
def report_basic_timings(elapsed_in_fn, total_elapsed):
    """Print a two-line timing summary for one benchmark run.

    Args:
        elapsed_in_fn: array of per-item durations in nanoseconds.
        total_elapsed: wall-clock duration of the whole run in nanoseconds.

    The first line reports the time spent inside the measured calls (total,
    mean, standard deviation, items per second); the second reports the total
    run time and the overhead relative to the time spent in the calls.
    """
    in_fn_total = np.sum(elapsed_in_fn)
    mean_ns = np.mean(elapsed_in_fn)
    stddev_ns = np.std(elapsed_in_fn)
    per_second = 1/mean_ns * 10**9

    # Everything not spent inside the measured calls counts as overhead.
    extra = total_elapsed - in_fn_total
    extra_pct = extra / in_fn_total * 100

    print(f"In fn: total_time={in_fn_total/10**9}s avg={mean_ns}ns stddev={stddev_ns}ns throughput={per_second}/s")
    print(f"Benchmark: total_time={total_elapsed/10**9} overhead: {extra_pct}%")
22 |
23 |
def save_predictions(dst, results, original_dataset=None):
    """Write one CSV row ``index,lang,prob`` per prediction to ``dst``.

    Args:
        dst: path of the CSV file to (over)write.
        results: mapping with ``'lang'`` (iterable of bytes-like values that
            support ``.decode``) and ``'prob'`` (iterable of numbers).
        original_dataset: object whose ``.index`` supplies the row labels.
            When omitted, rows are labelled 0..n-1 instead of failing on the
            ``None`` default.
    """
    langs = results['lang']
    if original_dataset is None:
        row_ids = range(len(langs))
    else:
        row_ids = original_dataset.index

    # newline='' is what the csv module requires for files it writes to;
    # without it the writer's '\r\n' terminator becomes '\r\r\n' on Windows.
    with open(dst, mode='w', newline='') as fd:
        writer = csv.writer(fd, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for idx, lang, prob in zip(row_ids, langs, results['prob']):
            writer.writerow([idx, lang.decode('utf-8'), prob])
29 |
30 |
if __name__ == "__main__":
    # Command-line entry point: load one dataset, then for every requested
    # benchmark run it on the slice of the dataset it supports, print timing
    # statistics and store the timings and predictions under
    # results/<dataset>/<benchmark>/.

    # os.chdir(os.path.dirname(__file__))

    parser = argparse.ArgumentParser(description='Run benchmark for given model.')
    parser.add_argument('benchmarks', nargs='+', choices=list(BENCHMARKS.keys()))
    parser.add_argument('--examples-lo', '-lo', type=int)
    parser.add_argument('--examples-hi', '-hi', type=int)
    parser.add_argument('--dataset', '-d', type=str, choices=datasets.names(), required=True)
    args = parser.parse_args()

    print(f'Loading dataset {args.dataset}...')
    dataset = datasets.get(args.dataset)

    for benchmark_name in args.benchmarks:
        benchmark = BENCHMARKS[benchmark_name]
        print()
        print(f'Loaded benchmark {benchmark_name}')

        # Only rows in languages the model supports are benchmarked.
        supported_dataset = datasets.get_supported_dataset_subset(dataset, benchmark['supported_languages_alpha3'])
        lo = args.examples_lo or 0
        hi = args.examples_hi or len(supported_dataset)
        print(f'Benchmark supports {len(supported_dataset)}/{len(dataset)} ({100*len(supported_dataset)/len(dataset)}%) items')
        benchmark_dataset = supported_dataset[lo:hi]
        print(f'Getting the chosen slice of the dataset (lo={lo} hi={hi}). Size={len(benchmark_dataset)}')

        # Create the output directory before the timer starts so filesystem
        # work is not counted as benchmark overhead (and a bad path fails fast).
        os.makedirs(f'results/{args.dataset}/{benchmark_name}/', exist_ok=True)

        print(f'Running {benchmark_name}...')
        total_start_time = time.clock_gettime_ns(time.CLOCK_MONOTONIC)
        # Size the timing buffer by the rows actually benchmarked. Using hi-lo
        # left trailing zeros whenever hi exceeded the subset length (skewing
        # the mean/throughput and times.npy) and raised when lo > hi.
        elapsed = np.zeros((len(benchmark_dataset),))
        predictions = benchmark['run'](tqdm.tqdm(benchmark_dataset.text), elapsed)
        total_elapsed = time.clock_gettime_ns(time.CLOCK_MONOTONIC) - total_start_time

        report_basic_timings(elapsed_in_fn=elapsed, total_elapsed=total_elapsed)
        np.save(f'results/{args.dataset}/{benchmark_name}/times.npy', elapsed)

        save_predictions(f'results/{args.dataset}/{benchmark_name}/results.csv', predictions, original_dataset=benchmark_dataset)
68 |
--------------------------------------------------------------------------------
/datasets/open-subtitles-v2018-100k-per-lang/stats.md:
--------------------------------------------------------------------------------
1 | | | alpha3 | language | sentences | dataset_percentage | mean_len |
2 | |---:|:---------|:-----------------|------------:|:---------------------|-----------:|
3 | | 1 | ara | Arabic | 100000 | 2.36% | 25.5612 |
4 | | 2 | kor | Korean | 100000 | 2.36% | 15.2265 |
5 | | 3 | lav | Latvian | 100000 | 2.36% | 28.2011 |
6 | | 4 | mkd | Macedonian | 100000 | 2.36% | 26.6284 |
7 | | 5 | mal | Malayalam | 100000 | 2.36% | 31.8701 |
8 | | 6 | nld | Dutch | 100000 | 2.36% | 31.3679 |
9 | | 7 | nob | Norwegian Bokmål | 100000 | 2.36% | 28.3095 |
10 | | 8 | pol | Polish | 100000 | 2.36% | 28.6661 |
11 | | 9 | por | Portuguese | 100000 | 2.36% | 31.2119 |
12 | | 10 | ron | Romanian | 100000 | 2.36% | 30.8544 |
13 | | 11 | rus | Russian | 100000 | 2.36% | 28.5364 |
14 | | 12 | slk | Slovak | 100000 | 2.36% | 28.1906 |
15 | | 13 | slv | Slovenian | 100000 | 2.36% | 27.4234 |
16 | | 14 | sqi | Albanian | 100000 | 2.36% | 28.1297 |
17 | | 15 | swe | Swedish | 100000 | 2.36% | 29.2429 |
18 | | 16 | tha | Thai | 100000 | 2.36% | 25.2803 |
19 | | 17 | tur | Turkish | 100000 | 2.36% | 30.4281 |
20 | | 18 | ukr | Ukrainian | 100000 | 2.36% | 26.5965 |
21 | | 19 | vie | Vietnamese | 100000 | 2.36% | 28.6742 |
22 | | 20 | bul | Bulgarian | 100000 | 2.36% | 28.3935 |
23 | | 21 | lit | Lithuanian | 100000 | 2.36% | 26.8903 |
24 | | 22 | jpn | Japanese | 100000 | 2.36% | 12.6401 |
25 | | 23 | est | Estonian | 100000 | 2.36% | 30.0332 |
26 | | 24 | ben | Bangla | 100000 | 2.36% | 24.8633 |
27 | | 25 | cat | Catalan | 100000 | 2.36% | 30.6194 |
28 | | 26 | ces | Czech | 100000 | 2.36% | 27.2082 |
29 | | 27 | dan | Danish | 100000 | 2.36% | 27.9852 |
30 | | 28 | deu | German | 100000 | 2.36% | 31.7739 |
31 | | 29 | ell | Greek | 100000 | 2.36% | 30.6192 |
32 | | 30 | ita | Italian | 100000 | 2.36% | 31.1652 |
33 | | 31 | spa | Spanish | 100000 | 2.36% | 32.271 |
34 | | 32 | eng | English | 100000 | 2.36% | 30.3055 |
35 | | 33 | fas | Persian | 100000 | 2.36% | 25.1484 |
36 | | 34 | fin | Finnish | 100000 | 2.36% | 29.2801 |
37 | | 35 | fra | French | 100000 | 2.36% | 30.8136 |
38 | | 36 | heb | Hebrew | 100000 | 2.36% | 25.2828 |
39 | | 37 | hin | Hindi | 100000 | 2.36% | 26.7333 |
40 | | 38 | hrv | Croatian | 100000 | 2.36% | 28.8623 |
41 | | 39 | hun | Hungarian | 100000 | 2.36% | 30.0959 |
42 | | 40 | ind | Indonesian | 100000 | 2.36% | 29.4894 |
43 | | 41 | zho | Chinese | 100000 | 2.36% | 12.5245 |
44 | | 42 | urd | Urdu | 46523 | 1.10% | 27.2766 |
45 | | 43 | tam | Tamil | 40165 | 0.95% | 29.4984 |
46 | | 44 | tel | Telugu | 30416 | 0.72% | 26.5348 |
47 | | 45 | fil | Filipino | 19314 | 0.46% | 31.8587 |
--------------------------------------------------------------------------------
/datasets/tatoeba-sentences-2021-06-05-common-48/stats.md:
--------------------------------------------------------------------------------
1 | | | alpha3 | language | sentences | dataset_percentage | mean_len |
2 | |---:|:---------|:-----------------|------------:|:---------------------|-----------:|
3 | | 1 | eng | English | 1479733 | 19.83% | 39.3277 |
4 | | 2 | rus | Russian | 849653 | 11.39% | 33.4655 |
5 | | 3 | ita | Italian | 787053 | 10.55% | 33.4897 |
6 | | 4 | tur | Turkish | 709573 | 9.51% | 34.7355 |
7 | | 5 | deu | German | 553727 | 7.42% | 47.4774 |
8 | | 6 | fra | French | 466192 | 6.25% | 41.3866 |
9 | | 7 | por | Portuguese | 385737 | 5.17% | 38.2929 |
10 | | 8 | spa | Spanish | 338781 | 4.54% | 38.8894 |
11 | | 9 | hun | Hungarian | 323048 | 4.33% | 34.0299 |
12 | | 10 | jpn | Japanese | 208761 | 2.80% | 18.2659 |
13 | | 11 | heb | Hebrew | 197226 | 2.64% | 25.5678 |
14 | | 12 | ukr | Ukrainian | 171674 | 2.30% | 27.8153 |
15 | | 13 | nld | Dutch | 144340 | 1.93% | 34.7853 |
16 | | 14 | fin | Finnish | 128011 | 1.72% | 35.7946 |
17 | | 15 | pol | Polish | 109662 | 1.47% | 33.2333 |
18 | | 16 | mkd | Macedonian | 77938 | 1.04% | 27.3793 |
19 | | 17 | mar | Marathi | 64126 | 0.86% | 27.587 |
20 | | 18 | lit | Lithuanian | 59659 | 0.80% | 30.1439 |
21 | | 19 | ces | Czech | 57030 | 0.76% | 28.3683 |
22 | | 20 | dan | Danish | 49399 | 0.66% | 33.7159 |
23 | | 21 | swe | Swedish | 41677 | 0.56% | 30.1428 |
24 | | 22 | ara | Arabic | 35991 | 0.48% | 26.7817 |
25 | | 23 | ell | Greek | 34071 | 0.46% | 30.3915 |
26 | | 24 | ron | Romanian | 24943 | 0.33% | 34.4097 |
27 | | 25 | bul | Bulgarian | 24503 | 0.33% | 31.7201 |
28 | | 26 | vie | Vietnamese | 19234 | 0.26% | 38.7891 |
29 | | 27 | fil | Filipino | 16649 | 0.22% | 36.8098 |
30 | | 28 | slk | Slovak | 14660 | 0.20% | 25.7422 |
31 | | 29 | ind | Indonesian | 14542 | 0.19% | 37.4785 |
32 | | 30 | hin | Hindi | 14230 | 0.19% | 27.6058 |
33 | | 31 | nob | Norwegian Bokmål | 14223 | 0.19% | 37.4732 |
34 | | 32 | cat | Catalan | 7971 | 0.11% | 37.334 |
35 | | 33 | kor | Korean | 7570 | 0.10% | 16.8085 |
36 | | 34 | hrv | Croatian | 5204 | 0.07% | 30.058 |
37 | | 35 | ben | Bangla | 4714 | 0.06% | 23.7809 |
38 | | 36 | afr | Afrikaans | 4031 | 0.05% | 29.676 |
39 | | 37 | est | Estonian | 3637 | 0.05% | 27.6646 |
40 | | 38 | tha | Thai | 3528 | 0.05% | 20.5697 |
41 | | 39 | sqi | Albanian | 2526 | 0.03% | 32.2743 |
42 | | 40 | urd | Urdu | 2008 | 0.03% | 30.7495 |
43 | | 41 | cym | Welsh | 1344 | 0.02% | 29.3058 |
44 | | 42 | slv | Slovenian | 1093 | 0.01% | 28.4282 |
45 | | 43 | mal | Malayalam | 827 | 0.01% | 36.8222 |
46 | | 44 | tam | Tamil | 334 | 0.00% | 35.2784 |
47 | | 45 | tel | Telugu | 254 | 0.00% | 28.0157 |
48 | | 46 | pan | Punjabi | 196 | 0.00% | 32.8622 |
49 | | 47 | kan | Kannada | 176 | 0.00% | 35.3636 |
50 | | 48 | guj | Gujarati | 168 | 0.00% | 24.244 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/results.md:
--------------------------------------------------------------------------------
1 | # Aggregated results for tatoeba-sentences-2021-06-05
2 |
3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics |
4 | |:--------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md#supported-languages) | 9,640,185 (87.64%) | [98.27%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext/classification_performance.md#metrics-per-language) |
6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md#supported-languages) | 9,640,185 (87.64%) | [96.81%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/fasttext-compressed/classification_performance.md#metrics-per-language) |
7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md#supported-languages) | 9,640,185 (85.70%) | [87.11%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md#metrics-per-language) |
8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md#supported-languages) | 9,640,185 (77.40%) | [92.45%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md#metrics-per-language) |
9 | | langid | [97](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md#supported-languages) | 9,640,185 (86.08%) | [89.00%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md#metrics-per-language) |
10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md#supported-languages) | 9,640,185 (78.52%) | [86.95%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md#metrics-per-language) |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/results.md:
--------------------------------------------------------------------------------
1 | # Aggregated results for open-subtitles-v2018-100k-per-lang
2 |
3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics |
4 | |:--------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [80.16%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md#metrics-per-language) |
6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [75.21%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md#metrics-per-language) |
7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [73.08%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md#metrics-per-language) |
8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [79.48%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md#metrics-per-language) |
9 | | langid | [97](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [74.19%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md#metrics-per-language) |
10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md#supported-languages) | 4,236,418 (100.00%) | [68.41%](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md#metrics-per-language) |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/results.md:
--------------------------------------------------------------------------------
1 | # Aggregated results for tatoeba-sentences-2021-06-05-common-48
2 |
3 | | Library | Supported languages | # sentences supported | Aggregated accuracy | Per language metrics |
4 | |:--------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
5 | | fasttext | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [98.94%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md#metrics-per-language) |
6 | | fasttext-compressed | [176](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [97.90%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md#metrics-per-language) |
7 | | gcld3 | [107](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [86.98%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md#metrics-per-language) |
8 | | langdetect | [55](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [92.47%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md#metrics-per-language) |
9 | | langid | [97](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [90.15%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md#metrics-per-language) |
10 | | pycld2 | [83](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md#supported-languages) | 7,461,627 (100.00%) | [87.12%](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md) | [See metrics](https://github.com/modelpredict/language-identification-survey/blob/main/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md#metrics-per-language) |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/langdetect/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langdetect on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 79.48%**
5 |
6 | Supported languages (55)
7 |
8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 1.000 | 0.996 | 0.998 | 99615 | 0 | 4136418 | 385 |
15 | | 2 | swe | Swedish | 100000 | 0.853 | 0.734 | 0.739 | 73427 | 12668 | 4123750 | 26573 |
16 | | 3 | mkd | Macedonian | 100000 | 0.732 | 0.831 | 0.682 | 83087 | 30346 | 4106072 | 16913 |
17 | | 4 | tha | Thai | 100000 | 1.000 | 0.959 | 0.979 | 95853 | 0 | 4136418 | 4147 |
18 | | 5 | cat | Catalan | 100000 | 0.790 | 0.649 | 0.651 | 64878 | 17269 | 4119149 | 35122 |
19 | | 6 | bul | Bulgarian | 100000 | 0.718 | 0.694 | 0.620 | 69446 | 27311 | 4109107 | 30554 |
20 | | 7 | fin | Finnish | 100000 | 0.828 | 0.883 | 0.785 | 88258 | 18356 | 4118062 | 11742 |
21 | | 8 | dan | Danish | 100000 | 0.681 | 0.589 | 0.550 | 58899 | 27614 | 4108804 | 41101 |
22 | | 9 | hun | Hungarian | 100000 | 0.915 | 0.823 | 0.833 | 82309 | 7622 | 4128796 | 17691 |
23 | | 10 | kor | Korean | 100000 | 0.774 | 0.962 | 0.763 | 96150 | 28018 | 4108400 | 3850 |
24 | | 11 | spa | Spanish | 100000 | 0.791 | 0.707 | 0.680 | 70674 | 18621 | 4117797 | 29326 |
25 | | 12 | zho | Chinese | 100000 | 0.983 | 0.593 | 0.734 | 59251 | 1047 | 4135371 | 40749 |
26 | | 13 | slk | Slovak | 100000 | 0.766 | 0.628 | 0.624 | 62796 | 19226 | 4117192 | 37204 |
27 | | 14 | ron | Romanian | 100000 | 0.862 | 0.774 | 0.766 | 77434 | 12373 | 4124045 | 22566 |
28 | | 15 | ind | Indonesian | 100000 | 0.705 | 0.783 | 0.643 | 78346 | 32709 | 4103709 | 21654 |
29 | | 16 | est | Estonian | 100000 | 0.814 | 0.777 | 0.729 | 77744 | 17757 | 4118661 | 22256 |
30 | | 17 | por | Portuguese | 100000 | 0.676 | 0.751 | 0.608 | 75080 | 35971 | 4100447 | 24920 |
31 | | 18 | hrv | Croatian | 100000 | 0.670 | 0.652 | 0.568 | 65173 | 32078 | 4104340 | 34827 |
32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.990 | 0.995 | 98997 | 0 | 4136418 | 1003 |
33 | | 20 | ita | Italian | 100000 | 0.734 | 0.780 | 0.666 | 78012 | 28203 | 4108215 | 21988 |
34 | | 21 | slv | Slovenian | 100000 | 0.657 | 0.653 | 0.559 | 65297 | 34095 | 4102323 | 34703 |
35 | | 22 | ces | Czech | 100000 | 0.825 | 0.706 | 0.704 | 70563 | 14998 | 4121420 | 29437 |
36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.976 | 0.988 | 97615 | 0 | 4136418 | 2385 |
37 | | 24 | lit | Lithuanian | 100000 | 0.873 | 0.770 | 0.772 | 77002 | 11212 | 4125206 | 22998 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.851 | 0.545 | 0.628 | 54530 | 9511 | 4126907 | 45470 |
39 | | 26 | pol | Polish | 100000 | 0.902 | 0.867 | 0.844 | 86670 | 9390 | 4127028 | 13330 |
40 | | 27 | fas | Persian | 100000 | 0.959 | 0.829 | 0.872 | 82872 | 3550 | 4132868 | 17128 |
41 | | 28 | jpn | Japanese | 100000 | 0.999 | 0.944 | 0.971 | 94447 | 59 | 4136359 | 5553 |
42 | | 29 | hin | Hindi | 100000 | 1.000 | 0.819 | 0.901 | 81904 | 0 | 4136418 | 18096 |
43 | | 30 | eng | English | 100000 | 0.593 | 0.749 | 0.539 | 74888 | 51451 | 4084967 | 25112 |
44 | | 31 | sqi | Albanian | 100000 | 0.932 | 0.808 | 0.839 | 80757 | 5914 | 4130504 | 19243 |
45 | | 32 | rus | Russian | 100000 | 0.648 | 0.793 | 0.597 | 79285 | 43117 | 4093301 | 20715 |
46 | | 33 | fra | French | 100000 | 0.757 | 0.776 | 0.682 | 77581 | 24894 | 4111524 | 22419 |
47 | | 34 | lav | Latvian | 100000 | 0.920 | 0.811 | 0.831 | 81127 | 7057 | 4129361 | 18873 |
48 | | 35 | deu | German | 100000 | 0.670 | 0.841 | 0.630 | 84127 | 41422 | 4094996 | 15873 |
49 | | 36 | tur | Turkish | 100000 | 0.895 | 0.851 | 0.829 | 85053 | 10012 | 4126406 | 14947 |
50 | | 37 | ara | Arabic | 100000 | 0.895 | 0.936 | 0.868 | 93567 | 11010 | 4125408 | 6433 |
51 | | 38 | vie | Vietnamese | 100000 | 0.927 | 0.917 | 0.890 | 91653 | 7171 | 4129247 | 8347 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.635 | 0.647 | 0.541 | 64662 | 37243 | 4099175 | 35338 |
53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.971 | 0.985 | 97127 | 0 | 4136418 | 2873 |
54 | | 41 | nld | Dutch | 100000 | 0.774 | 0.687 | 0.658 | 68682 | 20042 | 4116376 | 31318 |
55 | | 42 | urd | Urdu | 46523 | 0.880 | 0.884 | 0.832 | 41104 | 5590 | 4184305 | 5419 |
56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.952 | 0.975 | 38236 | 2 | 4196251 | 1929 |
57 | | 44 | tel | Telugu | 30416 | 1.000 | 0.941 | 0.970 | 28630 | 0 | 4206002 | 1786 |
58 | | 45 | fil | Filipino | 19314 | 0.378 | 0.746 | 0.356 | 14401 | 23647 | 4193457 | 4913 |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/pycld2/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for pycld2 on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 68.41%**
5 |
6 | Supported languages (83)
7 |
8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 1.000 | 0.997 | 0.998 | 99664 | 26 | 4136392 | 336 |
15 | | 2 | swe | Swedish | 100000 | 0.995 | 0.632 | 0.771 | 63168 | 297 | 4136121 | 36832 |
16 | | 3 | mkd | Macedonian | 100000 | 0.973 | 0.452 | 0.612 | 45227 | 1246 | 4135172 | 54773 |
17 | | 4 | tha | Thai | 100000 | 1.000 | 0.961 | 0.980 | 96100 | 0 | 4136418 | 3900 |
18 | | 5 | cat | Catalan | 100000 | 0.991 | 0.484 | 0.648 | 48417 | 464 | 4135954 | 51583 |
19 | | 6 | bul | Bulgarian | 100000 | 0.969 | 0.570 | 0.710 | 57021 | 1805 | 4134613 | 42979 |
20 | | 7 | fin | Finnish | 100000 | 0.996 | 0.747 | 0.852 | 74729 | 300 | 4136118 | 25271 |
21 | | 8 | dan | Danish | 100000 | 0.873 | 0.573 | 0.659 | 57328 | 8331 | 4128087 | 42672 |
22 | | 9 | hun | Hungarian | 100000 | 0.999 | 0.764 | 0.865 | 76416 | 109 | 4136309 | 23584 |
23 | | 10 | kor | Korean | 100000 | 1.000 | 0.891 | 0.942 | 89101 | 0 | 4136418 | 10899 |
24 | | 11 | spa | Spanish | 100000 | 0.971 | 0.648 | 0.769 | 64824 | 1918 | 4134500 | 35176 |
25 | | 12 | zho | Chinese | 100000 | 0.999 | 0.684 | 0.812 | 68435 | 35 | 4136383 | 31565 |
26 | | 13 | slk | Slovak | 100000 | 0.904 | 0.658 | 0.732 | 65838 | 7024 | 4129394 | 34162 |
27 | | 14 | ron | Romanian | 100000 | 0.995 | 0.666 | 0.796 | 66604 | 367 | 4136051 | 33396 |
28 | | 15 | ind | Indonesian | 100000 | 0.970 | 0.626 | 0.752 | 62612 | 1912 | 4134506 | 37388 |
29 | | 16 | est | Estonian | 100000 | 0.993 | 0.668 | 0.797 | 66835 | 448 | 4135970 | 33165 |
30 | | 17 | por | Portuguese | 100000 | 0.938 | 0.667 | 0.760 | 66656 | 4394 | 4132024 | 33344 |
31 | | 18 | hrv | Croatian | 100000 | 0.968 | 0.468 | 0.625 | 46827 | 1560 | 4134858 | 53173 |
32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.619 | 0.765 | 61882 | 1 | 4136417 | 38118 |
33 | | 20 | ita | Italian | 100000 | 0.992 | 0.504 | 0.667 | 50426 | 396 | 4136022 | 49574 |
34 | | 21 | slv | Slovenian | 100000 | 0.991 | 0.444 | 0.612 | 44391 | 384 | 4136034 | 55609 |
35 | | 22 | ces | Czech | 100000 | 0.895 | 0.716 | 0.760 | 71587 | 8361 | 4128057 | 28413 |
36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.977 | 0.988 | 97651 | 0 | 4136418 | 2349 |
37 | | 24 | lit | Lithuanian | 100000 | 0.995 | 0.659 | 0.791 | 65892 | 340 | 4136078 | 34108 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.989 | 0.456 | 0.622 | 45595 | 484 | 4135934 | 54405 |
39 | | 26 | pol | Polish | 100000 | 0.997 | 0.729 | 0.841 | 72937 | 221 | 4136197 | 27063 |
40 | | 27 | fas | Persian | 100000 | 0.998 | 0.604 | 0.752 | 60362 | 119 | 4136299 | 39638 |
41 | | 28 | jpn | Japanese | 100000 | 0.988 | 0.911 | 0.942 | 91104 | 1114 | 4135304 | 8896 |
42 | | 29 | hin | Hindi | 100000 | 1.000 | 0.847 | 0.917 | 84721 | 0 | 4136418 | 15279 |
43 | | 30 | eng | English | 100000 | 0.620 | 0.744 | 0.560 | 74428 | 45610 | 4090808 | 25572 |
44 | | 31 | sqi | Albanian | 100000 | 0.999 | 0.708 | 0.828 | 70830 | 97 | 4136321 | 29170 |
45 | | 32 | rus | Russian | 100000 | 0.821 | 0.638 | 0.666 | 63789 | 13928 | 4122490 | 36211 |
46 | | 33 | fra | French | 100000 | 0.990 | 0.644 | 0.777 | 64405 | 654 | 4135764 | 35595 |
47 | | 34 | lav | Latvian | 100000 | 0.995 | 0.674 | 0.802 | 67436 | 321 | 4136097 | 32564 |
48 | | 35 | deu | German | 100000 | 0.991 | 0.733 | 0.840 | 73344 | 634 | 4135784 | 26656 |
49 | | 36 | tur | Turkish | 100000 | 0.999 | 0.747 | 0.854 | 74678 | 92 | 4136326 | 25322 |
50 | | 37 | ara | Arabic | 100000 | 0.986 | 0.653 | 0.781 | 65309 | 935 | 4135483 | 34691 |
51 | | 38 | vie | Vietnamese | 100000 | 0.999 | 0.773 | 0.871 | 77288 | 59 | 4136359 | 22712 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.802 | 0.634 | 0.652 | 63433 | 15634 | 4120784 | 36567 |
53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.639 | 0.780 | 63868 | 0 | 4136418 | 36132 |
54 | | 41 | nld | Dutch | 100000 | 0.996 | 0.676 | 0.804 | 67614 | 251 | 4136167 | 32386 |
55 | | 42 | urd | Urdu | 46523 | 0.998 | 0.677 | 0.806 | 31508 | 57 | 4189838 | 15015 |
56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.954 | 0.976 | 38307 | 2 | 4196251 | 1858 |
57 | | 44 | tel | Telugu | 30416 | 1.000 | 0.945 | 0.972 | 28732 | 0 | 4206002 | 1684 |
58 | | 45 | fil | Filipino | 19314 | 0.996 | 0.550 | 0.708 | 10626 | 43 | 4217061 | 8688 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/langdetect/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langdetect on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 92.47%**
5 |
6 | Supported languages (55)
7 |
8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:|
14 | | 1 | eng | English | 1479733 | 0.988 | 0.933 | 0.954 | 1381214 | 16940 | 5964954 | 98519 |
15 | | 2 | rus | Russian | 849653 | 0.970 | 0.916 | 0.928 | 778060 | 24120 | 6587854 | 71593 |
16 | | 3 | ita | Italian | 787053 | 0.974 | 0.897 | 0.922 | 705735 | 19187 | 6655387 | 81318 |
17 | | 4 | tur | Turkish | 709573 | 0.996 | 0.971 | 0.982 | 689314 | 2543 | 6749511 | 20259 |
18 | | 5 | deu | German | 553727 | 0.985 | 0.967 | 0.969 | 535515 | 8255 | 6899645 | 18212 |
19 | | 6 | fra | French | 466192 | 0.944 | 0.947 | 0.920 | 441289 | 25946 | 6969489 | 24903 |
20 | | 7 | por | Portuguese | 385737 | 0.877 | 0.900 | 0.836 | 346971 | 48693 | 7027197 | 38766 |
21 | | 8 | spa | Spanish | 338781 | 0.916 | 0.831 | 0.838 | 281382 | 25647 | 7097199 | 57399 |
22 | | 9 | hun | Hungarian | 323048 | 0.991 | 0.950 | 0.965 | 306757 | 2935 | 7135644 | 16291 |
23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208592 | 0 | 7252866 | 169 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197226 | 0 | 7264401 | 0 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.895 | 0.796 | 0.803 | 136621 | 15996 | 7273957 | 35053 |
26 | | 13 | nld | Dutch | 144340 | 0.872 | 0.815 | 0.793 | 117650 | 17344 | 7299943 | 26690 |
27 | | 14 | fin | Finnish | 128011 | 0.943 | 0.971 | 0.930 | 124354 | 7511 | 7326105 | 3657 |
28 | | 15 | pol | Polish | 109662 | 0.985 | 0.972 | 0.971 | 106609 | 1641 | 7350324 | 3053 |
29 | | 16 | mkd | Macedonian | 77938 | 0.684 | 0.889 | 0.656 | 69298 | 32058 | 7351631 | 8640 |
30 | | 17 | mar | Marathi | 64126 | 0.997 | 0.932 | 0.962 | 59755 | 190 | 7397311 | 4371 |
31 | | 18 | lit | Lithuanian | 59659 | 0.934 | 0.944 | 0.908 | 56302 | 4006 | 7397962 | 3357 |
32 | | 19 | ces | Czech | 57030 | 0.937 | 0.848 | 0.865 | 48335 | 3225 | 7401372 | 8695 |
33 | | 20 | dan | Danish | 49399 | 0.697 | 0.697 | 0.606 | 34438 | 14948 | 7397280 | 14961 |
34 | | 21 | swe | Swedish | 41677 | 0.815 | 0.852 | 0.761 | 35494 | 8046 | 7411904 | 6183 |
35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.979 | 0.989 | 35231 | 4 | 7425632 | 760 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 2 | 7427554 | 0 |
37 | | 24 | ron | Romanian | 24943 | 0.541 | 0.942 | 0.532 | 23508 | 19948 | 7416736 | 1435 |
38 | | 25 | bul | Bulgarian | 24503 | 0.284 | 0.783 | 0.273 | 19182 | 48422 | 7388702 | 5321 |
39 | | 26 | vie | Vietnamese | 19234 | 0.971 | 0.999 | 0.970 | 19220 | 580 | 7441813 | 14 |
40 | | 27 | fil | Filipino | 16649 | 0.579 | 0.943 | 0.569 | 15707 | 11441 | 7433537 | 942 |
41 | | 28 | slk | Slovak | 14660 | 0.520 | 0.762 | 0.481 | 11175 | 10312 | 7436655 | 3485 |
42 | | 29 | ind | Indonesian | 14542 | 0.496 | 0.943 | 0.488 | 13717 | 13953 | 7433132 | 825 |
43 | | 30 | hin | Hindi | 14230 | 0.785 | 0.957 | 0.772 | 13622 | 3722 | 7443675 | 608 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.250 | 0.816 | 0.243 | 11613 | 34844 | 7412560 | 2610 |
45 | | 32 | cat | Catalan | 7971 | 0.143 | 0.839 | 0.141 | 6686 | 39974 | 7413682 | 1285 |
46 | | 33 | kor | Korean | 7570 | 0.986 | 0.999 | 0.985 | 7560 | 108 | 7453949 | 10 |
47 | | 34 | hrv | Croatian | 5204 | 0.333 | 0.803 | 0.320 | 4181 | 8360 | 7448063 | 1023 |
48 | | 35 | ben | Bangla | 4714 | 1.000 | 1.000 | 1.000 | 4714 | 0 | 7456913 | 0 |
49 | | 36 | afr | Afrikaans | 4031 | 0.072 | 0.855 | 0.072 | 3447 | 44438 | 7413158 | 584 |
50 | | 37 | est | Estonian | 3637 | 0.195 | 0.859 | 0.192 | 3124 | 12874 | 7445116 | 513 |
51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458099 | 0 |
52 | | 39 | sqi | Albanian | 2526 | 0.571 | 0.947 | 0.562 | 2391 | 1794 | 7457307 | 135 |
53 | | 40 | urd | Urdu | 2008 | 0.921 | 0.992 | 0.918 | 1992 | 170 | 7459449 | 16 |
54 | | 41 | cym | Welsh | 1344 | 0.143 | 0.940 | 0.142 | 1263 | 7564 | 7452719 | 81 |
55 | | 42 | slv | Slovenian | 1093 | 0.076 | 0.767 | 0.075 | 838 | 10179 | 7450355 | 255 |
56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 |
57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 |
58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 |
59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 |
60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 |
61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461459 | 0 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langdetect/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langdetect on tatoeba-sentences-2021-06-05
2 |
3 | - Dataset coverage (sentences in supported languages): 7461707 (77.40%)
4 | - **Aggregated accuracy: 92.45%**
5 |
6 | Supported languages (55)
7 |
8 | afr (Afrikaans), ara (Arabic), bul (Bulgarian), ben (Bangla), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), eng (English), spa (Spanish), est (Estonian), fas (Persian), fin (Finnish), fra (French), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hun (Hungarian), ind (Indonesian), ita (Italian), jpn (Japanese), kan (Kannada), kor (Korean), lit (Lithuanian), lav (Latvian), mkd (Macedonian), mal (Malayalam), mar (Marathi), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), pan (Punjabi), pol (Polish), por (Portuguese), ron (Romanian), rus (Russian), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), zho (Chinese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:|
14 | | 1 | eng | English | 1479733 | 0.988 | 0.933 | 0.954 | 1380911 | 17010 | 5964964 | 98822 |
15 | | 2 | rus | Russian | 849653 | 0.970 | 0.916 | 0.928 | 777866 | 24227 | 6587827 | 71787 |
16 | | 3 | ita | Italian | 787053 | 0.973 | 0.897 | 0.922 | 705630 | 19337 | 6655317 | 81423 |
17 | | 4 | tur | Turkish | 709573 | 0.996 | 0.971 | 0.982 | 689294 | 2589 | 6749545 | 20279 |
18 | | 5 | deu | German | 553727 | 0.985 | 0.967 | 0.968 | 535363 | 8311 | 6899669 | 18364 |
19 | | 6 | fra | French | 466192 | 0.945 | 0.946 | 0.920 | 441184 | 25895 | 6969620 | 25008 |
20 | | 7 | por | Portuguese | 385737 | 0.877 | 0.899 | 0.836 | 346968 | 48757 | 7027213 | 38769 |
21 | | 8 | spa | Spanish | 338781 | 0.917 | 0.830 | 0.838 | 281211 | 25423 | 7097503 | 57570 |
22 | | 9 | hun | Hungarian | 323048 | 0.990 | 0.950 | 0.965 | 306776 | 3022 | 7135637 | 16272 |
23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208586 | 0 | 7252946 | 175 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197225 | 0 | 7264481 | 1 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.895 | 0.796 | 0.803 | 136663 | 16080 | 7273953 | 35011 |
26 | | 13 | nld | Dutch | 144340 | 0.871 | 0.815 | 0.793 | 117639 | 17432 | 7299935 | 26701 |
27 | | 14 | fin | Finnish | 128011 | 0.943 | 0.971 | 0.930 | 124344 | 7534 | 7326162 | 3667 |
28 | | 15 | pol | Polish | 109662 | 0.985 | 0.972 | 0.971 | 106595 | 1645 | 7350400 | 3067 |
29 | | 16 | mkd | Macedonian | 77938 | 0.684 | 0.889 | 0.656 | 69298 | 32067 | 7351702 | 8640 |
30 | | 17 | mar | Marathi | 64126 | 0.997 | 0.932 | 0.962 | 59783 | 194 | 7397387 | 4343 |
31 | | 18 | lit | Lithuanian | 59659 | 0.933 | 0.944 | 0.907 | 56303 | 4063 | 7397985 | 3356 |
32 | | 19 | ces | Czech | 57030 | 0.937 | 0.848 | 0.864 | 48335 | 3243 | 7401434 | 8695 |
33 | | 20 | dan | Danish | 49399 | 0.695 | 0.697 | 0.604 | 34415 | 15080 | 7397228 | 14984 |
34 | | 21 | swe | Swedish | 41677 | 0.815 | 0.852 | 0.761 | 35494 | 8039 | 7411991 | 6183 |
35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.979 | 0.989 | 35240 | 4 | 7425712 | 751 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 2 | 7427634 | 0 |
37 | | 24 | ron | Romanian | 24943 | 0.539 | 0.942 | 0.531 | 23490 | 20051 | 7416713 | 1453 |
38 | | 25 | bul | Bulgarian | 24503 | 0.284 | 0.782 | 0.273 | 19154 | 48399 | 7388805 | 5349 |
39 | | 26 | vie | Vietnamese | 19234 | 0.969 | 0.999 | 0.969 | 19220 | 607 | 7441866 | 14 |
40 | | 27 | fil | Filipino | 16649 | 0.579 | 0.941 | 0.568 | 15674 | 11413 | 7433645 | 975 |
41 | | 28 | slk | Slovak | 14660 | 0.519 | 0.762 | 0.480 | 11167 | 10366 | 7436681 | 3493 |
42 | | 29 | ind | Indonesian | 14542 | 0.495 | 0.942 | 0.488 | 13698 | 13963 | 7433202 | 844 |
43 | | 30 | hin | Hindi | 14230 | 0.786 | 0.957 | 0.773 | 13618 | 3697 | 7443780 | 612 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.251 | 0.818 | 0.244 | 11639 | 34806 | 7412678 | 2584 |
45 | | 32 | cat | Catalan | 7971 | 0.143 | 0.841 | 0.141 | 6702 | 40066 | 7413670 | 1269 |
46 | | 33 | kor | Korean | 7570 | 0.985 | 0.999 | 0.984 | 7559 | 119 | 7454018 | 11 |
47 | | 34 | hrv | Croatian | 5204 | 0.334 | 0.805 | 0.321 | 4189 | 8362 | 7448141 | 1015 |
48 | | 35 | ben | Bangla | 4714 | 1.000 | 1.000 | 1.000 | 4714 | 0 | 7456993 | 0 |
49 | | 36 | afr | Afrikaans | 4031 | 0.072 | 0.854 | 0.071 | 3441 | 44440 | 7413236 | 590 |
50 | | 37 | est | Estonian | 3637 | 0.196 | 0.860 | 0.193 | 3129 | 12850 | 7445220 | 508 |
51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458179 | 0 |
52 | | 39 | sqi | Albanian | 2526 | 0.564 | 0.947 | 0.555 | 2391 | 1846 | 7457335 | 135 |
53 | | 40 | urd | Urdu | 2008 | 0.922 | 0.992 | 0.918 | 1991 | 169 | 7459530 | 17 |
54 | | 41 | cym | Welsh | 1344 | 0.142 | 0.940 | 0.142 | 1264 | 7614 | 7452749 | 80 |
55 | | 42 | slv | Slovenian | 1093 | 0.074 | 0.752 | 0.073 | 822 | 10262 | 7450352 | 271 |
56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460880 | 0 |
57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461373 | 0 |
58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461453 | 0 |
59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461511 | 0 |
60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461531 | 0 |
61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461539 | 0 |
62 | | 49 | som | Somali | 80 | 0.014 | 0.988 | 0.014 | 79 | 5524 | 7456103 | 1 |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/langid/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langid on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 74.19%**
5 |
6 | Supported languages (97)
7 |
8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 0.999 | 0.996 | 0.998 | 99645 | 69 | 4136349 | 355 |
15 | | 2 | swe | Swedish | 100000 | 0.856 | 0.710 | 0.728 | 70956 | 11937 | 4124481 | 29044 |
16 | | 3 | mkd | Macedonian | 100000 | 0.782 | 0.433 | 0.517 | 43309 | 12082 | 4124336 | 56691 |
17 | | 4 | tha | Thai | 100000 | 1.000 | 0.961 | 0.980 | 96095 | 20 | 4136398 | 3905 |
18 | | 5 | cat | Catalan | 100000 | 0.930 | 0.505 | 0.639 | 50550 | 3830 | 4132588 | 49450 |
19 | | 6 | bul | Bulgarian | 100000 | 0.689 | 0.570 | 0.547 | 57031 | 25798 | 4110620 | 42969 |
20 | | 7 | fin | Finnish | 100000 | 0.813 | 0.826 | 0.749 | 82642 | 18954 | 4117464 | 17358 |
21 | | 8 | dan | Danish | 100000 | 0.674 | 0.521 | 0.514 | 52077 | 25239 | 4111179 | 47923 |
22 | | 9 | hun | Hungarian | 100000 | 0.884 | 0.795 | 0.794 | 79534 | 10437 | 4125981 | 20466 |
23 | | 10 | kor | Korean | 100000 | 0.999 | 0.962 | 0.980 | 96165 | 68 | 4136350 | 3835 |
24 | | 11 | spa | Spanish | 100000 | 0.615 | 0.719 | 0.549 | 71937 | 44996 | 4091422 | 28063 |
25 | | 12 | zho | Chinese | 100000 | 0.890 | 0.916 | 0.855 | 91584 | 11311 | 4125107 | 8416 |
26 | | 13 | slk | Slovak | 100000 | 0.796 | 0.575 | 0.615 | 57535 | 14775 | 4121643 | 42465 |
27 | | 14 | ron | Romanian | 100000 | 0.934 | 0.682 | 0.767 | 68239 | 4830 | 4131588 | 31761 |
28 | | 15 | ind | Indonesian | 100000 | 0.875 | 0.529 | 0.630 | 52912 | 7540 | 4128878 | 47088 |
29 | | 16 | est | Estonian | 100000 | 0.867 | 0.632 | 0.692 | 63160 | 9726 | 4126692 | 36840 |
30 | | 17 | por | Portuguese | 100000 | 0.833 | 0.646 | 0.678 | 64621 | 12978 | 4123440 | 35379 |
31 | | 18 | hrv | Croatian | 100000 | 0.803 | 0.505 | 0.576 | 50470 | 12351 | 4124067 | 49530 |
32 | | 19 | heb | Hebrew | 100000 | 0.999 | 0.974 | 0.985 | 97363 | 121 | 4136297 | 2637 |
33 | | 20 | ita | Italian | 100000 | 0.729 | 0.711 | 0.635 | 71086 | 26488 | 4109930 | 28914 |
34 | | 21 | slv | Slovenian | 100000 | 0.631 | 0.569 | 0.509 | 56891 | 33249 | 4103169 | 43109 |
35 | | 22 | ces | Czech | 100000 | 0.768 | 0.694 | 0.657 | 69366 | 20953 | 4115465 | 30634 |
36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.977 | 0.988 | 97661 | 2 | 4136416 | 2339 |
37 | | 24 | lit | Lithuanian | 100000 | 0.821 | 0.710 | 0.703 | 71019 | 15497 | 4120921 | 28981 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.805 | 0.500 | 0.574 | 49995 | 12080 | 4124338 | 50005 |
39 | | 26 | pol | Polish | 100000 | 0.841 | 0.862 | 0.787 | 86150 | 16323 | 4120095 | 13850 |
40 | | 27 | fas | Persian | 100000 | 0.830 | 0.526 | 0.604 | 52589 | 10748 | 4125670 | 47411 |
41 | | 28 | jpn | Japanese | 100000 | 0.974 | 0.966 | 0.957 | 96556 | 2623 | 4133795 | 3444 |
42 | | 29 | hin | Hindi | 100000 | 0.999 | 0.788 | 0.881 | 78804 | 65 | 4136353 | 21196 |
43 | | 30 | eng | English | 100000 | 0.239 | 0.912 | 0.236 | 91242 | 291069 | 3845349 | 8758 |
44 | | 31 | sqi | Albanian | 100000 | 0.987 | 0.747 | 0.846 | 74695 | 995 | 4135423 | 25305 |
45 | | 32 | rus | Russian | 100000 | 0.592 | 0.710 | 0.528 | 71033 | 48977 | 4087441 | 28967 |
46 | | 33 | fra | French | 100000 | 0.675 | 0.766 | 0.612 | 76607 | 36932 | 4099486 | 23393 |
47 | | 34 | lav | Latvian | 100000 | 0.930 | 0.749 | 0.804 | 74883 | 5659 | 4130759 | 25117 |
48 | | 35 | deu | German | 100000 | 0.639 | 0.827 | 0.599 | 82719 | 46756 | 4089662 | 17281 |
49 | | 36 | tur | Turkish | 100000 | 0.939 | 0.778 | 0.828 | 77837 | 5089 | 4131329 | 22163 |
50 | | 37 | ara | Arabic | 100000 | 0.847 | 0.888 | 0.804 | 88771 | 15980 | 4120438 | 11229 |
51 | | 38 | vie | Vietnamese | 100000 | 0.962 | 0.899 | 0.912 | 89889 | 3570 | 4132848 | 10111 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.618 | 0.565 | 0.499 | 56473 | 34838 | 4101580 | 43527 |
53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.929 | 0.963 | 92897 | 10 | 4136408 | 7103 |
54 | | 41 | nld | Dutch | 100000 | 0.831 | 0.780 | 0.744 | 77955 | 15815 | 4120603 | 22045 |
55 | | 42 | urd | Urdu | 46523 | 0.710 | 0.763 | 0.640 | 35497 | 14494 | 4175401 | 11026 |
56 | | 43 | tam | Tamil | 40165 | 1.000 | 0.950 | 0.974 | 38148 | 5 | 4196248 | 2017 |
57 | | 44 | tel | Telugu | 30416 | 0.999 | 0.945 | 0.971 | 28735 | 23 | 4205979 | 1681 |
58 | | 45 | fil | Filipino | 19314 | 0.783 | 0.503 | 0.564 | 9714 | 2695 | 4214409 | 9600 |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/gcld3/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for gcld3 on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 73.08%**
5 |
6 | Supported languages (107)
7 |
8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 0.930 | 0.991 | 0.927 | 99142 | 7411 | 4129007 | 858 |
15 | | 2 | swe | Swedish | 100000 | 0.835 | 0.742 | 0.729 | 74175 | 14708 | 4121710 | 25825 |
16 | | 3 | mkd | Macedonian | 100000 | 0.891 | 0.647 | 0.717 | 64667 | 7898 | 4128520 | 35333 |
17 | | 4 | tha | Thai | 100000 | 0.997 | 0.942 | 0.968 | 94203 | 263 | 4136155 | 5797 |
18 | | 5 | cat | Catalan | 100000 | 0.819 | 0.644 | 0.668 | 64430 | 14233 | 4122185 | 35570 |
19 | | 6 | bul | Bulgarian | 100000 | 0.707 | 0.707 | 0.617 | 70659 | 29255 | 4107163 | 29341 |
20 | | 7 | fin | Finnish | 100000 | 0.807 | 0.784 | 0.726 | 78440 | 18791 | 4117627 | 21560 |
21 | | 8 | dan | Danish | 100000 | 0.761 | 0.614 | 0.614 | 61404 | 19263 | 4117155 | 38596 |
22 | | 9 | hun | Hungarian | 100000 | 0.864 | 0.757 | 0.759 | 75722 | 11924 | 4124494 | 24278 |
23 | | 10 | kor | Korean | 100000 | 0.969 | 0.955 | 0.947 | 95514 | 3081 | 4133337 | 4486 |
24 | | 11 | spa | Spanish | 100000 | 0.795 | 0.666 | 0.663 | 66646 | 17195 | 4119223 | 33354 |
25 | | 12 | zho | Chinese | 100000 | 0.919 | 0.858 | 0.854 | 85780 | 7585 | 4128833 | 14220 |
26 | | 13 | slk | Slovak | 100000 | 0.789 | 0.608 | 0.629 | 60799 | 16304 | 4120114 | 39201 |
27 | | 14 | ron | Romanian | 100000 | 0.898 | 0.611 | 0.698 | 61108 | 6976 | 4129442 | 38892 |
28 | | 15 | ind | Indonesian | 100000 | 0.858 | 0.470 | 0.578 | 47018 | 7801 | 4128617 | 52982 |
29 | | 16 | est | Estonian | 100000 | 0.819 | 0.710 | 0.702 | 71021 | 15723 | 4120695 | 28979 |
30 | | 17 | por | Portuguese | 100000 | 0.814 | 0.677 | 0.681 | 67704 | 15509 | 4120909 | 32296 |
31 | | 18 | hrv | Croatian | 100000 | 0.743 | 0.357 | 0.445 | 35706 | 12370 | 4124048 | 64294 |
32 | | 19 | heb | Hebrew | 100000 | 0.999 | 0.929 | 0.962 | 92883 | 66 | 4136352 | 7117 |
33 | | 20 | ita | Italian | 100000 | 0.815 | 0.695 | 0.691 | 69451 | 15732 | 4120686 | 30549 |
34 | | 21 | slv | Slovenian | 100000 | 0.778 | 0.627 | 0.632 | 62668 | 17892 | 4118526 | 37332 |
35 | | 22 | ces | Czech | 100000 | 0.828 | 0.659 | 0.682 | 65903 | 13732 | 4122686 | 34097 |
36 | | 23 | mal | Malayalam | 100000 | 1.000 | 0.958 | 0.979 | 95809 | 0 | 4136418 | 4191 |
37 | | 24 | lit | Lithuanian | 100000 | 0.839 | 0.679 | 0.700 | 67868 | 13048 | 4123370 | 32132 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.756 | 0.576 | 0.592 | 57647 | 18557 | 4117861 | 42353 |
39 | | 26 | pol | Polish | 100000 | 0.883 | 0.774 | 0.782 | 77363 | 10245 | 4126173 | 22637 |
40 | | 27 | fas | Persian | 100000 | 0.869 | 0.552 | 0.643 | 55250 | 8360 | 4128058 | 44750 |
41 | | 28 | jpn | Japanese | 100000 | 0.703 | 0.967 | 0.695 | 96684 | 40843 | 4095575 | 3316 |
42 | | 29 | hin | Hindi | 100000 | 0.816 | 0.735 | 0.711 | 73451 | 16519 | 4119899 | 26549 |
43 | | 30 | eng | English | 100000 | 0.691 | 0.689 | 0.597 | 68859 | 30848 | 4105570 | 31141 |
44 | | 31 | sqi | Albanian | 100000 | 0.945 | 0.692 | 0.781 | 69231 | 3995 | 4132423 | 30769 |
45 | | 32 | rus | Russian | 100000 | 0.637 | 0.696 | 0.560 | 69635 | 39608 | 4096810 | 30365 |
46 | | 33 | fra | French | 100000 | 0.863 | 0.681 | 0.718 | 68140 | 10854 | 4125564 | 31860 |
47 | | 34 | lav | Latvian | 100000 | 0.830 | 0.690 | 0.699 | 68956 | 14159 | 4122259 | 31044 |
48 | | 35 | deu | German | 100000 | 0.838 | 0.760 | 0.740 | 75994 | 14742 | 4121676 | 24006 |
49 | | 36 | tur | Turkish | 100000 | 0.883 | 0.743 | 0.766 | 74271 | 9841 | 4126577 | 25729 |
50 | | 37 | ara | Arabic | 100000 | 0.856 | 0.840 | 0.792 | 83981 | 14088 | 4122330 | 16019 |
51 | | 38 | vie | Vietnamese | 100000 | 0.906 | 0.803 | 0.815 | 80261 | 8292 | 4128126 | 19739 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.683 | 0.676 | 0.587 | 67633 | 31322 | 4105096 | 32367 |
53 | | 40 | ben | Bangla | 100000 | 1.000 | 0.955 | 0.977 | 95545 | 0 | 4136418 | 4455 |
54 | | 41 | nld | Dutch | 100000 | 0.841 | 0.731 | 0.728 | 73077 | 13823 | 4122595 | 26923 |
55 | | 42 | urd | Urdu | 46523 | 0.923 | 0.766 | 0.809 | 35638 | 2984 | 4186911 | 10885 |
56 | | 43 | tam | Tamil | 40165 | 0.994 | 0.929 | 0.958 | 37303 | 210 | 4196043 | 2862 |
57 | | 44 | tel | Telugu | 30416 | 0.993 | 0.898 | 0.940 | 27303 | 195 | 4205807 | 3113 |
58 | | 45 | fil | Filipino | 19314 | 0.681 | 0.561 | 0.538 | 10843 | 5069 | 4212035 | 8471 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/pycld2/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for pycld2 on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 87.12%**
5 |
6 | Supported languages (83)
7 |
8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.955 | 0.970 | 0.941 | 1435203 | 68178 | 5913716 | 44530 |
15 | | 2 | rus | Russian | 849653 | 0.998 | 0.830 | 0.905 | 705549 | 1619 | 6610355 | 144104 |
16 | | 3 | ita | Italian | 787053 | 0.999 | 0.689 | 0.816 | 542549 | 441 | 6674133 | 244504 |
17 | | 4 | tur | Turkish | 709573 | 1.000 | 0.923 | 0.960 | 654731 | 110 | 6751944 | 54842 |
18 | | 5 | deu | German | 553727 | 1.000 | 0.954 | 0.976 | 528244 | 180 | 6907720 | 25483 |
19 | | 6 | fra | French | 466192 | 0.999 | 0.845 | 0.915 | 394106 | 372 | 6995063 | 72086 |
20 | | 7 | por | Portuguese | 385737 | 0.982 | 0.865 | 0.912 | 333763 | 6080 | 7069810 | 51974 |
21 | | 8 | spa | Spanish | 338781 | 0.995 | 0.798 | 0.883 | 270207 | 1476 | 7121370 | 68574 |
22 | | 9 | hun | Hungarian | 323048 | 1.000 | 0.935 | 0.966 | 302013 | 131 | 7138448 | 21035 |
23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208635 | 0 | 7252866 | 126 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.841 | 0.914 | 165882 | 4 | 7264397 | 31344 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.992 | 0.791 | 0.877 | 135799 | 1148 | 7288805 | 35875 |
26 | | 13 | nld | Dutch | 144340 | 0.994 | 0.820 | 0.897 | 118356 | 664 | 7316623 | 25984 |
27 | | 14 | fin | Finnish | 128011 | 0.999 | 0.909 | 0.951 | 116372 | 159 | 7333457 | 11639 |
28 | | 15 | pol | Polish | 109662 | 0.999 | 0.926 | 0.961 | 101512 | 68 | 7351897 | 8150 |
29 | | 16 | mkd | Macedonian | 77938 | 0.973 | 0.477 | 0.635 | 37213 | 1038 | 7382651 | 40725 |
30 | | 17 | mar | Marathi | 64126 | 1.000 | 0.967 | 0.983 | 62024 | 24 | 7397477 | 2102 |
31 | | 18 | lit | Lithuanian | 59659 | 0.997 | 0.914 | 0.952 | 54501 | 144 | 7401824 | 5158 |
32 | | 19 | ces | Czech | 57030 | 0.971 | 0.891 | 0.917 | 50816 | 1511 | 7403086 | 6214 |
33 | | 20 | dan | Danish | 49399 | 0.866 | 0.698 | 0.730 | 34494 | 5317 | 7406911 | 14905 |
34 | | 21 | swe | Swedish | 41677 | 0.995 | 0.761 | 0.861 | 31709 | 145 | 7419805 | 9968 |
35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.776 | 0.874 | 27916 | 1 | 7425635 | 8075 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7427542 | 0 |
37 | | 24 | ron | Romanian | 24943 | 0.963 | 0.811 | 0.866 | 20227 | 780 | 7435904 | 4716 |
38 | | 25 | bul | Bulgarian | 24503 | 0.854 | 0.700 | 0.722 | 17140 | 2925 | 7434199 | 7363 |
39 | | 26 | vie | Vietnamese | 19234 | 0.995 | 0.991 | 0.990 | 19062 | 103 | 7442290 | 172 |
40 | | 27 | fil | Filipino | 16649 | 0.994 | 0.789 | 0.878 | 13136 | 75 | 7444903 | 3513 |
41 | | 28 | slk | Slovak | 14660 | 0.704 | 0.788 | 0.643 | 11559 | 4854 | 7442113 | 3101 |
42 | | 29 | ind | Indonesian | 14542 | 0.867 | 0.775 | 0.770 | 11270 | 1727 | 7445358 | 3272 |
43 | | 30 | hin | Hindi | 14230 | 0.918 | 0.973 | 0.907 | 13848 | 1230 | 7446167 | 382 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.566 | 0.796 | 0.528 | 11327 | 8676 | 7438728 | 2896 |
45 | | 32 | cat | Catalan | 7971 | 0.807 | 0.685 | 0.681 | 5464 | 1305 | 7452351 | 2507 |
46 | | 33 | kor | Korean | 7570 | 1.000 | 0.991 | 0.995 | 7500 | 0 | 7454057 | 70 |
47 | | 34 | hrv | Croatian | 5204 | 0.940 | 0.565 | 0.690 | 2942 | 189 | 7456234 | 2262 |
48 | | 35 | ben | Bangla | 4714 | 1.000 | 0.777 | 0.874 | 3662 | 0 | 7456913 | 1052 |
49 | | 36 | afr | Afrikaans | 4031 | 0.446 | 0.826 | 0.426 | 3330 | 4129 | 7453467 | 701 |
50 | | 37 | est | Estonian | 3637 | 0.908 | 0.752 | 0.790 | 2734 | 276 | 7457714 | 903 |
51 | | 38 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7458099 | 0 |
52 | | 39 | sqi | Albanian | 2526 | 0.962 | 0.909 | 0.918 | 2295 | 90 | 7459011 | 231 |
53 | | 40 | urd | Urdu | 2008 | 0.997 | 0.948 | 0.971 | 1903 | 5 | 7459614 | 105 |
54 | | 41 | cym | Welsh | 1344 | 0.968 | 0.845 | 0.890 | 1136 | 37 | 7460246 | 208 |
55 | | 42 | slv | Slovenian | 1093 | 0.802 | 0.550 | 0.604 | 601 | 148 | 7460386 | 492 |
56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 |
57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 |
58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 |
59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 |
60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 |
61 | | 48 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7461459 | 0 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/langid/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langid on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 90.15%**
5 |
6 | Supported languages (97)
7 |
8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.965 | 0.973 | 0.952 | 1439789 | 52447 | 5929447 | 39944 |
15 | | 2 | rus | Russian | 849653 | 0.968 | 0.823 | 0.876 | 699008 | 23301 | 6588673 | 150645 |
16 | | 3 | ita | Italian | 787053 | 0.973 | 0.885 | 0.915 | 696690 | 19395 | 6655179 | 90363 |
17 | | 4 | tur | Turkish | 709573 | 0.997 | 0.919 | 0.955 | 652061 | 1817 | 6750237 | 57512 |
18 | | 5 | deu | German | 553727 | 0.958 | 0.974 | 0.946 | 539237 | 23461 | 6884439 | 14490 |
19 | | 6 | fra | French | 466192 | 0.921 | 0.935 | 0.893 | 436032 | 37228 | 6958207 | 30160 |
20 | | 7 | por | Portuguese | 385737 | 0.936 | 0.822 | 0.850 | 317068 | 21734 | 7054156 | 68669 |
21 | | 8 | spa | Spanish | 338781 | 0.863 | 0.837 | 0.796 | 283448 | 45158 | 7077688 | 55333 |
22 | | 9 | hun | Hungarian | 323048 | 0.980 | 0.929 | 0.944 | 300114 | 6226 | 7132353 | 22934 |
23 | | 10 | jpn | Japanese | 208761 | 0.999 | 1.000 | 0.999 | 208702 | 109 | 7252757 | 59 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.999 | 1.000 | 197110 | 10 | 7264391 | 116 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.751 | 0.774 | 0.677 | 132961 | 44090 | 7245863 | 38713 |
26 | | 13 | nld | Dutch | 144340 | 0.885 | 0.901 | 0.844 | 130115 | 16989 | 7300298 | 14225 |
27 | | 14 | fin | Finnish | 128011 | 0.953 | 0.931 | 0.920 | 119175 | 5915 | 7327701 | 8836 |
28 | | 15 | pol | Polish | 109662 | 0.960 | 0.977 | 0.950 | 107088 | 4405 | 7347560 | 2574 |
29 | | 16 | mkd | Macedonian | 77938 | 0.619 | 0.482 | 0.464 | 37554 | 23137 | 7360552 | 40384 |
30 | | 17 | mar | Marathi | 64126 | 0.988 | 0.700 | 0.815 | 44902 | 563 | 7396938 | 19224 |
31 | | 18 | lit | Lithuanian | 59659 | 0.909 | 0.915 | 0.872 | 54565 | 5495 | 7396473 | 5094 |
32 | | 19 | ces | Czech | 57030 | 0.893 | 0.837 | 0.822 | 47732 | 5701 | 7398896 | 9298 |
33 | | 20 | dan | Danish | 49399 | 0.767 | 0.602 | 0.612 | 29753 | 9059 | 7403169 | 19646 |
34 | | 21 | swe | Swedish | 41677 | 0.808 | 0.802 | 0.735 | 33438 | 7931 | 7412019 | 8239 |
35 | | 22 | ara | Arabic | 35991 | 0.999 | 0.950 | 0.973 | 34184 | 30 | 7425606 | 1807 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7427542 | 0 |
37 | | 24 | ron | Romanian | 24943 | 0.696 | 0.906 | 0.671 | 22605 | 9895 | 7426789 | 2338 |
38 | | 25 | bul | Bulgarian | 24503 | 0.211 | 0.624 | 0.199 | 15278 | 57018 | 7380106 | 9225 |
39 | | 26 | vie | Vietnamese | 19234 | 0.959 | 0.998 | 0.958 | 19192 | 819 | 7441574 | 42 |
40 | | 27 | fil | Filipino | 16649 | 0.907 | 0.792 | 0.810 | 13181 | 1357 | 7443621 | 3468 |
41 | | 28 | slk | Slovak | 14660 | 0.545 | 0.690 | 0.486 | 10119 | 8443 | 7438524 | 4541 |
42 | | 29 | ind | Indonesian | 14542 | 0.615 | 0.731 | 0.553 | 10632 | 6650 | 7440435 | 3910 |
43 | | 30 | hin | Hindi | 14230 | 0.420 | 0.901 | 0.410 | 12825 | 17724 | 7429673 | 1405 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.315 | 0.770 | 0.301 | 10958 | 23798 | 7423606 | 3265 |
45 | | 32 | cat | Catalan | 7971 | 0.218 | 0.720 | 0.209 | 5739 | 20640 | 7433016 | 2232 |
46 | | 33 | kor | Korean | 7570 | 0.990 | 1.000 | 0.990 | 7568 | 77 | 7453980 | 2 |
47 | | 34 | hrv | Croatian | 5204 | 0.482 | 0.650 | 0.427 | 3384 | 3631 | 7452792 | 1820 |
48 | | 35 | ben | Bangla | 4714 | 0.999 | 0.977 | 0.988 | 4605 | 3 | 7456910 | 109 |
49 | | 36 | afr | Afrikaans | 4031 | 0.322 | 0.462 | 0.271 | 1861 | 3919 | 7453677 | 2170 |
50 | | 37 | est | Estonian | 3637 | 0.306 | 0.689 | 0.286 | 2505 | 5688 | 7452302 | 1132 |
51 | | 38 | tha | Thai | 3528 | 0.999 | 1.000 | 0.999 | 3528 | 3 | 7458096 | 0 |
52 | | 39 | sqi | Albanian | 2526 | 0.805 | 0.905 | 0.772 | 2285 | 555 | 7458546 | 241 |
53 | | 40 | urd | Urdu | 2008 | 0.855 | 0.947 | 0.835 | 1901 | 322 | 7459297 | 107 |
54 | | 41 | cym | Welsh | 1344 | 0.330 | 0.682 | 0.306 | 916 | 1860 | 7458423 | 428 |
55 | | 42 | slv | Slovenian | 1093 | 0.079 | 0.681 | 0.078 | 744 | 8640 | 7451894 | 349 |
56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 |
57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 |
58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 |
59 | | 46 | pan | Punjabi | 196 | 0.990 | 1.000 | 0.990 | 196 | 2 | 7461429 | 0 |
60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 |
61 | | 48 | guj | Gujarati | 168 | 0.971 | 1.000 | 0.971 | 168 | 5 | 7461454 | 0 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/gcld3/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for gcld3 on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 86.98%**
5 |
6 | Supported languages (107)
7 |
8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.995 | 0.851 | 0.915 | 1258584 | 6421 | 5975473 | 221149 |
15 | | 2 | rus | Russian | 849653 | 0.975 | 0.858 | 0.903 | 729212 | 18499 | 6593475 | 120441 |
16 | | 3 | ita | Italian | 787053 | 0.976 | 0.821 | 0.882 | 646316 | 16123 | 6658451 | 140737 |
17 | | 4 | tur | Turkish | 709573 | 0.994 | 0.894 | 0.939 | 634612 | 3790 | 6748264 | 74961 |
18 | | 5 | deu | German | 553727 | 0.978 | 0.933 | 0.945 | 516822 | 11764 | 6896136 | 36905 |
19 | | 6 | fra | French | 466192 | 0.975 | 0.860 | 0.903 | 400941 | 10202 | 6985233 | 65251 |
20 | | 7 | por | Portuguese | 385737 | 0.916 | 0.885 | 0.864 | 341427 | 31415 | 7044475 | 44310 |
21 | | 8 | spa | Spanish | 338781 | 0.924 | 0.782 | 0.819 | 264941 | 21705 | 7101141 | 73840 |
22 | | 9 | hun | Hungarian | 323048 | 0.969 | 0.895 | 0.917 | 289189 | 9311 | 7129268 | 33859 |
23 | | 10 | jpn | Japanese | 208761 | 0.983 | 0.999 | 0.983 | 208552 | 3592 | 7249274 | 209 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.991 | 0.995 | 195374 | 48 | 7264353 | 1852 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.802 | 0.889 | 0.764 | 152579 | 37666 | 7252287 | 19095 |
26 | | 13 | nld | Dutch | 144340 | 0.877 | 0.854 | 0.816 | 123199 | 17204 | 7300083 | 21141 |
27 | | 14 | fin | Finnish | 128011 | 0.954 | 0.902 | 0.907 | 115404 | 5554 | 7328062 | 12607 |
28 | | 15 | pol | Polish | 109662 | 0.918 | 0.931 | 0.888 | 102084 | 9094 | 7342871 | 7578 |
29 | | 16 | mkd | Macedonian | 77938 | 0.875 | 0.741 | 0.759 | 57730 | 8238 | 7375451 | 20208 |
30 | | 17 | mar | Marathi | 64126 | 0.989 | 0.911 | 0.944 | 58406 | 632 | 7396869 | 5720 |
31 | | 18 | lit | Lithuanian | 59659 | 0.908 | 0.883 | 0.856 | 52661 | 5352 | 7396616 | 6998 |
32 | | 19 | ces | Czech | 57030 | 0.885 | 0.813 | 0.803 | 46341 | 6049 | 7398548 | 10689 |
33 | | 20 | dan | Danish | 49399 | 0.656 | 0.746 | 0.590 | 36848 | 19288 | 7392940 | 12551 |
34 | | 21 | swe | Swedish | 41677 | 0.794 | 0.861 | 0.746 | 35870 | 9310 | 7410640 | 5807 |
35 | | 22 | ara | Arabic | 35991 | 0.999 | 0.911 | 0.952 | 32774 | 30 | 7425606 | 3217 |
36 | | 23 | ell | Greek | 34071 | 0.806 | 1.000 | 0.806 | 34062 | 8213 | 7419343 | 9 |
37 | | 24 | ron | Romanian | 24943 | 0.623 | 0.808 | 0.580 | 20164 | 12187 | 7424497 | 4779 |
38 | | 25 | bul | Bulgarian | 24503 | 0.327 | 0.844 | 0.318 | 20688 | 42483 | 7394641 | 3815 |
39 | | 26 | vie | Vietnamese | 19234 | 0.890 | 0.981 | 0.882 | 18870 | 2332 | 7440061 | 364 |
40 | | 27 | fil | Filipino | 16649 | 0.746 | 0.780 | 0.675 | 12994 | 4426 | 7440552 | 3655 |
41 | | 28 | slk | Slovak | 14660 | 0.425 | 0.727 | 0.394 | 10664 | 14404 | 7432563 | 3996 |
42 | | 29 | ind | Indonesian | 14542 | 0.688 | 0.640 | 0.577 | 9304 | 4210 | 7442875 | 5238 |
43 | | 30 | hin | Hindi | 14230 | 0.527 | 0.880 | 0.509 | 12527 | 11232 | 7436165 | 1703 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.288 | 0.829 | 0.280 | 11791 | 29106 | 7418298 | 2432 |
45 | | 32 | cat | Catalan | 7971 | 0.181 | 0.818 | 0.177 | 6520 | 29565 | 7424091 | 1451 |
46 | | 33 | kor | Korean | 7570 | 0.917 | 0.996 | 0.915 | 7536 | 685 | 7453372 | 34 |
47 | | 34 | hrv | Croatian | 5204 | 0.270 | 0.447 | 0.231 | 2324 | 6278 | 7450145 | 2880 |
48 | | 35 | ben | Bangla | 4714 | 1.000 | 0.998 | 0.999 | 4704 | 0 | 7456913 | 10 |
49 | | 36 | afr | Afrikaans | 4031 | 0.147 | 0.865 | 0.145 | 3485 | 20233 | 7437363 | 546 |
50 | | 37 | est | Estonian | 3637 | 0.202 | 0.796 | 0.197 | 2894 | 11430 | 7446560 | 743 |
51 | | 38 | tha | Thai | 3528 | 0.995 | 0.998 | 0.994 | 3522 | 18 | 7458081 | 6 |
52 | | 39 | sqi | Albanian | 2526 | 0.332 | 0.865 | 0.323 | 2184 | 4404 | 7454697 | 342 |
53 | | 40 | urd | Urdu | 2008 | 0.882 | 0.961 | 0.867 | 1930 | 257 | 7459362 | 78 |
54 | | 41 | cym | Welsh | 1344 | 0.105 | 0.824 | 0.104 | 1108 | 9469 | 7450814 | 236 |
55 | | 42 | slv | Slovenian | 1093 | 0.060 | 0.724 | 0.059 | 791 | 12479 | 7448055 | 302 |
56 | | 43 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7460800 | 0 |
57 | | 44 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7461293 | 0 |
58 | | 45 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7461373 | 0 |
59 | | 46 | pan | Punjabi | 196 | 1.000 | 0.995 | 0.997 | 195 | 0 | 7461431 | 1 |
60 | | 47 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7461451 | 0 |
61 | | 48 | guj | Gujarati | 168 | 1.000 | 0.982 | 0.991 | 165 | 0 | 7461459 | 3 |
--------------------------------------------------------------------------------
/analyze.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pathlib
3 | import argparse
4 | import pandas as pd
5 | import numpy as np
6 |
7 | from jinja2 import Environment, select_autoescape, FileSystemLoader
8 | from langcodes import Language
9 | from typing import Dict, Any
10 |
11 | import datasets
12 | import analyze_dataset
13 | from benchmarks import BENCHMARKS
14 | from langcodes import Language
15 |
16 |
# Shared Jinja2 environment used to render the Markdown reports.
# Templates are looked up in "./templates", i.e. relative to the current
# working directory.
jinja_env = Environment(
    loader=FileSystemLoader("./templates"),
    autoescape=select_autoescape(),
)
18 |
19 |
def get_alpha3(l):
    """Map a language tag to its alpha-3 code, falling back to ``'unk'``.

    Args:
        l: Language tag as emitted by a detector (``'__label__'`` prefixes
            are expected to be stripped by the caller).

    Returns:
        The result of ``Language.get(l).to_alpha3()``, or the string
        ``'unk'`` when that lookup raises.
    """
    try:
        return Language.get(l).to_alpha3()
    except Exception:
        # Best-effort mapping: any tag that cannot be resolved is reported as
        # unknown. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating, so a long run can still
        # be interrupted.
        return 'unk'
25 |
26 |
def read_results(dataset_name, benchmark_name='fasttext', lang_dtype='str'):
    """Load a benchmark's raw predictions and add an alpha-3 language column.

    Reads ``results/<dataset_name>/<benchmark_name>/results.csv`` with the
    first CSV column as the index and the remaining columns named
    ``detected_lang`` and ``detected_prob``.

    Args:
        dataset_name: Name of the dataset directory under ``results``.
        benchmark_name: Name of the benchmark directory under the dataset.
        lang_dtype: dtype the ``detected_lang_alpha3`` column is cast to.

    Returns:
        The DataFrame with ``detected_lang`` converted to a categorical of
        strings and a new ``detected_lang_alpha3`` column.
    """
    csv_path = os.path.join('results', dataset_name, benchmark_name, 'results.csv')
    frame = pd.read_csv(
        csv_path,
        sep=',',
        index_col=0,
        names=['detected_lang', 'detected_prob'],
    )

    def _label_to_alpha3(raw_label):
        # Drop the '__label__' marker before resolving the language code.
        return get_alpha3(raw_label.replace('__label__', ''))

    # langdetect/pycld2 yield NaN for a small number of rows; casting to str
    # first turns those into ordinary string values.
    detected = frame['detected_lang'].astype(str).astype("category")
    frame['detected_lang'] = detected
    frame['detected_lang_alpha3'] = detected.apply(_label_to_alpha3).astype(lang_dtype)
    return frame
34 |
35 |
def accuracy(results_df):
    """Return the fraction of rows whose detected language equals the expected one.

    Args:
        results_df: DataFrame with ``alpha3`` (expected) and
            ``detected_lang_alpha3`` (detected) columns.

    Returns:
        Mean of the per-row 0/1 match indicator.
    """
    expected = results_df['alpha3']
    detected = results_df['detected_lang_alpha3']
    is_match = expected == detected
    return is_match.astype(int).mean()
39 |
40 |
def get_stats_per_language(results):
    """Compute one-vs-rest classification metrics for each expected language.

    Args:
        results: DataFrame with an ``alpha3`` column (expected language) and a
            ``detected_lang_alpha3`` column (detected language).

    Returns:
        DataFrame with one row per distinct value of ``results['alpha3']``,
        sorted by ``sentences_count`` descending and indexed from 1, with the
        columns ``language_alpha3``, ``language``, ``sentences_count``,
        ``precision``, ``recall``, ``f1``, ``tp``, ``fp``, ``tn``, ``fn``.
        A ratio whose denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # 0/0 (e.g. precision of a language that was never predicted) is NaN.
        return numerator / denominator if denominator else float('nan')

    expected = results['alpha3']
    detected = results['detected_lang_alpha3']

    records = []
    for lang in expected.unique().tolist():
        # Evaluate each comparison once per language; the four confusion
        # matrix cells are combinations of these two masks.
        is_expected = expected == lang
        is_detected = detected == lang

        tp = int((is_expected & is_detected).sum())
        fp = int((~is_expected & is_detected).sum())
        tn = int((~is_expected & ~is_detected).sum())
        fn = int((is_expected & ~is_detected).sum())

        records.append(dict(
            language_alpha3=lang,
            sentences_count=tp + fn,
            precision=_ratio(tp, tp + fp),
            recall=_ratio(tp, tp + fn),
            # F1 = 2*tp / (2*tp + fp + fn), the harmonic mean of precision
            # and recall. Both error counts carry the same weight.
            f1=_ratio(2 * tp, 2 * tp + fp + fn),
            tp=tp,
            fp=fp,
            tn=tn,
            fn=fn,
        ))

    stats_per_language = pd.DataFrame.from_records(records)

    # assign the language
    stats_per_language['language'] = stats_per_language['language_alpha3'].apply(analyze_dataset.get_language_name)

    # sort by sentences count and set the index to be the 1-based row number
    stats_per_language = stats_per_language.sort_values(['sentences_count'], ascending=False)
    stats_per_language = stats_per_language.reset_index(drop=True)
    stats_per_language.index += 1

    return stats_per_language[['language_alpha3', 'language', 'sentences_count', 'precision', 'recall', 'f1', 'tp', 'fp', 'tn', 'fn']]
76 |
77 |
def md_link(text, url):
    """Format ``text`` and ``url`` as a Markdown inline link ``[text](url)``."""
    return "[{}]({})".format(text, url)
80 |
81 |
def create_dataset_results_table(dataset_name, metrics_per_benchmark):
    """Build the per-dataset summary table linking to each benchmark report.

    Args:
        dataset_name: Dataset directory name used in the report URLs.
        metrics_per_benchmark: Mapping of benchmark name to a dict with the
            keys ``supported_languages``, ``supported_dataset`` and
            ``agg_accuracy``. The mapping and its rows are not modified.

    Returns:
        DataFrame with the columns ``Library``, ``Supported languages``,
        ``# sentences supported``, ``Aggregated accuracy`` and
        ``Per language metrics``; one row per benchmark. An empty mapping
        yields an empty frame that still has these columns.
    """
    link_base = "https://github.com/modelpredict/language-identification-survey/blob/main/results/"
    columns = ['Library', 'Supported languages', '# sentences supported', 'Aggregated accuracy', 'Per language metrics']

    records = []
    for benchmark_name, row in metrics_per_benchmark.items():
        # URLs always use forward slashes, so compose them as strings rather
        # than with os.path.join (which uses the platform path separator).
        report_url = f"{link_base}{dataset_name}/{benchmark_name}/classification_performance.md"

        # Key each value by its final column name so the table does not
        # depend on the key order of the incoming dicts.
        records.append({
            'Library': benchmark_name,
            'Supported languages': md_link(row['supported_languages'], f"{report_url}#supported-languages"),
            '# sentences supported': row['supported_dataset'],
            'Aggregated accuracy': md_link(row['agg_accuracy'], report_url),
            'Per language metrics': md_link("See metrics", f"{report_url}#metrics-per-language"),
        })

    return pd.DataFrame(records, columns=columns)
97 |
98 |
def write_md(template_name: str, template_ctx: Dict[str, Any], path: str):
    """Render the ``<template_name>.md`` template and write it to ``path``.

    Args:
        template_name: Template name without the ``.md`` extension; it is
            looked up through the module-level ``jinja_env``.
        template_ctx: Variables passed to the template's ``render`` call.
        path: Destination file; it is created or overwritten.
    """
    tmpl = jinja_env.get_template(f'{template_name}.md')
    rendered = tmpl.render(template_ctx)
    # The reports contain non-ASCII language names (e.g. "Norwegian Bokmål"),
    # so write UTF-8 explicitly instead of the platform default encoding.
    with open(path, 'w', encoding='utf-8') as fd:
        fd.write(rendered)
104 |
105 |
if __name__ == "__main__":
    def _str2bool(value):
        """Parse an explicit boolean flag value given on the command line."""
        # argparse's ``type=bool`` calls ``bool(str)``, which is True for any
        # non-empty string, including "false". Parse the text instead.
        normalized = value.strip().lower()
        if normalized in ('1', 'true', 't', 'yes', 'y', 'on'):
            return True
        if normalized in ('', '0', 'false', 'f', 'no', 'n', 'off'):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(description='Write aggregated results files.')
    parser.add_argument('--dataset', '-d', type=str, choices=datasets.names(), required=True)
    parser.add_argument('--timings_prefix', '-t', type=str, default='', help='Prefix of the times.npy file')
    # Both flags can be passed bare (``--timings``) or with a value
    # (``--timings false``).
    parser.add_argument("--timings", type=_str2bool, nargs='?', const=True, default=False, help='Analyze timings')
    parser.add_argument("--correctness", type=_str2bool, nargs='?', const=True, default=False, help='Analyze correctness')
    args = parser.parse_args()

    dataset_name = args.dataset
    timings_prefix = args.timings_prefix
    dataset = datasets.get(dataset_name)

    # benchmark name -> summary row for the aggregated results table
    metrics_per_benchmark = {}

    for benchmark_name in BENCHMARKS.keys():
        benchmark_results_path = pathlib.Path('results') / dataset_name / benchmark_name
        print()
        if not benchmark_results_path.exists():
            print(f"Skipping {benchmark_name}. Results files not found on {benchmark_results_path}")
            continue

        if args.correctness:
            print(f"Analyzing {benchmark_name} results on {dataset_name}...")

            supported_langs = BENCHMARKS[benchmark_name]['supported_languages_alpha3']
            supported_languages = [Language.get(lang) for lang in supported_langs]
            supported_languages_list_str = ", ".join(f"{lang.to_alpha3()} ({lang.display_name()})" for lang in supported_languages)

            print("Reading results...")
            results = read_results(dataset_name, benchmark_name, lang_dtype=dataset.dtypes['alpha3'])
            dataset_subset = datasets.get_supported_dataset_subset(dataset, supported_languages=supported_langs)
            print("Merging with dataset...")
            joined_results = pd.merge(dataset_subset, results, left_index=True, right_index=True, how="left", validate="one_to_one")

            print("Calculating accuracy...")
            aggregated_accuracy = accuracy(joined_results)
            print("Calculating metrics per language...")
            stats_per_language = get_stats_per_language(joined_results)
            dataset_supported_pct = "{:.2f}%".format(100. * len(dataset_subset) / len(dataset))
            accuracy_str = "{:.2f}%".format(100. * aggregated_accuracy)

            metrics_per_benchmark[benchmark_name] = {
                'supported_languages': len(supported_langs),
                # Count of sentences in supported languages, i.e. the same
                # number the per-benchmark report prints next to this
                # percentage (not the size of the whole dataset).
                'supported_dataset': f"{len(dataset_subset):,} ({dataset_supported_pct})",
                'agg_accuracy': accuracy_str,
            }

            # assemble the md file and write it
            template_ctx = dict(
                benchmark_name=benchmark_name,
                dataset_name=dataset_name,
                dataset_len=len(dataset_subset),
                dataset_supported_pct=dataset_supported_pct,
                supported_languages_count=len(supported_languages),
                supported_languages_list_str=supported_languages_list_str,
                accuracy=accuracy_str,
                stats_per_language=stats_per_language.to_markdown(floatfmt=".3f"),
            )
            results_path = os.path.join('results', dataset_name, benchmark_name, 'classification_performance.md')
            print(f"Dumping classification performance analysis to {results_path}")
            write_md('classification_performance', template_ctx=template_ctx, path=results_path)

        if args.timings:
            times = np.load(os.path.join('results', dataset_name, benchmark_name, f'{timings_prefix}times.npy'))
            # NOTE(review): the 10**6 / 10**9 scaling suggests ``times`` holds
            # per-call durations in nanoseconds (reported as milliseconds and
            # calls per second) - confirm against the benchmark runner.
            p50, p90, p95, p99 = np.quantile(times, [0.5, 0.9, 0.95, 0.99])
            template_ctx = dict(
                benchmark_name=benchmark_name,
                dataset_name=dataset_name,
                latency_avg=np.mean(times) / 10**6,
                latency_std=np.std(times) / 10**6,
                latency_p50=p50 / 10**6,
                latency_p90=p90 / 10**6,
                latency_p95=p95 / 10**6,
                latency_p99=p99 / 10**6,
                throughput=10**9/np.mean(times),
            )

            results_path = os.path.join('results', dataset_name, benchmark_name, f'{timings_prefix}speed_performance.md')
            print(f"Dumping latency/throughput analysis to {results_path}")
            write_md('speed_performance', template_ctx=template_ctx, path=results_path)

    if metrics_per_benchmark:
        print(f"Creating aggregated table for {dataset_name}")
        agg_table_path = os.path.join('results', dataset_name, 'results.md')
        df = create_dataset_results_table(dataset_name, metrics_per_benchmark)
        template_ctx = dict(
            dataset_name=dataset_name,
            results_table=df.to_markdown(floatfmt=".3f", index=False)
        )
        write_md('dataset_results', template_ctx=template_ctx, path=agg_table_path)
        print(f"Written in {agg_table_path}")
    else:
        # Summary rows are only collected by the correctness analysis; with
        # none available there is nothing to aggregate, so leave any existing
        # results.md untouched.
        print(f"No correctness metrics collected for {dataset_name}; skipping aggregated table")
194 |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/fasttext/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for fasttext on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 80.16%**
5 |
6 | Supported languages (176)
7 |
8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 0.993 | 0.993 | 0.990 | 99294 | 693 | 4135725 | 706 |
15 | | 2 | swe | Swedish | 100000 | 0.861 | 0.820 | 0.787 | 81969 | 13210 | 4123208 | 18031 |
16 | | 3 | mkd | Macedonian | 100000 | 0.910 | 0.848 | 0.841 | 84828 | 8412 | 4128006 | 15172 |
17 | | 4 | tha | Thai | 100000 | 0.998 | 0.958 | 0.977 | 95843 | 150 | 4136268 | 4157 |
18 | | 5 | cat | Catalan | 100000 | 0.950 | 0.598 | 0.720 | 59808 | 3148 | 4133270 | 40192 |
19 | | 6 | bul | Bulgarian | 100000 | 0.947 | 0.787 | 0.839 | 78701 | 4413 | 4132005 | 21299 |
20 | | 7 | fin | Finnish | 100000 | 0.865 | 0.895 | 0.824 | 89517 | 13918 | 4122500 | 10483 |
21 | | 8 | dan | Danish | 100000 | 0.619 | 0.747 | 0.560 | 74699 | 45945 | 4090473 | 25301 |
22 | | 9 | hun | Hungarian | 100000 | 0.844 | 0.932 | 0.818 | 93172 | 17283 | 4119135 | 6828 |
23 | | 10 | kor | Korean | 100000 | 0.987 | 0.930 | 0.952 | 92967 | 1182 | 4135236 | 7033 |
24 | | 11 | spa | Spanish | 100000 | 0.706 | 0.936 | 0.689 | 93648 | 39086 | 4097332 | 6352 |
25 | | 12 | zho | Chinese | 100000 | 0.928 | 0.850 | 0.858 | 85002 | 6573 | 4129845 | 14998 |
26 | | 13 | slk | Slovak | 100000 | 0.918 | 0.512 | 0.639 | 51250 | 4573 | 4131845 | 48750 |
27 | | 14 | ron | Romanian | 100000 | 0.961 | 0.723 | 0.812 | 72276 | 2908 | 4133510 | 27724 |
28 | | 15 | ind | Indonesian | 100000 | 0.920 | 0.725 | 0.783 | 72503 | 6296 | 4130122 | 27497 |
29 | | 16 | est | Estonian | 100000 | 0.941 | 0.678 | 0.769 | 67801 | 4256 | 4132162 | 32199 |
30 | | 17 | por | Portuguese | 100000 | 0.735 | 0.883 | 0.701 | 88308 | 31776 | 4104642 | 11692 |
31 | | 18 | hrv | Croatian | 100000 | 0.732 | 0.305 | 0.399 | 30462 | 11167 | 4125251 | 69538 |
32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.980 | 0.990 | 98046 | 41 | 4136377 | 1954 |
33 | | 20 | ita | Italian | 100000 | 0.639 | 0.925 | 0.622 | 92461 | 52332 | 4084086 | 7539 |
34 | | 21 | slv | Slovenian | 100000 | 0.858 | 0.386 | 0.510 | 38621 | 6372 | 4130046 | 61379 |
35 | | 22 | ces | Czech | 100000 | 0.781 | 0.793 | 0.709 | 79295 | 22186 | 4114232 | 20705 |
36 | | 23 | mal | Malayalam | 100000 | 0.999 | 0.967 | 0.982 | 96741 | 99 | 4136319 | 3259 |
37 | | 24 | lit | Lithuanian | 100000 | 0.909 | 0.765 | 0.797 | 76457 | 7678 | 4128740 | 23543 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.932 | 0.676 | 0.762 | 67632 | 4955 | 4131463 | 32368 |
39 | | 26 | pol | Polish | 100000 | 0.837 | 0.894 | 0.798 | 89392 | 17360 | 4119058 | 10608 |
40 | | 27 | fas | Persian | 100000 | 0.948 | 0.577 | 0.703 | 57657 | 3136 | 4133282 | 42343 |
41 | | 28 | jpn | Japanese | 100000 | 0.942 | 0.934 | 0.912 | 93404 | 5734 | 4130684 | 6596 |
42 | | 29 | hin | Hindi | 100000 | 0.998 | 0.850 | 0.917 | 84951 | 138 | 4136280 | 15049 |
43 | | 30 | eng | English | 100000 | 0.433 | 0.939 | 0.427 | 93908 | 123135 | 4013283 | 6092 |
44 | | 31 | sqi | Albanian | 100000 | 0.987 | 0.700 | 0.814 | 69957 | 933 | 4135485 | 30043 |
45 | | 32 | rus | Russian | 100000 | 0.679 | 0.962 | 0.670 | 96174 | 45549 | 4090869 | 3826 |
46 | | 33 | fra | French | 100000 | 0.709 | 0.919 | 0.687 | 91892 | 37802 | 4098616 | 8108 |
47 | | 34 | lav | Latvian | 100000 | 0.989 | 0.668 | 0.794 | 66756 | 724 | 4135694 | 33244 |
48 | | 35 | deu | German | 100000 | 0.813 | 0.891 | 0.775 | 89068 | 20443 | 4115975 | 10932 |
49 | | 36 | tur | Turkish | 100000 | 0.881 | 0.933 | 0.854 | 93281 | 12574 | 4123844 | 6719 |
50 | | 37 | ara | Arabic | 100000 | 0.783 | 0.924 | 0.759 | 92437 | 25579 | 4110839 | 7563 |
51 | | 38 | vie | Vietnamese | 100000 | 0.991 | 0.870 | 0.922 | 86978 | 798 | 4135620 | 13022 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.770 | 0.342 | 0.443 | 34206 | 10192 | 4126226 | 65794 |
53 | | 40 | ben | Bangla | 100000 | 0.999 | 0.947 | 0.971 | 94676 | 141 | 4136277 | 5324 |
54 | | 41 | nld | Dutch | 100000 | 0.877 | 0.855 | 0.816 | 85450 | 12031 | 4124387 | 14550 |
55 | | 42 | urd | Urdu | 46523 | 0.970 | 0.761 | 0.842 | 35426 | 1100 | 4188795 | 11097 |
56 | | 43 | tam | Tamil | 40165 | 0.995 | 0.944 | 0.967 | 37924 | 177 | 4196076 | 2241 |
57 | | 44 | tel | Telugu | 30416 | 0.994 | 0.931 | 0.959 | 28330 | 167 | 4205835 | 2086 |
58 | | 45 | fil | Filipino | 19314 | 0.838 | 0.651 | 0.684 | 12579 | 2435 | 4214669 | 6735 |
--------------------------------------------------------------------------------
/results/open-subtitles-v2018-100k-per-lang/fasttext-compressed/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for fasttext-compressed on open-subtitles-v2018-100k-per-lang
2 |
3 | - Dataset coverage (sentences in supported languages): 4236418 (100.00%)
4 | - **Aggregated accuracy: 75.21%**
5 |
6 | Supported languages (176)
7 |
8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|------:|-------:|--------:|------:|
14 | | 1 | ell | Greek | 100000 | 0.997 | 0.982 | 0.988 | 98172 | 253 | 4136165 | 1828 |
15 | | 2 | swe | Swedish | 100000 | 0.831 | 0.793 | 0.749 | 79263 | 16147 | 4120271 | 20737 |
16 | | 3 | mkd | Macedonian | 100000 | 0.870 | 0.761 | 0.765 | 76086 | 11387 | 4125031 | 23914 |
17 | | 4 | tha | Thai | 100000 | 0.999 | 0.957 | 0.977 | 95713 | 124 | 4136294 | 4287 |
18 | | 5 | cat | Catalan | 100000 | 0.918 | 0.508 | 0.635 | 50786 | 4527 | 4131891 | 49214 |
19 | | 6 | bul | Bulgarian | 100000 | 0.951 | 0.654 | 0.760 | 65408 | 3357 | 4133061 | 34592 |
20 | | 7 | fin | Finnish | 100000 | 0.818 | 0.865 | 0.768 | 86466 | 19296 | 4117122 | 13534 |
21 | | 8 | dan | Danish | 100000 | 0.601 | 0.700 | 0.533 | 69996 | 46449 | 4089969 | 30004 |
22 | | 9 | hun | Hungarian | 100000 | 0.831 | 0.880 | 0.787 | 88007 | 17864 | 4118554 | 11993 |
23 | | 10 | kor | Korean | 100000 | 0.995 | 0.891 | 0.938 | 89072 | 435 | 4135983 | 10928 |
24 | | 11 | spa | Spanish | 100000 | 0.625 | 0.908 | 0.606 | 90765 | 54449 | 4081969 | 9235 |
25 | | 12 | zho | Chinese | 100000 | 0.934 | 0.757 | 0.812 | 75651 | 5310 | 4131108 | 24349 |
26 | | 13 | slk | Slovak | 100000 | 0.902 | 0.400 | 0.538 | 39987 | 4356 | 4132062 | 60013 |
27 | | 14 | ron | Romanian | 100000 | 0.971 | 0.631 | 0.756 | 63068 | 1876 | 4134542 | 36932 |
28 | | 15 | ind | Indonesian | 100000 | 0.949 | 0.671 | 0.770 | 67114 | 3624 | 4132794 | 32886 |
29 | | 16 | est | Estonian | 100000 | 0.952 | 0.551 | 0.686 | 55128 | 2808 | 4133610 | 44872 |
30 | | 17 | por | Portuguese | 100000 | 0.790 | 0.810 | 0.722 | 80970 | 21587 | 4114831 | 19030 |
31 | | 18 | hrv | Croatian | 100000 | 0.728 | 0.264 | 0.361 | 26401 | 9843 | 4126575 | 73599 |
32 | | 19 | heb | Hebrew | 100000 | 1.000 | 0.969 | 0.984 | 96866 | 44 | 4136374 | 3134 |
33 | | 20 | ita | Italian | 100000 | 0.514 | 0.890 | 0.498 | 88985 | 84250 | 4052168 | 11015 |
34 | | 21 | slv | Slovenian | 100000 | 0.749 | 0.334 | 0.429 | 33416 | 11213 | 4125205 | 66584 |
35 | | 22 | ces | Czech | 100000 | 0.713 | 0.737 | 0.632 | 73683 | 29706 | 4106712 | 26317 |
36 | | 23 | mal | Malayalam | 100000 | 0.999 | 0.964 | 0.980 | 96373 | 125 | 4136293 | 3627 |
37 | | 24 | lit | Lithuanian | 100000 | 0.899 | 0.686 | 0.746 | 68646 | 7737 | 4128681 | 31354 |
38 | | 25 | ukr | Ukrainian | 100000 | 0.899 | 0.591 | 0.686 | 59075 | 6613 | 4129805 | 40925 |
39 | | 26 | pol | Polish | 100000 | 0.826 | 0.848 | 0.769 | 84830 | 17928 | 4118490 | 15170 |
40 | | 27 | fas | Persian | 100000 | 0.881 | 0.442 | 0.566 | 44234 | 5991 | 4130427 | 55766 |
41 | | 28 | jpn | Japanese | 100000 | 0.926 | 0.911 | 0.886 | 91080 | 7300 | 4129118 | 8920 |
42 | | 29 | hin | Hindi | 100000 | 0.998 | 0.797 | 0.886 | 79742 | 176 | 4136242 | 20258 |
43 | | 30 | eng | English | 100000 | 0.272 | 0.931 | 0.269 | 93052 | 249357 | 3887061 | 6948 |
44 | | 31 | sqi | Albanian | 100000 | 0.994 | 0.679 | 0.805 | 67935 | 395 | 4136023 | 32065 |
45 | | 32 | rus | Russian | 100000 | 0.563 | 0.956 | 0.556 | 95628 | 74265 | 4062153 | 4372 |
46 | | 33 | fra | French | 100000 | 0.727 | 0.864 | 0.688 | 86424 | 32405 | 4104013 | 13576 |
47 | | 34 | lav | Latvian | 100000 | 0.985 | 0.621 | 0.758 | 62101 | 931 | 4135487 | 37899 |
48 | | 35 | deu | German | 100000 | 0.745 | 0.849 | 0.699 | 84936 | 29124 | 4107294 | 15064 |
49 | | 36 | tur | Turkish | 100000 | 0.863 | 0.896 | 0.822 | 89562 | 14158 | 4122260 | 10438 |
50 | | 37 | ara | Arabic | 100000 | 0.661 | 0.916 | 0.642 | 91590 | 46947 | 4089471 | 8410 |
51 | | 38 | vie | Vietnamese | 100000 | 0.986 | 0.842 | 0.903 | 84224 | 1188 | 4135230 | 15776 |
52 | | 39 | nob | Norwegian Bokmål | 100000 | 0.755 | 0.303 | 0.404 | 30259 | 9843 | 4126575 | 69741 |
53 | | 40 | ben | Bangla | 100000 | 0.999 | 0.938 | 0.967 | 93803 | 93 | 4136325 | 6197 |
54 | | 41 | nld | Dutch | 100000 | 0.897 | 0.798 | 0.806 | 79835 | 9125 | 4127293 | 20165 |
55 | | 42 | urd | Urdu | 46523 | 0.984 | 0.751 | 0.846 | 34930 | 564 | 4189331 | 11593 |
56 | | 43 | tam | Tamil | 40165 | 0.998 | 0.943 | 0.968 | 37857 | 84 | 4196169 | 2308 |
57 | | 44 | tel | Telugu | 30416 | 0.998 | 0.927 | 0.961 | 28199 | 51 | 4205951 | 2217 |
58 | | 45 | fil | Filipino | 19314 | 0.836 | 0.566 | 0.633 | 10925 | 2136 | 4214968 | 8389 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/fasttext/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for fasttext on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 98.94%**
5 |
6 | Supported languages (176)
7 |
8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|-----:|--------:|-----:|
14 | | 1 | eng | English | 1479733 | 0.994 | 0.999 | 0.993 | 1478395 | 9216 | 5972678 | 1338 |
15 | | 2 | rus | Russian | 849653 | 0.995 | 0.998 | 0.994 | 848237 | 4129 | 6607845 | 1416 |
16 | | 3 | ita | Italian | 787053 | 0.994 | 0.996 | 0.992 | 783712 | 4984 | 6669590 | 3341 |
17 | | 4 | tur | Turkish | 709573 | 0.999 | 0.998 | 0.998 | 707878 | 812 | 6751242 | 1695 |
18 | | 5 | deu | German | 553727 | 0.996 | 0.997 | 0.994 | 552153 | 2391 | 6905509 | 1574 |
19 | | 6 | fra | French | 466192 | 0.994 | 0.994 | 0.991 | 463322 | 2814 | 6992621 | 2870 |
20 | | 7 | por | Portuguese | 385737 | 0.991 | 0.983 | 0.982 | 379214 | 3624 | 7072266 | 6523 |
21 | | 8 | spa | Spanish | 338781 | 0.979 | 0.987 | 0.972 | 334473 | 7311 | 7115535 | 4308 |
22 | | 9 | hun | Hungarian | 323048 | 0.996 | 0.993 | 0.993 | 320862 | 1315 | 7137264 | 2186 |
23 | | 10 | jpn | Japanese | 208761 | 1.000 | 1.000 | 1.000 | 208682 | 28 | 7252838 | 79 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 1.000 | 1.000 | 197205 | 0 | 7264401 | 21 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.996 | 0.978 | 0.985 | 167950 | 738 | 7289215 | 3724 |
26 | | 13 | nld | Dutch | 144340 | 0.988 | 0.955 | 0.966 | 137902 | 1685 | 7315602 | 6438 |
27 | | 14 | fin | Finnish | 128011 | 0.994 | 0.981 | 0.984 | 125521 | 797 | 7332819 | 2490 |
28 | | 15 | pol | Polish | 109662 | 0.988 | 0.992 | 0.984 | 108795 | 1366 | 7350599 | 867 |
29 | | 16 | mkd | Macedonian | 77938 | 0.982 | 0.986 | 0.976 | 76873 | 1395 | 7382294 | 1065 |
30 | | 17 | mar | Marathi | 64126 | 0.999 | 0.998 | 0.998 | 64016 | 96 | 7397405 | 110 |
31 | | 18 | lit | Lithuanian | 59659 | 0.996 | 0.960 | 0.976 | 57282 | 212 | 7401756 | 2377 |
32 | | 19 | ces | Czech | 57030 | 0.945 | 0.951 | 0.923 | 54248 | 3141 | 7401456 | 2782 |
33 | | 20 | dan | Danish | 49399 | 0.856 | 0.907 | 0.820 | 44828 | 7523 | 7404705 | 4571 |
34 | | 21 | swe | Swedish | 41677 | 0.953 | 0.948 | 0.929 | 39504 | 1944 | 7418006 | 2173 |
35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.993 | 0.997 | 35750 | 5 | 7425631 | 241 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34069 | 10 | 7427546 | 2 |
37 | | 24 | ron | Romanian | 24943 | 0.983 | 0.937 | 0.951 | 23375 | 416 | 7436268 | 1568 |
38 | | 25 | bul | Bulgarian | 24503 | 0.966 | 0.944 | 0.939 | 23127 | 802 | 7436322 | 1376 |
39 | | 26 | vie | Vietnamese | 19234 | 0.993 | 0.997 | 0.992 | 19178 | 132 | 7442261 | 56 |
40 | | 27 | fil | Filipino | 16649 | 0.992 | 0.938 | 0.961 | 15621 | 122 | 7444856 | 1028 |
41 | | 28 | slk | Slovak | 14660 | 0.934 | 0.627 | 0.731 | 9190 | 651 | 7446316 | 5470 |
42 | | 29 | ind | Indonesian | 14542 | 0.963 | 0.921 | 0.925 | 13398 | 510 | 7446575 | 1144 |
43 | | 30 | hin | Hindi | 14230 | 0.994 | 0.989 | 0.989 | 14076 | 82 | 7447315 | 154 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.676 | 0.422 | 0.462 | 5997 | 2872 | 7444532 | 8226 |
45 | | 32 | cat | Catalan | 7971 | 0.914 | 0.800 | 0.820 | 6377 | 601 | 7453055 | 1594 |
46 | | 33 | kor | Korean | 7570 | 0.997 | 0.994 | 0.994 | 7525 | 20 | 7454037 | 45 |
47 | | 34 | hrv | Croatian | 5204 | 0.786 | 0.451 | 0.532 | 2349 | 638 | 7455785 | 2855 |
48 | | 35 | ben | Bangla | 4714 | 0.998 | 0.999 | 0.998 | 4709 | 8 | 7456905 | 5 |
49 | | 36 | afr | Afrikaans | 4031 | 0.859 | 0.700 | 0.725 | 2823 | 465 | 7457131 | 1208 |
50 | | 37 | est | Estonian | 3637 | 0.842 | 0.818 | 0.770 | 2975 | 559 | 7457431 | 662 |
51 | | 38 | tha | Thai | 3528 | 0.999 | 1.000 | 0.999 | 3527 | 2 | 7458097 | 1 |
52 | | 39 | sqi | Albanian | 2526 | 0.967 | 0.865 | 0.899 | 2184 | 75 | 7459026 | 342 |
53 | | 40 | urd | Urdu | 2008 | 0.974 | 0.982 | 0.965 | 1972 | 53 | 7459566 | 36 |
54 | | 41 | cym | Welsh | 1344 | 0.965 | 0.721 | 0.813 | 969 | 35 | 7460248 | 375 |
55 | | 42 | slv | Slovenian | 1093 | 0.436 | 0.510 | 0.360 | 557 | 721 | 7459813 | 536 |
56 | | 43 | mal | Malayalam | 827 | 0.992 | 1.000 | 0.992 | 827 | 7 | 7460793 | 0 |
57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 |
58 | | 45 | tel | Telugu | 254 | 0.981 | 1.000 | 0.981 | 254 | 5 | 7461368 | 0 |
59 | | 46 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7461431 | 0 |
60 | | 47 | kan | Kannada | 176 | 0.926 | 1.000 | 0.926 | 176 | 14 | 7461437 | 0 |
61 | | 48 | guj | Gujarati | 168 | 0.988 | 1.000 | 0.988 | 168 | 2 | 7461457 | 0 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05-common-48/fasttext-compressed/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for fasttext-compressed on tatoeba-sentences-2021-06-05-common-48
2 |
3 | - Dataset coverage (sentences in supported languages): 7461627 (100.00%)
4 | - **Aggregated accuracy: 97.90%**
5 |
6 | Supported languages (176)
7 |
8 | afr (Afrikaans), als (Tosk Albanian), amh (Amharic), arg (Aragonese), ara (Arabic), arz (Egyptian Arabic), asm (Assamese), ast (Asturian), ava (Avaric), aze (Azerbaijani), azb (South Azerbaijani), bak (Bashkir), bar (Bavarian), bcl (Central Bikol), bel (Belarusian), bul (Bulgarian), bih (Bihari languages), ben (Bangla), bod (Tibetan), bpy (Bishnupriya), bre (Breton), bos (Bosnian), bxr (Russia Buriat), cat (Catalan), cbk (Chavacano), che (Chechen), ceb (Cebuano), ckb (Central Kurdish), cos (Corsican), ces (Czech), chv (Chuvash), cym (Welsh), dan (Danish), deu (German), diq (Dimli (individual language)), dsb (Lower Sorbian), dty (Dotyali), div (Divehi), ell (Greek), eml (Unknown language [eml]), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fra (French), frr (Northern Frisian), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), grn (Guarani), gom (Goan Konkani), guj (Gujarati), glv (Manx), heb (Hebrew), hin (Hindi), hif (Fiji Hindi), hrv (Croatian), hsb (Upper Sorbian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ina (Interlingua), ind (Indonesian), ile (Interlingue), ilo (Iloko), ido (Ido), isl (Icelandic), ita (Italian), jpn (Japanese), jbo (Lojban), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), krc (Karachay-Balkar), kur (Kurdish), kom (Komi), cor (Cornish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lez (Lezghian), lim (Limburgish), lmo (Lombard), lao (Lao), lrc (Northern Luri), lit (Lithuanian), lav (Latvian), mai (Maithili), mlg (Malagasy), mhr (Eastern Mari), min (Minangkabau), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), mrj (Western Mari), msa (Malay), mlt (Maltese), mwl (Mirandese), mya (Burmese), myv (Erzya), mzn (Mazanderani), nah (Nahuatl languages), nap (Neapolitan), nds (Low German), nep (Nepali), new (Newari), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci 
(Occitan), ori (Odia), oss (Ossetic), pan (Punjabi), pam (Pampanga), pfl (Palatine German), pol (Polish), pms (Piedmontese), pnb (Western Panjabi), pus (Pashto), por (Portuguese), que (Quechua), roh (Romansh), ron (Romanian), rus (Russian), rue (Rusyn), san (Sanskrit), sah (Sakha), srd (Sardinian), scn (Sicilian), sco (Scots), snd (Sindhi), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), som (Somali), sqi (Albanian), srp (Serbian), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tuk (Turkmen), fil (Filipino), tur (Turkish), tat (Tatar), tyv (Tuvinian), uig (Uyghur), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vec (Venetian), vep (Veps), vie (Vietnamese), vls (West Flemish), vol (Volapük), wln (Walloon), war (Waray), wuu (Wu Chinese), xal (Kalmyk), xmf (Mingrelian), yid (Yiddish), yor (Yoruba), yue (Cantonese), zho (Chinese)
9 |
10 | Stats per language
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|------:|
14 | | 1 | eng | English | 1479733 | 0.985 | 0.997 | 0.984 | 1475222 | 22413 | 5959481 | 4511 |
15 | | 2 | rus | Russian | 849653 | 0.979 | 0.996 | 0.977 | 846620 | 18559 | 6593415 | 3033 |
16 | | 3 | ita | Italian | 787053 | 0.984 | 0.987 | 0.978 | 776749 | 12310 | 6662264 | 10304 |
17 | | 4 | tur | Turkish | 709573 | 0.998 | 0.994 | 0.995 | 705300 | 1312 | 6750742 | 4273 |
18 | | 5 | deu | German | 553727 | 0.990 | 0.993 | 0.987 | 549727 | 5449 | 6902451 | 4000 |
19 | | 6 | fra | French | 466192 | 0.984 | 0.988 | 0.978 | 460626 | 7670 | 6987765 | 5566 |
20 | | 7 | por | Portuguese | 385737 | 0.986 | 0.965 | 0.968 | 372107 | 5326 | 7070564 | 13630 |
21 | | 8 | spa | Spanish | 338781 | 0.952 | 0.977 | 0.942 | 331141 | 16754 | 7106092 | 7640 |
22 | | 9 | hun | Hungarian | 323048 | 0.992 | 0.981 | 0.982 | 316851 | 2660 | 7135919 | 6197 |
23 | | 10 | jpn | Japanese | 208761 | 0.999 | 0.997 | 0.998 | 208113 | 109 | 7252757 | 648 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.999 | 0.999 | 197021 | 1 | 7264400 | 205 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.991 | 0.916 | 0.948 | 157290 | 1385 | 7288568 | 14384 |
26 | | 13 | nld | Dutch | 144340 | 0.983 | 0.927 | 0.946 | 133805 | 2324 | 7314963 | 10535 |
27 | | 14 | fin | Finnish | 128011 | 0.986 | 0.964 | 0.968 | 123392 | 1776 | 7331840 | 4619 |
28 | | 15 | pol | Polish | 109662 | 0.979 | 0.983 | 0.970 | 107820 | 2367 | 7349598 | 1842 |
29 | | 16 | mkd | Macedonian | 77938 | 0.954 | 0.936 | 0.924 | 72941 | 3508 | 7380181 | 4997 |
30 | | 17 | mar | Marathi | 64126 | 0.993 | 0.984 | 0.985 | 63072 | 434 | 7397067 | 1054 |
31 | | 18 | lit | Lithuanian | 59659 | 0.993 | 0.916 | 0.950 | 54676 | 402 | 7401566 | 4983 |
32 | | 19 | ces | Czech | 57030 | 0.923 | 0.908 | 0.881 | 51757 | 4326 | 7400271 | 5273 |
33 | | 20 | dan | Danish | 49399 | 0.837 | 0.871 | 0.788 | 43048 | 8398 | 7403830 | 6351 |
34 | | 21 | swe | Swedish | 41677 | 0.929 | 0.918 | 0.892 | 38263 | 2925 | 7417025 | 3414 |
35 | | 22 | ara | Arabic | 35991 | 1.000 | 0.984 | 0.991 | 35415 | 17 | 7425619 | 576 |
36 | | 23 | ell | Greek | 34071 | 1.000 | 0.999 | 0.999 | 34042 | 10 | 7427546 | 29 |
37 | | 24 | ron | Romanian | 24943 | 0.978 | 0.891 | 0.922 | 22222 | 509 | 7436175 | 2721 |
38 | | 25 | bul | Bulgarian | 24503 | 0.900 | 0.815 | 0.817 | 19969 | 2215 | 7434909 | 4534 |
39 | | 26 | vie | Vietnamese | 19234 | 0.987 | 0.988 | 0.981 | 19008 | 250 | 7442143 | 226 |
40 | | 27 | fil | Filipino | 16649 | 0.989 | 0.878 | 0.925 | 14619 | 165 | 7444813 | 2030 |
41 | | 28 | slk | Slovak | 14660 | 0.884 | 0.478 | 0.596 | 7005 | 919 | 7446048 | 7655 |
42 | | 29 | ind | Indonesian | 14542 | 0.955 | 0.880 | 0.897 | 12793 | 602 | 7446483 | 1749 |
43 | | 30 | hin | Hindi | 14230 | 0.933 | 0.962 | 0.916 | 13687 | 978 | 7446419 | 543 |
44 | | 31 | nob | Norwegian Bokmål | 14223 | 0.630 | 0.381 | 0.417 | 5421 | 3190 | 7444214 | 8802 |
45 | | 32 | cat | Catalan | 7971 | 0.821 | 0.686 | 0.691 | 5469 | 1196 | 7452460 | 2502 |
46 | | 33 | kor | Korean | 7570 | 0.996 | 0.970 | 0.981 | 7344 | 32 | 7454025 | 226 |
47 | | 34 | hrv | Croatian | 5204 | 0.702 | 0.353 | 0.427 | 1839 | 781 | 7455642 | 3365 |
48 | | 35 | ben | Bangla | 4714 | 0.998 | 0.996 | 0.996 | 4693 | 8 | 7456905 | 21 |
49 | | 36 | afr | Afrikaans | 4031 | 0.844 | 0.627 | 0.675 | 2528 | 468 | 7457128 | 1503 |
50 | | 37 | est | Estonian | 3637 | 0.745 | 0.654 | 0.622 | 2380 | 816 | 7457174 | 1257 |
51 | | 38 | tha | Thai | 3528 | 0.996 | 0.999 | 0.996 | 3524 | 13 | 7458086 | 4 |
52 | | 39 | sqi | Albanian | 2526 | 0.967 | 0.835 | 0.883 | 2109 | 72 | 7459029 | 417 |
53 | | 40 | urd | Urdu | 2008 | 0.987 | 0.954 | 0.964 | 1915 | 26 | 7459593 | 93 |
54 | | 41 | cym | Welsh | 1344 | 0.892 | 0.541 | 0.647 | 727 | 88 | 7460195 | 617 |
55 | | 42 | slv | Slovenian | 1093 | 0.226 | 0.431 | 0.197 | 471 | 1611 | 7458923 | 622 |
56 | | 43 | mal | Malayalam | 827 | 0.981 | 0.999 | 0.980 | 826 | 16 | 7460784 | 1 |
57 | | 44 | tam | Tamil | 334 | 0.991 | 1.000 | 0.991 | 334 | 3 | 7461290 | 0 |
58 | | 45 | tel | Telugu | 254 | 0.973 | 1.000 | 0.973 | 254 | 7 | 7461366 | 0 |
59 | | 46 | pan | Punjabi | 196 | 0.933 | 1.000 | 0.933 | 196 | 14 | 7461417 | 0 |
60 | | 47 | kan | Kannada | 176 | 0.967 | 1.000 | 0.967 | 176 | 6 | 7461445 | 0 |
61 | | 48 | guj | Gujarati | 168 | 0.994 | 1.000 | 0.994 | 168 | 1 | 7461458 | 0 |
--------------------------------------------------------------------------------
/datasets/open-subtitles-v2018-100k-per-lang/download:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.en.gz" | gzip -d -c | head -n 100000 > en.txt
4 |
5 | # Languages commented out are not supported by some benchmarks
6 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.af.gz" | gzip -d -c | head -n 100000 > af.txt
7 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz" | gzip -d -c | head -n 100000 > ar.txt
8 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.bg.gz" | gzip -d -c | head -n 100000 > bg.txt
9 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.bn.gz" | gzip -d -c | head -n 100000 > bn.txt
10 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.br.gz" | gzip -d -c | head -n 100000 > br.txt
11 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.bs.gz" | gzip -d -c | head -n 100000 > bs.txt
12 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ca.gz" | gzip -d -c | head -n 100000 > ca.txt
13 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.cs.gz" | gzip -d -c | head -n 100000 > cs.txt
14 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.da.gz" | gzip -d -c | head -n 100000 > da.txt
15 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.de.gz" | gzip -d -c | head -n 100000 > de.txt
16 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.el.gz" | gzip -d -c | head -n 100000 > el.txt
17 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.eo.gz" | gzip -d -c | head -n 100000 > eo.txt
18 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.es.gz" | gzip -d -c | head -n 100000 > es.txt
19 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.et.gz" | gzip -d -c | head -n 100000 > et.txt
20 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.eu.gz" | gzip -d -c | head -n 100000 > eu.txt
21 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fa.gz" | gzip -d -c | head -n 100000 > fa.txt
22 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fi.gz" | gzip -d -c | head -n 100000 > fi.txt
23 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz" | gzip -d -c | head -n 100000 > fr.txt
24 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.gl.gz" | gzip -d -c | head -n 100000 > gl.txt
25 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.he.gz" | gzip -d -c | head -n 100000 > he.txt
26 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.hi.gz" | gzip -d -c | head -n 100000 > hi.txt
27 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.hr.gz" | gzip -d -c | head -n 100000 > hr.txt
28 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.hu.gz" | gzip -d -c | head -n 100000 > hu.txt
29 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.hy.gz" | gzip -d -c | head -n 100000 > hy.txt
30 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.id.gz" | gzip -d -c | head -n 100000 > id.txt
31 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.is.gz" | gzip -d -c | head -n 100000 > is.txt
32 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.it.gz" | gzip -d -c | head -n 100000 > it.txt
33 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ja.gz" | gzip -d -c | head -n 100000 > ja.txt
34 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ka.gz" | gzip -d -c | head -n 100000 > ka.txt
35 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.kk.gz" | gzip -d -c | head -n 100000 > kk.txt
36 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ko.gz" | gzip -d -c | head -n 100000 > ko.txt
37 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.lt.gz" | gzip -d -c | head -n 100000 > lt.txt
38 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.lv.gz" | gzip -d -c | head -n 100000 > lv.txt
39 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.mk.gz" | gzip -d -c | head -n 100000 > mk.txt
40 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ml.gz" | gzip -d -c | head -n 100000 > ml.txt
41 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ms.gz" | gzip -d -c | head -n 100000 > ms.txt
42 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.nl.gz" | gzip -d -c | head -n 100000 > nl.txt
43 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.no.gz" | gzip -d -c | head -n 100000 > no.txt
44 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pl.gz" | gzip -d -c | head -n 100000 > pl.txt
45 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz" | gzip -d -c | head -n 100000 > pt.txt
46 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt_br.gz" | gzip -d -c | head -n 100000 > pt_br.txt
47 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ro.gz" | gzip -d -c | head -n 100000 > ro.txt
48 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz" | gzip -d -c | head -n 100000 > ru.txt
49 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.si.gz" | gzip -d -c | head -n 100000 > si.txt
50 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.sk.gz" | gzip -d -c | head -n 100000 > sk.txt
51 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.sl.gz" | gzip -d -c | head -n 100000 > sl.txt
52 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.sq.gz" | gzip -d -c | head -n 100000 > sq.txt
53 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.sr.gz" | gzip -d -c | head -n 100000 > sr.txt
54 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.sv.gz" | gzip -d -c | head -n 100000 > sv.txt
55 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ta.gz" | gzip -d -c | head -n 100000 > ta.txt
56 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.te.gz" | gzip -d -c | head -n 100000 > te.txt
57 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.th.gz" | gzip -d -c | head -n 100000 > th.txt
58 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.tl.gz" | gzip -d -c | head -n 100000 > tl.txt
59 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.tr.gz" | gzip -d -c | head -n 100000 > tr.txt
60 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.uk.gz" | gzip -d -c | head -n 100000 > uk.txt
61 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ur.gz" | gzip -d -c | head -n 100000 > ur.txt
62 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.vi.gz" | gzip -d -c | head -n 100000 > vi.txt
63 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ze_en.gz" | gzip -d -c | head -n 100000 > ze_en.txt
64 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ze_zh.gz" | gzip -d -c | head -n 100000 > ze_zh.txt
65 | curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.zh_cn.gz" | gzip -d -c | head -n 100000 > zh_cn.txt
66 | # curl -s --location-trusted --max-redirs 10 -J "http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.zh_tw.gz" | gzip -d -c | head -n 100000 > zh_tw.txt
67 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/pycld2/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for pycld2 on tatoeba-sentences-2021-06-05
2 |
3 | - Dataset coverage (sentences in supported languages): 7569549 (78.52%)
4 | - **Aggregated accuracy: 86.95%**
5 |
6 | Supported languages (83 entries, 82 distinct codes: `zho` is listed twice)
7 |
8 | afr (Afrikaans), sqi (Albanian), ara (Arabic), hye (Armenian), aze (Azerbaijani), eus (Basque), bel (Belarusian), ben (Bangla), bih (Bihari languages), bul (Bulgarian), cat (Catalan), ceb (Cebuano), chr (Cherokee), hrv (Croatian), ces (Czech), zho (Chinese), dan (Danish), div (Divehi), nld (Dutch), eng (English), est (Estonian), fin (Finnish), fra (French), glg (Galician), lug (Ganda), kat (Georgian), deu (German), ell (Greek), guj (Gujarati), hat (Haitian Creole), heb (Hebrew), hin (Hindi), hmn (Hmong), hun (Hungarian), isl (Icelandic), ind (Indonesian), iku (Inuktitut), gle (Irish), ita (Italian), jav (Javanese), jpn (Japanese), kan (Kannada), khm (Khmer), kin (Kinyarwanda), kor (Korean), lao (Lao), lav (Latvian), lif (Limbu), lit (Lithuanian), mkd (Macedonian), msa (Malay), mal (Malayalam), mlt (Maltese), mar (Marathi), nep (Nepali), nob (Norwegian Bokmål), ori (Odia), fas (Persian), pol (Polish), por (Portuguese), pan (Punjabi), ron (Romanian), rus (Russian), gla (Scottish Gaelic), srp (Serbian), sin (Sinhala), slk (Slovak), slv (Slovenian), spa (Spanish), swa (Swahili), swe (Swedish), syr (Syriac), fil (Filipino), tam (Tamil), tel (Telugu), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), cym (Welsh), yid (Yiddish), zho (Chinese)
9 |
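10 | Stats per language (note: the `f1` column equals tp / (tp + fp + 0.5·fn), not the standard F1 = 2·tp / (2·tp + fp + fn); e.g. `srp` shows 0.225 where the standard F1 is 0.343)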
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.954 | 0.970 | 0.940 | 1435203 | 68766 | 6021050 | 44530 |
15 | | 2 | rus | Russian | 849653 | 0.998 | 0.830 | 0.905 | 705549 | 1667 | 6718229 | 144104 |
16 | | 3 | ita | Italian | 787053 | 0.999 | 0.689 | 0.816 | 542549 | 442 | 6782054 | 244504 |
17 | | 4 | tur | Turkish | 709573 | 1.000 | 0.923 | 0.960 | 654731 | 191 | 6859785 | 54842 |
18 | | 5 | deu | German | 553727 | 1.000 | 0.954 | 0.976 | 528244 | 184 | 7015638 | 25483 |
19 | | 6 | fra | French | 466192 | 0.999 | 0.845 | 0.915 | 394106 | 374 | 7102983 | 72086 |
20 | | 7 | por | Portuguese | 385737 | 0.981 | 0.865 | 0.912 | 333763 | 6371 | 7177441 | 51974 |
21 | | 8 | spa | Spanish | 338781 | 0.994 | 0.798 | 0.883 | 270207 | 1677 | 7229091 | 68574 |
22 | | 9 | hun | Hungarian | 323048 | 1.000 | 0.935 | 0.966 | 302013 | 133 | 7246368 | 21035 |
23 | | 10 | jpn | Japanese | 208761 | 1.000 | 0.999 | 1.000 | 208635 | 0 | 7360788 | 126 |
24 | | 11 | heb | Hebrew | 197226 | 1.000 | 0.841 | 0.914 | 165882 | 10 | 7372313 | 31344 |
25 | | 12 | ukr | Ukrainian | 171674 | 0.991 | 0.791 | 0.877 | 135799 | 1168 | 7396707 | 35875 |
26 | | 13 | nld | Dutch | 144340 | 0.994 | 0.820 | 0.897 | 118356 | 664 | 7424545 | 25984 |
27 | | 14 | fin | Finnish | 128011 | 0.999 | 0.909 | 0.951 | 116372 | 161 | 7441377 | 11639 |
28 | | 15 | pol | Polish | 109662 | 0.999 | 0.926 | 0.961 | 101512 | 75 | 7459812 | 8150 |
29 | | 16 | mkd | Macedonian | 77938 | 0.969 | 0.477 | 0.633 | 37213 | 1178 | 7490433 | 40725 |
30 | | 17 | mar | Marathi | 64126 | 1.000 | 0.967 | 0.983 | 62024 | 24 | 7505399 | 2102 |
31 | | 18 | lit | Lithuanian | 59659 | 0.997 | 0.914 | 0.952 | 54501 | 144 | 7509746 | 5158 |
32 | | 19 | ces | Czech | 57030 | 0.970 | 0.891 | 0.916 | 50816 | 1551 | 7510968 | 6214 |
33 | | 20 | dan | Danish | 49399 | 0.866 | 0.698 | 0.729 | 34494 | 5341 | 7514809 | 14905 |
34 | | 21 | srp | Serbian | 45176 | 0.246 | 0.564 | 0.225 | 25486 | 77950 | 7446423 | 19690 |
35 | | 22 | swe | Swedish | 41677 | 0.995 | 0.761 | 0.861 | 31709 | 145 | 7527727 | 9968 |
36 | | 23 | ara | Arabic | 35991 | 1.000 | 0.776 | 0.874 | 27916 | 1 | 7533557 | 8075 |
37 | | 24 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 14 | 7535464 | 0 |
38 | | 25 | ron | Romanian | 24943 | 0.963 | 0.811 | 0.865 | 20227 | 787 | 7543819 | 4716 |
39 | | 26 | bul | Bulgarian | 24503 | 0.852 | 0.700 | 0.721 | 17140 | 2967 | 7542079 | 7363 |
40 | | 27 | vie | Vietnamese | 19234 | 0.995 | 0.991 | 0.990 | 19062 | 103 | 7550212 | 172 |
41 | | 28 | fil | Filipino | 16649 | 0.988 | 0.789 | 0.872 | 13136 | 166 | 7552734 | 3513 |
42 | | 29 | slk | Slovak | 14660 | 0.693 | 0.788 | 0.634 | 11559 | 5110 | 7549779 | 3101 |
43 | | 30 | ind | Indonesian | 14542 | 0.864 | 0.775 | 0.768 | 11270 | 1773 | 7553234 | 3272 |
44 | | 31 | hin | Hindi | 14230 | 0.918 | 0.973 | 0.907 | 13848 | 1230 | 7554089 | 382 |
45 | | 32 | nob | Norwegian Bokmål | 14223 | 0.566 | 0.796 | 0.528 | 11327 | 8682 | 7546644 | 2896 |
46 | | 33 | bel | Belarusian | 12633 | 0.929 | 0.885 | 0.876 | 11176 | 855 | 7556061 | 1457 |
47 | | 34 | isl | Icelandic | 11091 | 0.996 | 0.925 | 0.957 | 10261 | 43 | 7558415 | 830 |
48 | | 35 | cat | Catalan | 7971 | 0.806 | 0.685 | 0.680 | 5464 | 1317 | 7560261 | 2507 |
49 | | 36 | kor | Korean | 7570 | 1.000 | 0.991 | 0.995 | 7500 | 0 | 7561979 | 70 |
50 | | 37 | yid | Yiddish | 6895 | 0.991 | 0.937 | 0.959 | 6460 | 60 | 7562594 | 435 |
51 | | 38 | eus | Basque | 6166 | 0.972 | 0.893 | 0.918 | 5505 | 158 | 7563225 | 661 |
52 | | 39 | kat | Georgian | 5732 | 1.000 | 1.000 | 1.000 | 5731 | 0 | 7563817 | 1 |
53 | | 40 | aze | Azerbaijani | 5348 | 0.509 | 0.870 | 0.490 | 4651 | 4486 | 7559715 | 697 |
54 | | 41 | hrv | Croatian | 5204 | 0.270 | 0.565 | 0.244 | 2942 | 7960 | 7556385 | 2262 |
55 | | 42 | ben | Bangla | 4714 | 1.000 | 0.777 | 0.874 | 3662 | 0 | 7564835 | 1052 |
56 | | 43 | glg | Galician | 4613 | 0.292 | 0.668 | 0.273 | 3081 | 7456 | 7557480 | 1532 |
57 | | 44 | afr | Afrikaans | 4031 | 0.446 | 0.826 | 0.426 | 3330 | 4133 | 7561385 | 701 |
58 | | 45 | est | Estonian | 3637 | 0.907 | 0.752 | 0.789 | 2734 | 279 | 7565633 | 903 |
59 | | 46 | tha | Thai | 3528 | 1.000 | 1.000 | 1.000 | 3528 | 0 | 7566021 | 0 |
60 | | 47 | sqi | Albanian | 2526 | 0.962 | 0.909 | 0.917 | 2295 | 91 | 7566932 | 231 |
61 | | 48 | gle | Irish | 2389 | 0.938 | 0.884 | 0.883 | 2112 | 140 | 7567020 | 277 |
62 | | 49 | hye | Armenian | 2248 | 1.000 | 1.000 | 1.000 | 2247 | 0 | 7567301 | 1 |
63 | | 50 | urd | Urdu | 2008 | 0.997 | 0.948 | 0.971 | 1903 | 5 | 7567536 | 105 |
64 | | 51 | khm | Khmer | 1511 | 1.000 | 0.991 | 0.996 | 1498 | 0 | 7568038 | 13 |
65 | | 52 | ceb | Cebuano | 1478 | 0.617 | 0.551 | 0.493 | 815 | 506 | 7567565 | 663 |
66 | | 53 | cym | Welsh | 1344 | 0.968 | 0.845 | 0.890 | 1136 | 37 | 7568168 | 208 |
67 | | 54 | slv | Slovenian | 1093 | 0.739 | 0.550 | 0.568 | 601 | 212 | 7568244 | 492 |
68 | | 55 | gla | Scottish Gaelic | 1033 | 0.927 | 0.909 | 0.886 | 939 | 74 | 7568442 | 94 |
69 | | 56 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 7568722 | 0 |
70 | | 57 | jav | Javanese | 615 | 0.806 | 0.610 | 0.641 | 375 | 90 | 7568844 | 240 |
71 | | 58 | ori | Odia | 374 | 1.000 | 1.000 | 1.000 | 374 | 0 | 7569175 | 0 |
72 | | 59 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 7569215 | 0 |
73 | | 60 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 7569295 | 0 |
74 | | 61 | lao | Lao | 219 | 1.000 | 1.000 | 1.000 | 219 | 0 | 7569330 | 0 |
75 | | 62 | mlt | Maltese | 208 | 0.243 | 0.803 | 0.236 | 167 | 521 | 7568820 | 41 |
76 | | 63 | pan | Punjabi | 196 | 1.000 | 1.000 | 1.000 | 196 | 0 | 7569353 | 0 |
77 | | 64 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 7569373 | 0 |
78 | | 65 | guj | Gujarati | 168 | 1.000 | 1.000 | 1.000 | 168 | 0 | 7569381 | 0 |
79 | | 66 | hat | Haitian Creole | 64 | 0.114 | 0.594 | 0.110 | 38 | 294 | 7569191 | 26 |
80 | | 67 | sin | Sinhala | 45 | 1.000 | 1.000 | 1.000 | 45 | 0 | 7569504 | 0 |
81 | | 68 | chr | Cherokee | 28 | 1.000 | 0.964 | 0.982 | 27 | 0 | 7569521 | 1 |
82 | | 69 | kin | Kinyarwanda | 28 | 0.050 | 0.679 | 0.049 | 19 | 361 | 7569160 | 9 |
83 | | 70 | div | Divehi | 26 | 1.000 | 1.000 | 1.000 | 26 | 0 | 7569523 | 0 |
84 | | 71 | lug | Ganda | 2 | 0.032 | 1.000 | 0.032 | 2 | 60 | 7569487 | 0 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/langid/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for langid on tatoeba-sentences-2021-06-05
2 |
3 | - Dataset coverage (sentences in supported languages): 8298609 (86.08%)
4 | - **Aggregated accuracy: 89.00%**
5 |
6 | Supported languages (97 entries, 96 distinct codes: `nob` is listed twice)
7 |
8 | afr (Afrikaans), amh (Amharic), arg (Aragonese), ara (Arabic), asm (Assamese), aze (Azerbaijani), bel (Belarusian), bul (Bulgarian), ben (Bangla), bre (Breton), bos (Bosnian), cat (Catalan), ces (Czech), cym (Welsh), dan (Danish), deu (German), dzo (Dzongkha), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fao (Faroese), fra (French), gle (Irish), glg (Galician), guj (Gujarati), heb (Hebrew), hin (Hindi), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), isl (Icelandic), ita (Italian), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), nob (Norwegian Bokmål), nep (Nepali), nld (Dutch), nno (Norwegian Nynorsk), nob (Norwegian Bokmål), oci (Occitan), ori (Odia), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), que (Quechua), ron (Romanian), rus (Russian), kin (Kinyarwanda), sme (Northern Sami), sin (Sinhala), slk (Slovak), slv (Slovenian), sqi (Albanian), srp (Serbian), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tha (Thai), fil (Filipino), tur (Turkish), uig (Uyghur), ukr (Ukrainian), urd (Urdu), vie (Vietnamese), vol (Volapük), wln (Walloon), xho (Xhosa), zho (Chinese), zul (Zulu)
9 |
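10 | Stats per language (note: the `f1` column equals tp / (tp + fp + 0.5·fn), not the standard F1 = 2·tp / (2·tp + fp + fn); e.g. `srp` shows 0.184 where the standard F1 is 0.277)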
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:------------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.953 | 0.973 | 0.941 | 1439789 | 70832 | 6748044 | 39944 |
15 | | 2 | rus | Russian | 849653 | 0.966 | 0.823 | 0.875 | 699008 | 24596 | 7424360 | 150645 |
16 | | 3 | ita | Italian | 787053 | 0.954 | 0.885 | 0.898 | 696690 | 33674 | 7477882 | 90363 |
17 | | 4 | tur | Turkish | 709573 | 0.995 | 0.919 | 0.953 | 652061 | 3202 | 7585834 | 57512 |
18 | | 5 | epo | Esperanto | 659632 | 0.983 | 0.854 | 0.907 | 563174 | 9712 | 7629265 | 96458 |
19 | | 6 | deu | German | 553727 | 0.950 | 0.974 | 0.938 | 539237 | 28349 | 7716533 | 14490 |
20 | | 7 | fra | French | 466192 | 0.897 | 0.935 | 0.870 | 436032 | 49971 | 7782446 | 30160 |
21 | | 8 | por | Portuguese | 385737 | 0.915 | 0.822 | 0.832 | 317068 | 29566 | 7883306 | 68669 |
22 | | 9 | spa | Spanish | 338781 | 0.784 | 0.837 | 0.728 | 283448 | 78313 | 7881515 | 55333 |
23 | | 10 | hun | Hungarian | 323048 | 0.977 | 0.929 | 0.942 | 300114 | 7043 | 7968518 | 22934 |
24 | | 11 | jpn | Japanese | 208761 | 0.997 | 1.000 | 0.997 | 208702 | 585 | 8089263 | 59 |
25 | | 12 | heb | Hebrew | 197226 | 1.000 | 0.999 | 1.000 | 197110 | 14 | 8101369 | 116 |
26 | | 13 | ukr | Ukrainian | 171674 | 0.748 | 0.774 | 0.675 | 132961 | 44688 | 8082247 | 38713 |
27 | | 14 | nld | Dutch | 144340 | 0.869 | 0.901 | 0.830 | 130115 | 19610 | 8134659 | 14225 |
28 | | 15 | fin | Finnish | 128011 | 0.941 | 0.931 | 0.909 | 119175 | 7527 | 8163071 | 8836 |
29 | | 16 | pol | Polish | 109662 | 0.944 | 0.977 | 0.933 | 107088 | 6349 | 8182598 | 2574 |
30 | | 17 | mkd | Macedonian | 77938 | 0.582 | 0.482 | 0.443 | 37554 | 27012 | 8193659 | 40384 |
31 | | 18 | mar | Marathi | 64126 | 0.988 | 0.700 | 0.815 | 44902 | 563 | 8233920 | 19224 |
32 | | 19 | lit | Lithuanian | 59659 | 0.774 | 0.915 | 0.747 | 54565 | 15938 | 8223012 | 5094 |
33 | | 20 | ces | Czech | 57030 | 0.879 | 0.837 | 0.809 | 47732 | 6594 | 8234985 | 9298 |
34 | | 21 | dan | Danish | 49399 | 0.739 | 0.602 | 0.594 | 29753 | 10483 | 8238727 | 19646 |
35 | | 22 | srp | Serbian | 45176 | 0.214 | 0.392 | 0.184 | 17727 | 64947 | 8188486 | 27449 |
36 | | 23 | swe | Swedish | 41677 | 0.782 | 0.802 | 0.713 | 33438 | 9345 | 8247587 | 8239 |
37 | | 24 | lat | Latin | 39718 | 0.939 | 0.196 | 0.322 | 7803 | 509 | 8258382 | 31915 |
38 | | 25 | ara | Arabic | 35991 | 0.999 | 0.950 | 0.973 | 34184 | 46 | 8262572 | 1807 |
39 | | 26 | ell | Greek | 34071 | 1.000 | 1.000 | 1.000 | 34071 | 15 | 8264523 | 0 |
40 | | 27 | ron | Romanian | 24943 | 0.637 | 0.906 | 0.617 | 22605 | 12867 | 8260799 | 2338 |
41 | | 28 | bul | Bulgarian | 24503 | 0.209 | 0.624 | 0.197 | 15278 | 57734 | 8216372 | 9225 |
42 | | 29 | vie | Vietnamese | 19234 | 0.956 | 0.998 | 0.955 | 19192 | 886 | 8278489 | 42 |
43 | | 30 | fil | Filipino | 16649 | 0.876 | 0.792 | 0.786 | 13181 | 1860 | 8280100 | 3468 |
44 | | 31 | slk | Slovak | 14660 | 0.525 | 0.690 | 0.470 | 10119 | 9140 | 8274809 | 4541 |
45 | | 32 | ind | Indonesian | 14542 | 0.528 | 0.731 | 0.481 | 10632 | 9494 | 8274573 | 3910 |
46 | | 33 | hin | Hindi | 14230 | 0.420 | 0.901 | 0.410 | 12825 | 17727 | 8266652 | 1405 |
47 | | 34 | nob | Norwegian Bokmål | 14223 | 0.305 | 0.770 | 0.292 | 10958 | 24918 | 8259468 | 3265 |
48 | | 35 | bel | Belarusian | 12633 | 0.437 | 0.879 | 0.424 | 11106 | 14321 | 8271655 | 1527 |
49 | | 36 | isl | Icelandic | 11091 | 0.874 | 0.929 | 0.846 | 10300 | 1484 | 8286034 | 791 |
50 | | 37 | cat | Catalan | 7971 | 0.200 | 0.720 | 0.193 | 5739 | 22950 | 8267688 | 2232 |
51 | | 38 | uig | Uyghur | 7792 | 0.961 | 0.989 | 0.956 | 7707 | 316 | 8290501 | 85 |
52 | | 39 | kor | Korean | 7570 | 0.989 | 1.000 | 0.989 | 7568 | 83 | 8290956 | 2 |
53 | | 40 | bre | Breton | 7195 | 0.423 | 0.630 | 0.377 | 4535 | 6179 | 8285235 | 2660 |
54 | | 41 | eus | Basque | 6166 | 0.324 | 0.866 | 0.316 | 5338 | 11150 | 8281293 | 828 |
55 | | 42 | kat | Georgian | 5732 | 0.997 | 0.996 | 0.995 | 5710 | 19 | 8292858 | 22 |
56 | | 43 | oci | Occitan | 5693 | 0.341 | 0.505 | 0.292 | 2873 | 5547 | 8287369 | 2820 |
57 | | 44 | aze | Azerbaijani | 5348 | 0.233 | 0.756 | 0.224 | 4044 | 13343 | 8279918 | 1304 |
58 | | 45 | hrv | Croatian | 5204 | 0.155 | 0.650 | 0.149 | 3384 | 18473 | 8274932 | 1820 |
59 | | 46 | ben | Bangla | 4714 | 0.659 | 0.977 | 0.654 | 4605 | 2380 | 8291515 | 109 |
60 | | 47 | glg | Galician | 4613 | 0.066 | 0.520 | 0.064 | 2400 | 33789 | 8260207 | 2213 |
61 | | 48 | vol | Volapük | 4132 | 0.554 | 0.265 | 0.313 | 1093 | 880 | 8293597 | 3039 |
62 | | 49 | afr | Afrikaans | 4031 | 0.300 | 0.462 | 0.255 | 1861 | 4338 | 8290240 | 2170 |
63 | | 50 | kaz | Kazakh | 3685 | 0.368 | 0.943 | 0.364 | 3476 | 5968 | 8288956 | 209 |
64 | | 51 | est | Estonian | 3637 | 0.226 | 0.689 | 0.215 | 2505 | 8560 | 8286412 | 1132 |
65 | | 52 | tha | Thai | 3528 | 0.998 | 1.000 | 0.998 | 3528 | 7 | 8295074 | 0 |
66 | | 53 | asm | Assamese | 2912 | 0.853 | 0.227 | 0.348 | 662 | 114 | 8295583 | 2250 |
67 | | 54 | mon | Mongolian | 2757 | 0.288 | 0.950 | 0.286 | 2618 | 6463 | 8289389 | 139 |
68 | | 55 | sqi | Albanian | 2526 | 0.764 | 0.905 | 0.735 | 2285 | 704 | 8295379 | 241 |
69 | | 56 | gle | Irish | 2389 | 0.392 | 0.840 | 0.378 | 2007 | 3108 | 8293112 | 382 |
70 | | 57 | hye | Armenian | 2248 | 0.992 | 0.911 | 0.946 | 2048 | 17 | 8296344 | 200 |
71 | | 58 | urd | Urdu | 2008 | 0.853 | 0.947 | 0.833 | 1901 | 328 | 8296273 | 107 |
72 | | 59 | nno | Norwegian Nynorsk | 1576 | 0.144 | 0.501 | 0.134 | 789 | 4696 | 8292337 | 787 |
73 | | 60 | khm | Khmer | 1511 | 0.997 | 0.975 | 0.985 | 1473 | 4 | 8297094 | 38 |
74 | | 61 | cym | Welsh | 1344 | 0.300 | 0.682 | 0.281 | 916 | 2133 | 8295132 | 428 |
75 | | 62 | slv | Slovenian | 1093 | 0.044 | 0.681 | 0.044 | 744 | 16127 | 8281389 | 349 |
76 | | 63 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 8297782 | 0 |
77 | | 64 | ltz | Luxembourgish | 805 | 0.383 | 0.343 | 0.280 | 276 | 444 | 8297360 | 529 |
78 | | 65 | jav | Javanese | 615 | 0.105 | 0.486 | 0.100 | 299 | 2542 | 8295452 | 316 |
79 | | 66 | bos | Bosnian | 567 | 0.011 | 0.053 | 0.010 | 30 | 2722 | 8295320 | 537 |
80 | | 67 | que | Quechua | 422 | 0.549 | 0.382 | 0.380 | 161 | 132 | 8298055 | 261 |
81 | | 68 | fao | Faroese | 402 | 0.253 | 0.281 | 0.191 | 113 | 333 | 8297874 | 289 |
82 | | 69 | ori | Odia | 374 | 1.000 | 0.992 | 0.996 | 371 | 0 | 8298235 | 3 |
83 | | 70 | tam | Tamil | 334 | 0.988 | 1.000 | 0.988 | 334 | 4 | 8298271 | 0 |
84 | | 71 | tel | Telugu | 254 | 0.996 | 1.000 | 0.996 | 254 | 1 | 8298354 | 0 |
85 | | 72 | kir | Kyrgyz | 254 | 0.049 | 0.118 | 0.041 | 30 | 588 | 8297767 | 224 |
86 | | 73 | xho | Xhosa | 252 | 0.110 | 0.575 | 0.106 | 145 | 1174 | 8297183 | 107 |
87 | | 74 | lao | Lao | 219 | 0.952 | 1.000 | 0.952 | 219 | 11 | 8298379 | 0 |
88 | | 75 | amh | Amharic | 211 | 0.770 | 1.000 | 0.770 | 211 | 63 | 8298335 | 0 |
89 | | 76 | mlt | Maltese | 208 | 0.034 | 0.817 | 0.034 | 170 | 4791 | 8293610 | 38 |
90 | | 77 | pan | Punjabi | 196 | 0.985 | 1.000 | 0.985 | 196 | 3 | 8298410 | 0 |
91 | | 78 | sme | Northern Sami | 181 | 0.190 | 0.320 | 0.158 | 58 | 248 | 8298180 | 123 |
92 | | 79 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 8298433 | 0 |
93 | | 80 | guj | Gujarati | 168 | 0.966 | 1.000 | 0.966 | 168 | 6 | 8298435 | 0 |
94 | | 81 | arg | Aragonese | 103 | 0.002 | 0.029 | 0.002 | 3 | 1529 | 8296977 | 100 |
95 | | 82 | zul | Zulu | 77 | 0.054 | 0.286 | 0.051 | 22 | 383 | 8298149 | 55 |
96 | | 83 | hat | Haitian Creole | 64 | 0.008 | 0.406 | 0.008 | 26 | 3176 | 8295369 | 38 |
97 | | 84 | mlg | Malagasy | 59 | 0.017 | 0.559 | 0.017 | 33 | 1949 | 8296601 | 26 |
98 | | 85 | wln | Walloon | 53 | 0.015 | 0.528 | 0.015 | 28 | 1847 | 8296709 | 25 |
99 | | 86 | sin | Sinhala | 45 | 0.738 | 1.000 | 0.738 | 45 | 16 | 8298548 | 0 |
100 | | 87 | pus | Pashto | 44 | 0.178 | 0.432 | 0.159 | 19 | 88 | 8298477 | 25 |
101 | | 88 | kin | Kinyarwanda | 28 | 0.005 | 0.214 | 0.005 | 6 | 1135 | 8297446 | 22 |
--------------------------------------------------------------------------------
/results/tatoeba-sentences-2021-06-05/gcld3/classification_performance.md:
--------------------------------------------------------------------------------
1 | # Classification performance for gcld3 on tatoeba-sentences-2021-06-05
2 |
3 | - Dataset coverage (sentences in supported languages): 8261834 (85.70%)
4 | - **Aggregated accuracy: 87.11%**
5 |
6 | Supported languages (107 entries, 101 distinct codes: `bul`, `ell`, `hin`, `jpn`, `rus` and `zho` are each listed twice)
7 |
8 | afr (Afrikaans), amh (Amharic), ara (Arabic), bul (Bulgarian), bul (Bulgarian), ben (Bangla), bos (Bosnian), cat (Catalan), ceb (Cebuano), cos (Corsican), ces (Czech), cym (Welsh), dan (Danish), deu (German), ell (Greek), ell (Greek), eng (English), epo (Esperanto), spa (Spanish), est (Estonian), eus (Basque), fas (Persian), fin (Finnish), fil (Filipino), fra (French), fry (Western Frisian), gle (Irish), gla (Scottish Gaelic), glg (Galician), guj (Gujarati), hau (Hausa), haw (Hawaiian), hin (Hindi), hin (Hindi), hmn (Hmong), hrv (Croatian), hat (Haitian Creole), hun (Hungarian), hye (Armenian), ind (Indonesian), ibo (Igbo), isl (Icelandic), ita (Italian), heb (Hebrew), jpn (Japanese), jpn (Japanese), jav (Javanese), kat (Georgian), kaz (Kazakh), khm (Khmer), kan (Kannada), kor (Korean), kur (Kurdish), kir (Kyrgyz), lat (Latin), ltz (Luxembourgish), lao (Lao), lit (Lithuanian), lav (Latvian), mlg (Malagasy), mri (Maori), mkd (Macedonian), mal (Malayalam), mon (Mongolian), mar (Marathi), msa (Malay), mlt (Maltese), mya (Burmese), nep (Nepali), nld (Dutch), nob (Norwegian Bokmål), nya (Nyanja), pan (Punjabi), pol (Polish), pus (Pashto), por (Portuguese), ron (Romanian), rus (Russian), rus (Russian), snd (Sindhi), sin (Sinhala), slk (Slovak), slv (Slovenian), smo (Samoan), sna (Shona), som (Somali), sqi (Albanian), srp (Serbian), sot (Southern Sotho), sun (Sundanese), swe (Swedish), swa (Swahili), tam (Tamil), tel (Telugu), tgk (Tajik), tha (Thai), tur (Turkish), ukr (Ukrainian), urd (Urdu), uzb (Uzbek), vie (Vietnamese), xho (Xhosa), yid (Yiddish), yor (Yoruba), zho (Chinese), zho (Chinese), zul (Zulu)
9 |
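10 | Stats per language (note: the `f1` column equals tp / (tp + fp + 0.5·fn), not the standard F1 = 2·tp / (2·tp + fp + fn); e.g. `srp` shows 0.265 where the standard F1 is 0.372)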
11 |
12 | | | language_alpha3 | language | sentences_count | precision | recall | f1 | tp | fp | tn | fn |
13 | |---:|:------------------|:-----------------|------------------:|------------:|---------:|------:|--------:|------:|--------:|-------:|
14 | | 1 | eng | English | 1479733 | 0.995 | 0.851 | 0.915 | 1258584 | 6819 | 6775282 | 221149 |
15 | | 2 | rus | Russian | 849653 | 0.974 | 0.858 | 0.902 | 729212 | 19236 | 7392945 | 120441 |
16 | | 3 | ita | Italian | 787053 | 0.970 | 0.821 | 0.877 | 646316 | 20005 | 7454776 | 140737 |
17 | | 4 | tur | Turkish | 709573 | 0.993 | 0.894 | 0.938 | 634612 | 4617 | 7547644 | 74961 |
18 | | 5 | epo | Esperanto | 659632 | 0.963 | 0.922 | 0.925 | 608152 | 23690 | 7578512 | 51480 |
19 | | 6 | deu | German | 553727 | 0.977 | 0.933 | 0.944 | 516822 | 12347 | 7695760 | 36905 |
20 | | 7 | fra | French | 466192 | 0.973 | 0.860 | 0.902 | 400941 | 11073 | 7784569 | 65251 |
21 | | 8 | por | Portuguese | 385737 | 0.910 | 0.885 | 0.859 | 341427 | 33858 | 7842239 | 44310 |
22 | | 9 | spa | Spanish | 338781 | 0.908 | 0.782 | 0.806 | 264941 | 26864 | 7896189 | 73840 |
23 | | 10 | hun | Hungarian | 323048 | 0.968 | 0.895 | 0.916 | 289189 | 9667 | 7929119 | 33859 |
24 | | 11 | jpn | Japanese | 208761 | 0.978 | 0.999 | 0.977 | 208552 | 4759 | 8048314 | 209 |
25 | | 12 | heb | Hebrew | 197226 | 0.998 | 0.991 | 0.993 | 195374 | 409 | 8064199 | 1852 |
26 | | 13 | ukr | Ukrainian | 171674 | 0.800 | 0.889 | 0.762 | 152579 | 38095 | 8052065 | 19095 |
27 | | 14 | nld | Dutch | 144340 | 0.876 | 0.854 | 0.814 | 123199 | 17516 | 8099978 | 21141 |
28 | | 15 | fin | Finnish | 128011 | 0.941 | 0.902 | 0.895 | 115404 | 7235 | 8126588 | 12607 |
29 | | 16 | pol | Polish | 109662 | 0.915 | 0.931 | 0.885 | 102084 | 9477 | 8142695 | 7578 |
30 | | 17 | mkd | Macedonian | 77938 | 0.862 | 0.741 | 0.749 | 57730 | 9232 | 8174664 | 20208 |
31 | | 18 | mar | Marathi | 64126 | 0.989 | 0.911 | 0.943 | 58406 | 654 | 8197054 | 5720 |
32 | | 19 | lit | Lithuanian | 59659 | 0.841 | 0.883 | 0.797 | 52661 | 9950 | 8192225 | 6998 |
33 | | 20 | ces | Czech | 57030 | 0.881 | 0.813 | 0.799 | 46341 | 6282 | 8198522 | 10689 |
34 | | 21 | dan | Danish | 49399 | 0.652 | 0.746 | 0.587 | 36848 | 19626 | 8192809 | 12551 |
35 | | 22 | srp | Serbian | 45176 | 0.317 | 0.449 | 0.265 | 20304 | 43763 | 8172895 | 24872 |
36 | | 23 | swe | Swedish | 41677 | 0.786 | 0.861 | 0.739 | 35870 | 9753 | 8210404 | 5807 |
37 | | 24 | lat | Latin | 39718 | 0.500 | 0.729 | 0.457 | 28963 | 28998 | 8193118 | 10755 |
38 | | 25 | ara | Arabic | 35991 | 0.999 | 0.911 | 0.952 | 32774 | 32 | 8225811 | 3217 |
39 | | 26 | ell | Greek | 34071 | 0.730 | 1.000 | 0.730 | 34062 | 12617 | 8215146 | 9 |
40 | | 27 | ron | Romanian | 24943 | 0.592 | 0.808 | 0.553 | 20164 | 13920 | 8222971 | 4779 |
41 | | 28 | bul | Bulgarian | 24503 | 0.318 | 0.844 | 0.309 | 20688 | 44456 | 8192875 | 3815 |
42 | | 29 | vie | Vietnamese | 19234 | 0.883 | 0.981 | 0.876 | 18870 | 2495 | 8240105 | 364 |
43 | | 30 | fil | Filipino | 16649 | 0.733 | 0.780 | 0.664 | 12994 | 4735 | 8240450 | 3655 |
44 | | 31 | slk | Slovak | 14660 | 0.411 | 0.727 | 0.381 | 10664 | 15304 | 8231870 | 3996 |
45 | | 32 | ind | Indonesian | 14542 | 0.642 | 0.640 | 0.544 | 9304 | 5180 | 8242112 | 5238 |
46 | | 33 | hin | Hindi | 14230 | 0.508 | 0.880 | 0.491 | 12527 | 12125 | 8235479 | 1703 |
47 | | 34 | nob | Norwegian Bokmål | 14223 | 0.285 | 0.829 | 0.277 | 11791 | 29626 | 8217985 | 2432 |
48 | | 35 | isl | Icelandic | 11091 | 0.523 | 0.945 | 0.515 | 10484 | 9560 | 8241183 | 607 |
49 | | 36 | cat | Catalan | 7971 | 0.176 | 0.818 | 0.173 | 6520 | 30461 | 8223402 | 1451 |
50 | | 37 | kor | Korean | 7570 | 0.915 | 0.996 | 0.913 | 7536 | 703 | 8253561 | 34 |
51 | | 38 | yid | Yiddish | 6895 | 0.790 | 0.944 | 0.772 | 6512 | 1728 | 8253211 | 383 |
52 | | 39 | eus | Basque | 6166 | 0.439 | 0.861 | 0.424 | 5306 | 6789 | 8248879 | 860 |
53 | | 40 | kat | Georgian | 5732 | 1.000 | 0.998 | 0.999 | 5720 | 2 | 8256100 | 12 |
54 | | 41 | hrv | Croatian | 5204 | 0.139 | 0.447 | 0.128 | 2324 | 14392 | 8242238 | 2880 |
55 | | 42 | ben | Bangla | 4714 | 1.000 | 0.998 | 0.999 | 4704 | 0 | 8257120 | 10 |
56 | | 43 | glg | Galician | 4613 | 0.064 | 0.774 | 0.063 | 3569 | 52237 | 8204984 | 1044 |
57 | | 44 | afr | Afrikaans | 4031 | 0.145 | 0.865 | 0.143 | 3485 | 20607 | 8237196 | 546 |
58 | | 45 | kaz | Kazakh | 3685 | 0.404 | 0.932 | 0.398 | 3434 | 5063 | 8253086 | 251 |
59 | | 46 | est | Estonian | 3637 | 0.165 | 0.796 | 0.162 | 2894 | 14623 | 8243574 | 743 |
60 | | 47 | tha | Thai | 3528 | 0.995 | 0.998 | 0.994 | 3522 | 18 | 8258288 | 6 |
61 | | 48 | mon | Mongolian | 2757 | 0.415 | 0.955 | 0.411 | 2633 | 3712 | 8255365 | 124 |
62 | | 49 | sqi | Albanian | 2526 | 0.288 | 0.865 | 0.282 | 2184 | 5395 | 8253913 | 342 |
63 | | 50 | gle | Irish | 2389 | 0.175 | 0.911 | 0.174 | 2177 | 10238 | 8249207 | 212 |
64 | | 51 | hye | Armenian | 2248 | 0.994 | 0.998 | 0.993 | 2243 | 13 | 8259573 | 5 |
65 | | 52 | urd | Urdu | 2008 | 0.882 | 0.961 | 0.867 | 1930 | 258 | 8259568 | 78 |
66 | | 53 | khm | Khmer | 1511 | 1.000 | 0.985 | 0.993 | 1489 | 0 | 8260323 | 22 |
67 | | 54 | ceb | Cebuano | 1478 | 0.148 | 0.571 | 0.141 | 844 | 4846 | 8255510 | 634 |
68 | | 55 | cym | Welsh | 1344 | 0.103 | 0.824 | 0.102 | 1108 | 9667 | 8250823 | 236 |
69 | | 56 | slv | Slovenian | 1093 | 0.048 | 0.724 | 0.047 | 791 | 15714 | 8245027 | 302 |
70 | | 57 | gla | Scottish Gaelic | 1033 | 0.090 | 0.867 | 0.089 | 896 | 9060 | 8251741 | 137 |
71 | | 58 | uzb | Uzbek | 855 | 0.091 | 0.581 | 0.088 | 497 | 4957 | 8256022 | 358 |
72 | | 59 | mal | Malayalam | 827 | 1.000 | 1.000 | 1.000 | 827 | 0 | 8261007 | 0 |
73 | | 60 | ltz | Luxembourgish | 805 | 0.031 | 0.867 | 0.031 | 698 | 21519 | 8239510 | 107 |
74 | | 61 | jav | Javanese | 615 | 0.030 | 0.361 | 0.029 | 222 | 7128 | 8254091 | 393 |
75 | | 62 | bos | Bosnian | 567 | 0.012 | 0.388 | 0.011 | 220 | 18770 | 8242497 | 347 |
76 | | 63 | mya | Burmese | 433 | 1.000 | 1.000 | 1.000 | 433 | 0 | 8261401 | 0 |
77 | | 64 | mri | Maori | 388 | 0.040 | 0.711 | 0.040 | 276 | 6622 | 8254824 | 112 |
78 | | 65 | fry | Western Frisian | 355 | 0.012 | 0.738 | 0.012 | 262 | 21054 | 8240425 | 93 |
79 | | 66 | tam | Tamil | 334 | 1.000 | 1.000 | 1.000 | 334 | 0 | 8261500 | 0 |
80 | | 67 | tel | Telugu | 254 | 1.000 | 1.000 | 1.000 | 254 | 0 | 8261580 | 0 |
81 | | 68 | kir | Kyrgyz | 254 | 0.027 | 0.878 | 0.027 | 223 | 7974 | 8253606 | 31 |
82 | | 69 | xho | Xhosa | 252 | 0.039 | 0.683 | 0.039 | 172 | 4230 | 8257352 | 80 |
83 | | 70 | lao | Lao | 219 | 1.000 | 1.000 | 1.000 | 219 | 0 | 8261615 | 0 |
84 | | 71 | amh | Amharic | 211 | 1.000 | 0.991 | 0.995 | 209 | 0 | 8261623 | 2 |
85 | | 72 | mlt | Maltese | 208 | 0.013 | 0.817 | 0.013 | 170 | 12489 | 8249137 | 38 |
86 | | 73 | pan | Punjabi | 196 | 1.000 | 0.995 | 0.997 | 195 | 0 | 8261638 | 1 |
87 | | 74 | kan | Kannada | 176 | 1.000 | 1.000 | 1.000 | 176 | 0 | 8261658 | 0 |
88 | | 75 | guj | Gujarati | 168 | 1.000 | 0.982 | 0.991 | 165 | 0 | 8261666 | 3 |
89 | | 76 | haw | Hawaiian | 155 | 0.010 | 0.839 | 0.010 | 130 | 12567 | 8249112 | 25 |
90 | | 77 | som | Somali | 80 | 0.019 | 0.912 | 0.019 | 73 | 3674 | 8258080 | 7 |
91 | | 78 | zul | Zulu | 77 | 0.007 | 0.753 | 0.007 | 58 | 7839 | 8253918 | 19 |
92 | | 79 | smo | Samoan | 76 | 0.008 | 0.763 | 0.008 | 58 | 7277 | 8254481 | 18 |
93 | | 80 | hat | Haitian Creole | 64 | 0.004 | 0.797 | 0.004 | 51 | 13257 | 8248513 | 13 |
94 | | 81 | tgk | Tajik | 63 | 0.008 | 0.889 | 0.008 | 56 | 7343 | 8254428 | 7 |
95 | | 82 | hau | Hausa | 60 | 0.006 | 0.800 | 0.006 | 48 | 7835 | 8253939 | 12 |
96 | | 83 | mlg | Malagasy | 59 | 0.003 | 0.831 | 0.003 | 49 | 14975 | 8246800 | 10 |
97 | | 84 | sin | Sinhala | 45 | 1.000 | 1.000 | 1.000 | 45 | 0 | 8261789 | 0 |
98 | | 85 | pus | Pashto | 44 | 0.098 | 0.864 | 0.097 | 38 | 350 | 8261440 | 6 |
99 | | 86 | sna | Shona | 42 | 0.006 | 0.667 | 0.005 | 28 | 5062 | 8256730 | 14 |
100 | | 87 | yor | Yoruba | 37 | 0.001 | 0.108 | 0.001 | 4 | 3799 | 8257998 | 33 |
101 | | 88 | ibo | Igbo | 32 | 0.002 | 0.688 | 0.002 | 22 | 10813 | 8250989 | 10 |
102 | | 89 | sun | Sundanese | 31 | 0.003 | 0.548 | 0.003 | 17 | 5505 | 8256298 | 14 |
103 | | 90 | nya | Nyanja | 24 | 0.010 | 0.833 | 0.010 | 20 | 2072 | 8259738 | 4 |
104 | | 91 | cos | Corsican | 24 | 0.000 | 0.583 | 0.000 | 14 | 35270 | 8226540 | 10 |
105 | | 92 | snd | Sindhi | 6 | 0.003 | 0.833 | 0.003 | 5 | 1440 | 8260388 | 1 |
106 | | 93 | sot | Southern Sotho | 2 | 0.001 | 1.000 | 0.001 | 2 | 3240 | 8258592 | 0 |
--------------------------------------------------------------------------------