├── .gitattributes ├── .ipynb_checkpoints ├── LANGUAGE DETECTION AND TRANSLATION-checkpoint.ipynb ├── README-checkpoint.md └── translate_language-checkpoint.py ├── LANGUAGE DETECTION AND TRANSLATION.ipynb ├── LICENSE ├── README.md ├── detect_and_translate.py ├── detect_language.py ├── language_probabilities.py └── translate_language.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/LANGUAGE DETECTION AND TRANSLATION-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from langdetect import DetectorFactory\n", 10 | "DetectorFactory.seed = 0" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "'sw'" 22 | ] 23 | }, 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "# Detect the language of the sentence\n", 31 | "from langdetect import detect\n", 32 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 33 | "detect(sentence)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "[sw:0.9999971210408874]" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "# show probabilities for the top languages\n", 54 | "from langdetect import detect_langs\n", 55 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 56 | "detect_langs(sentence)" 57 | ] 58 | }, 59 
| { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Tanzania is a leading tourist destination in Africa \n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# translate a sentence from swahili to english langauge\n", 76 | "\n", 77 | "from google_trans_new import google_translator \n", 78 | "\n", 79 | "translator = google_translator()\n", 80 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 81 | "translate_text = translator.translate(sentence,lang_tgt='en') \n", 82 | "print(translate_text)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from langdetect import detect\n", 92 | "from google_trans_new import google_translator \n", 93 | "\n", 94 | "#simple function to detect and translate text \n", 95 | "def detect_and_translate(text,target_lang):\n", 96 | " \n", 97 | " result_lang = detect(text)\n", 98 | " \n", 99 | " if result_lang == target_lang:\n", 100 | " return text \n", 101 | " \n", 102 | " else:\n", 103 | " translator = google_translator()\n", 104 | " translate_text = translator.translate(text,lang_src=result_lang,lang_tgt=target_lang)\n", 105 | " return translate_text \n", 106 | " " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Natumai kuwa, nitakapojiwekea akiba, nitaweza kusafiri kwenda Mexico \n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "sentence = \"I hope that, when I’ve built up my savings, I’ll be able to travel to Mexico\"\n", 124 | "print(detect_and_translate(sentence,target_lang='sw'))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": 
[] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.3" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/README-checkpoint.md: -------------------------------------------------------------------------------- 1 | # Detect and Translate Text Data 2 | How to detect language and translate text data into the language of your choice when working on an NLP project 3 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/translate_language-checkpoint.py: -------------------------------------------------------------------------------- 1 | # translate a sentence from Swahili to English 2 | 3 | from google_trans_new import google_translator 4 | 5 | translator = google_translator() 6 | 7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika" 8 | translate_text = translator.translate(sentence,lang_tgt='en') 9 | 10 | print(translate_text) 11 | -------------------------------------------------------------------------------- /LANGUAGE DETECTION AND TRANSLATION.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 |
"metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from langdetect import DetectorFactory\n", 10 | "DetectorFactory.seed = 0" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "'sw'" 22 | ] 23 | }, 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "# Detect the language of the sentence\n", 31 | "from langdetect import detect\n", 32 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 33 | "detect(sentence)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "[sw:0.9999971210408874]" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "# show probabilities for the top languages\n", 54 | "from langdetect import detect_langs\n", 55 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 56 | "detect_langs(sentence)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Tanzania is a leading tourist destination in Africa \n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# translate a sentence from swahili to english langauge\n", 76 | "\n", 77 | "from google_trans_new import google_translator \n", 78 | "\n", 79 | "translator = google_translator()\n", 80 | "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n", 81 | "translate_text = translator.translate(sentence,lang_tgt='en') \n", 82 | "print(translate_text)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from langdetect 
import detect\n", 92 | "from google_trans_new import google_translator \n", 93 | "\n", 94 | "#simple function to detect and translate text \n", 95 | "def detect_and_translate(text,target_lang):\n", 96 | " \n", 97 | " result_lang = detect(text)\n", 98 | " \n", 99 | " if result_lang == target_lang:\n", 100 | " return text \n", 101 | " \n", 102 | " else:\n", 103 | " translator = google_translator()\n", 104 | " translate_text = translator.translate(text,lang_src=result_lang,lang_tgt=target_lang)\n", 105 | " return translate_text \n", 106 | " " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Natumai kuwa, nitakapojiwekea akiba, nitaweza kusafiri kwenda Mexico \n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "sentence = \"I hope that, when I’ve built up my savings, I’ll be able to travel to Mexico\"\n", 124 | "print(detect_and_translate(sentence,target_lang='sw'))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.3" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Davis David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def detect_and_translate(text: str, target_lang: str):
    """Translate *text* into *target_lang* unless it is already in that language.

    The source language is guessed with ``langdetect.detect``. When the
    detected code differs from ``target_lang`` the text is passed to
    ``google_translator().translate`` with the detected code as ``lang_src``.

    Args:
        text: The sentence or passage to translate.
        target_lang: Language code of the desired output (for example
            ``'sw'`` or ``'en'``). It is compared directly against the code
            returned by ``detect``.

    Returns:
        ``text`` unchanged when it is empty, whitespace-only, or already
        detected as ``target_lang``; otherwise whatever
        ``translator.translate`` returns (a string in this repository's
        examples).

    Raises:
        Whatever ``detect`` or ``translator.translate`` raise for non-blank
        input (e.g. detection failure on text without usable features, or
        network errors from the translation backend).
    """
    # Blank input has nothing to detect or translate. langdetect's detect()
    # raises LangDetectException ("No features in text") on such input, so
    # return it as-is instead of crashing.
    if not text or not text.strip():
        return text

    source_lang = detect(text)

    # Already in the requested language: skip the network round-trip.
    if source_lang == target_lang:
        return text

    translator = google_translator()
    return translator.translate(text, lang_src=source_lang, lang_tgt=target_lang)
DetectorFactory 2 | DetectorFactory.seed = 0 3 | 4 | # show probabilities for the top languages 5 | from langdetect import detect_langs 6 | 7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika" 8 | 9 | print(detect_langs(sentence)) -------------------------------------------------------------------------------- /translate_language.py: -------------------------------------------------------------------------------- 1 | # translate a sentence from Swahili to English 2 | 3 | from google_trans_new import google_translator 4 | 5 | translator = google_translator() 6 | 7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika" 8 | translate_text = translator.translate(sentence,lang_tgt='en') 9 | 10 | print(translate_text) 11 | --------------------------------------------------------------------------------