├── .gitattributes
├── .ipynb_checkpoints
    ├── LANGUAGE DETECTION AND TRANSLATION-checkpoint.ipynb
    ├── README-checkpoint.md
    └── translate_language-checkpoint.py
├── LANGUAGE DETECTION AND TRANSLATION.ipynb
├── LICENSE
├── README.md
├── detect_and_translate.py
├── detect_language.py
├── language_probabilities.py
└── translate_language.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/LANGUAGE DETECTION AND TRANSLATION-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from langdetect import DetectorFactory\n",
 10 |     "DetectorFactory.seed = 0"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [
 18 |     {
 19 |      "data": {
 20 |       "text/plain": [
 21 |        "'sw'"
 22 |       ]
 23 |      },
 24 |      "execution_count": 2,
 25 |      "metadata": {},
 26 |      "output_type": "execute_result"
 27 |     }
 28 |    ],
 29 |    "source": [
 30 |     "# Detect the language of the sentence\n",
 31 |     "from langdetect import detect\n",
 32 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 33 |     "detect(sentence)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 3,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "data": {
 43 |       "text/plain": [
 44 |        "[sw:0.9999971210408874]"
 45 |       ]
 46 |      },
 47 |      "execution_count": 3,
 48 |      "metadata": {},
 49 |      "output_type": "execute_result"
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "# show probabilities for the top languages\n",
 54 |     "from langdetect import detect_langs\n",
 55 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 56 |     "detect_langs(sentence)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 4,
 62 |    "metadata": {
 63 |     "scrolled": true
 64 |    },
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "Tanzania is a leading tourist destination in Africa \n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "# translate a sentence from Swahili to English\n",
 76 |     "\n",
 77 |     "from google_trans_new import google_translator  \n",
 78 |     "\n",
 79 |     "translator = google_translator()\n",
 80 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 81 |     "translate_text = translator.translate(sentence,lang_tgt='en')  \n",
 82 |     "print(translate_text)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 5,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from langdetect import detect\n",
 92 |     "from google_trans_new import google_translator  \n",
 93 |     "\n",
 94 |     "#simple function to detect and translate text \n",
 95 |     "def detect_and_translate(text,target_lang):\n",
 96 |     "    \n",
 97 |     "    result_lang = detect(text)\n",
 98 |     "    \n",
 99 |     "    if result_lang == target_lang:\n",
100 |     "        return text \n",
101 |     "    \n",
102 |     "    else:\n",
103 |     "        translator = google_translator()\n",
104 |     "        translate_text = translator.translate(text,lang_src=result_lang,lang_tgt=target_lang)\n",
105 |     "        return translate_text \n",
106 |     "        "
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 6,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "Natumai kuwa, nitakapojiwekea akiba, nitaweza kusafiri kwenda Mexico \n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "sentence = \"I hope that, when I’ve built up my savings, I’ll be able to travel to Mexico\"\n",
124 |     "print(detect_and_translate(sentence,target_lang='sw'))"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 3,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": []
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": []
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": []
147 |   }
148 |  ],
149 |  "metadata": {
150 |   "kernelspec": {
151 |    "display_name": "Python 3",
152 |    "language": "python",
153 |    "name": "python3"
154 |   },
155 |   "language_info": {
156 |    "codemirror_mode": {
157 |     "name": "ipython",
158 |     "version": 3
159 |    },
160 |    "file_extension": ".py",
161 |    "mimetype": "text/x-python",
162 |    "name": "python",
163 |    "nbconvert_exporter": "python",
164 |    "pygments_lexer": "ipython3",
165 |    "version": "3.8.3"
166 |   }
167 |  },
168 |  "nbformat": 4,
169 |  "nbformat_minor": 4
170 | }
171 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/README-checkpoint.md:
--------------------------------------------------------------------------------
1 | # Detect and Translate Text Data
2 |  How to detect language and translate text data into the language of your choice when working on an NLP project
3 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/translate_language-checkpoint.py:
--------------------------------------------------------------------------------
 1 | # translate a sentence from Swahili to English
 2 | 
 3 | from google_trans_new import google_translator  
 4 | 
 5 | translator = google_translator()
 6 | 
 7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
 8 | translate_text = translator.translate(sentence,lang_tgt='en')  
 9 | 
10 | print(translate_text)
11 | 


--------------------------------------------------------------------------------
/LANGUAGE DETECTION AND TRANSLATION.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from langdetect import DetectorFactory\n",
 10 |     "DetectorFactory.seed = 0"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [
 18 |     {
 19 |      "data": {
 20 |       "text/plain": [
 21 |        "'sw'"
 22 |       ]
 23 |      },
 24 |      "execution_count": 2,
 25 |      "metadata": {},
 26 |      "output_type": "execute_result"
 27 |     }
 28 |    ],
 29 |    "source": [
 30 |     "# Detect the language of the sentence\n",
 31 |     "from langdetect import detect\n",
 32 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 33 |     "detect(sentence)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 3,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "data": {
 43 |       "text/plain": [
 44 |        "[sw:0.9999971210408874]"
 45 |       ]
 46 |      },
 47 |      "execution_count": 3,
 48 |      "metadata": {},
 49 |      "output_type": "execute_result"
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "# show probabilities for the top languages\n",
 54 |     "from langdetect import detect_langs\n",
 55 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 56 |     "detect_langs(sentence)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 4,
 62 |    "metadata": {
 63 |     "scrolled": true
 64 |    },
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "Tanzania is a leading tourist destination in Africa \n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "# translate a sentence from Swahili to English\n",
 76 |     "\n",
 77 |     "from google_trans_new import google_translator  \n",
 78 |     "\n",
 79 |     "translator = google_translator()\n",
 80 |     "sentence = \"Tanzania ni nchi inayoongoza kwa utalii barani afrika\"\n",
 81 |     "translate_text = translator.translate(sentence,lang_tgt='en')  \n",
 82 |     "print(translate_text)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 5,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from langdetect import detect\n",
 92 |     "from google_trans_new import google_translator  \n",
 93 |     "\n",
 94 |     "#simple function to detect and translate text \n",
 95 |     "def detect_and_translate(text,target_lang):\n",
 96 |     "    \n",
 97 |     "    result_lang = detect(text)\n",
 98 |     "    \n",
 99 |     "    if result_lang == target_lang:\n",
100 |     "        return text \n",
101 |     "    \n",
102 |     "    else:\n",
103 |     "        translator = google_translator()\n",
104 |     "        translate_text = translator.translate(text,lang_src=result_lang,lang_tgt=target_lang)\n",
105 |     "        return translate_text \n",
106 |     "        "
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 6,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "Natumai kuwa, nitakapojiwekea akiba, nitaweza kusafiri kwenda Mexico \n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "sentence = \"I hope that, when I’ve built up my savings, I’ll be able to travel to Mexico\"\n",
124 |     "print(detect_and_translate(sentence,target_lang='sw'))"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 3,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": []
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": []
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": []
147 |   }
148 |  ],
149 |  "metadata": {
150 |   "kernelspec": {
151 |    "display_name": "Python 3",
152 |    "language": "python",
153 |    "name": "python3"
154 |   },
155 |   "language_info": {
156 |    "codemirror_mode": {
157 |     "name": "ipython",
158 |     "version": 3
159 |    },
160 |    "file_extension": ".py",
161 |    "mimetype": "text/x-python",
162 |    "name": "python",
163 |    "nbconvert_exporter": "python",
164 |    "pygments_lexer": "ipython3",
165 |    "version": "3.8.3"
166 |   }
167 |  },
168 |  "nbformat": 4,
169 |  "nbformat_minor": 4
170 | }
171 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Davis David
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Detect and Translate Text Data
2 | 
3 |  How to detect language and translate text data into the language of your choice when working on an NLP project
4 | 


--------------------------------------------------------------------------------
/detect_and_translate.py:
--------------------------------------------------------------------------------
 1 | from langdetect import DetectorFactory
 2 | DetectorFactory.seed = 0
 3 | 
 4 | from langdetect import detect
 5 | from google_trans_new import google_translator  
 6 | 
 7 | #simple function to detect and translate text 
 8 | def detect_and_translate(text,target_lang):
 9 |     
10 |     result_lang = detect(text)
11 |     
12 |     if result_lang == target_lang:
13 |         return text 
14 |     
15 |     else:
16 |         translator = google_translator()
17 |         translate_text = translator.translate(text,lang_src=result_lang,lang_tgt=target_lang)
18 |         return translate_text 
19 |         
20 |         
21 | 
22 | # Example        
23 | sentence = "I hope that, when I’ve built up my savings, I’ll be able to travel to Mexico"
24 | 
25 | print(detect_and_translate(sentence,target_lang='sw'))
26 | 
27 | 
28 | # output: Natumai kwamba, nitakapojiwekea akiba, nitaweza kusafiri kwenda Mexico 


--------------------------------------------------------------------------------
/detect_language.py:
--------------------------------------------------------------------------------
1 | from langdetect import DetectorFactory
2 | DetectorFactory.seed = 0
3 | 
4 | # Detect the language of the sentence
5 | from langdetect import detect
6 | 
7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
8 | 
9 | print(detect(sentence))


--------------------------------------------------------------------------------
/language_probabilities.py:
--------------------------------------------------------------------------------
1 | from langdetect import DetectorFactory
2 | DetectorFactory.seed = 0
3 | 
4 | # show probabilities for the top languages
5 | from langdetect import detect_langs
6 | 
7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
8 | 
9 | print(detect_langs(sentence))


--------------------------------------------------------------------------------
/translate_language.py:
--------------------------------------------------------------------------------
 1 | # translate a sentence from Swahili to English
 2 | 
 3 | from google_trans_new import google_translator  
 4 | 
 5 | translator = google_translator()
 6 | 
 7 | sentence = "Tanzania ni nchi inayoongoza kwa utalii barani afrika"
 8 | translate_text = translator.translate(sentence,lang_tgt='en')  
 9 | 
10 | print(translate_text)
11 | 


--------------------------------------------------------------------------------