├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── README_en.md ├── docs ├── APIArlein.md ├── Docker.md ├── GosodiadArferol.md ├── Hyfforddi.md ├── RhedegMoses.md ├── Training.md └── demo1.md ├── license.md ├── scripts ├── moses.py ├── mtdk │ ├── mt_download_engine.sh │ ├── mt_filter_for_mixedcase.py │ ├── mt_update_compress_moses_ini.py │ ├── mtdk-00-prepare-new-engine.sh │ ├── mtdk-01-prepare-corpus.sh │ ├── mtdk-02-train-language-model.sh │ ├── mtdk-02-train-recaser-model.sh │ ├── mtdk-03-train-translation-engine.sh │ ├── mtdk-04-compress-translation-engine-ram.sh │ └── mtdk-05-package.sh └── python-server.py └── tut └── demo1.py /.gitignore: -------------------------------------------------------------------------------- 1 | dockerfiles/moses-smt/*.sh 2 | dockerfiles/moses-smt/*.py 3 | tut/API_KEY 4 | moses-models/* 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | # Copyright (c) 2015 Prifysgol Bangor University 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | # Datblygwyr / Developers: 24 | # Dewi Bryn Jones, Patrick Robertson 25 | # 26 | # Rhagor / Further Information: 27 | # http://techiaith.cymru/cyfieithu/cyfieithu-peirianyddol/ 28 | # 29 | FROM ubuntu:16.04 30 | MAINTAINER Uned Technolegau Iaith, Prifysgol Bangor / Language Technologies Unit, Bangor University 31 | 32 | #ARG DEBIAN_FRONTEND=noninteractive 33 | #ENV TZ=Europe/London 34 | 35 | RUN apt-get update && apt-get install -q -y --no-install-recommends \ 36 | unzip \ 37 | make \ 38 | g++ \ 39 | wget \ 40 | git \ 41 | locales \ 42 | mercurial \ 43 | bzip2 \ 44 | autotools-dev \ 45 | automake \ 46 | locales \ 47 | libtool \ 48 | zlib1g-dev \ 49 | libbz2-dev \ 50 | libboost-all-dev \ 51 | libxmlrpc-core-c3-dev \ 52 | libxmlrpc-c++8-dev \ 53 | python3-pip \ 54 | python3-setuptools \ 55 | python3-dev \ 56 | && apt-get clean \ 57 | && rm -rf /var/lib/apt/lists/* 58 | 59 | RUN sed -i -e 's/# cy_GB.UTF-8 UTF-8/cy_GB.UTF-8 UTF-8/' /etc/locale.gen && \ 60 | sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \ 61 | dpkg-reconfigure --frontend=noninteractive locales && \ 62 | update-locale LANG=cy_GB.UTF-8 63 | 64 | ENV LANG cy_GB.UTF-8 65 | 66 | RUN pip3 install cherrypy==8.0.1 67 | RUN pip3 install python-Levenshtein 68 | 69 | RUN mkdir -p /home/moses 70 | RUN mkdir -p /home/moses/moses-models 71 | RUN mkdir -p /home/moses/moses-smt 72 | 73 | ENV HOME /home/moses 74 | 75 | ADD scripts/ /home/moses/moses-smt 76 | 77 | WORKDIR /home/moses 78 | 79 | # lawrlwytho/download snapshot RELEASE-3.0 moses 80 | RUN wget https://github.com/moses-smt/mosesdecoder/archive/RELEASE-3.0.zip 81 | RUN unzip RELEASE-3.0.zip 82 | RUN rm RELEASE-3.0.zip 83 | RUN mv mosesdecoder-RELEASE-3.0 mosesdecoder 84 | 85 | RUN wget -O giza-pp.zip http://github.com/moses-smt/giza-pp/archive/228a39b94ff61f41f36a15ce0194dadc69dc0e36.zip 86 | RUN unzip giza-pp.zip 87 | RUN rm giza-pp.zip 88 | RUN mv giza-pp-228a39b94ff61f41f36a15ce0194dadc69dc0e36 giza-pp 89 | WORKDIR /home/moses/giza-pp 90 | RUN make 91 | 92 | WORKDIR /home/moses 93 | 94 | RUN mkdir external-bin-dir 95 | RUN cp giza-pp/GIZA++-v2/GIZA++ external-bin-dir 96 | RUN cp giza-pp/GIZA++-v2/snt2cooc.out external-bin-dir 97 | RUN cp giza-pp/mkcls-v2/mkcls external-bin-dir 98 | 99 | #RUN wget -O cmph-2.0.tar.gz http://downloads.sourceforge.net/project/cmph/cmph/cmph-2.0.tar.gz 100 | RUN wget -O cmph-2.0.tar.gz http://techiaith.cymru/moses/downloads/cmph-2.0.tar.gz 101 | RUN tar zxvf cmph-2.0.tar.gz 102 | 103 | WORKDIR /home/moses/cmph-2.0 104 | RUN ./configure 105 | RUN make 106 | RUN make install 107 | WORKDIR /home/moses 108 | 109 | #RUN wget -O irstlm-5.80.08.tgz http://downloads.sourceforge.net/project/irstlm/irstlm/irstlm-5.80/irstlm-5.80.08.tgz 110 | RUN wget -O irstlm-5.80.08.tgz http://techiaith.cymru/moses/downloads/irstlm-5.80.08.tgz 111 | RUN tar zxvf irstlm-5.80.08.tgz 112 | 113 | WORKDIR /home/moses/irstlm-5.80.08/trunk 114 | RUN /bin/bash -c "source regenerate-makefiles.sh" 115 | RUN ./configure -prefix=/home/moses/irstlm 116 | RUN make 117 | RUN make install 118 | 119 | WORKDIR /home/moses 120 | 121 | # Adeiladu mosesdecoder 122 | ENV IRSTLM /home/moses/irstlm 123 | WORKDIR /home/moses/mosesdecoder 124 | 125 | RUN ./bjam -a --with-irstlm=/home/moses/irstlm --serial --with-xmlrpc-c=/usr/ --with-cmph=/home/moses/cmph-2.0 126 | 127 | WORKDIR /home/moses/moses-smt 128 | 129 | EXPOSE 8008 130 | EXPOSE 8080 131 | 132 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: build 2 | 3 | build: 4 | docker build --rm -t techiaith/moses-smt . 5 | 6 | run: 7 | docker run --name moses-smt -it \ 8 | -v ${PWD}/moses-models:/home/moses/moses-models \ 9 | -p 8080:8080 \ 10 | techiaith/moses-smt bash 11 | 12 | stop: 13 | -docker stop moses-smt 14 | -docker rm moses-smt 15 | 16 | 17 | clean: stop 18 | -docker rmi techiaith/moses-smt 19 | 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [>> English README](README_en.md) 2 | 3 | # Moses-SMT hefo Docker 4 | 5 | Os yw Docker wedi ei osod ar eich cyfrifiadur [Get Started with Docker](https://docs.docker.com/windows/) yna dyma modd hwylus iawn i chi osod a ddefnyddio cyfieithu peirianyddol Cymraeg gyda Moses-SMT . 6 | 7 | Dim ond 2 orchymun sydd ei angen. 8 | 9 | ### Gorchymyn 1 : Gosod Moses-SMT 10 | 11 | ``` 12 | $ docker pull techiaith/moses-smt 13 | ``` 14 | 15 | Bydd hyn yn llwytho ac yn gosod isadeiledd cyfieithu peirianyddol o fewn eich system Docker. 16 | 17 | ### Gorchymun 2 : Cychwyn Peiriant Cyfieithu o’ch Ddewis 18 | 19 | Mae’r Uned Technolegau Iaith wedi creu peiriannau cyfieithu ar sail hyfforddi gyda data rydym wedi’i gasglu o ffynonellau agored a chyhoeddus, megis Cofnod y Cynulliad a’r Ddeddfwriaeth ar-lein. 20 | 21 | Mae gan y peiriannau enwau a chyfeiriadau cyfieithu penodol. Yr enw ar y peiriant a hyfforddwyd gyda chofnodion y Cynulliad yw ‘CofnodYCynulliad’ a’r enw ar gyfer peiriant y corpws deddfwriaeth yw ‘Deddfwriaeth’. 22 | 23 | Dyma’r ail orchymyn, gan ddewis peiriant ‘CofnodYCynulliad’ a’i osod i gyfieithu o’r Saesneg i’r Gymraeg : 24 | 25 | ``` 26 | $ docker run --name moses-smt-cofnodycynulliad-en-cy -p 8080:8080 -p 8008:8008 techiaith/moses-smt start -e CofnodYCynulliad -s en -t cy 27 | ``` 28 | 29 | Bydd y system yn llwytho ffeil i lawr (tua 3Gb mewn maint yn achos peiriant CofnodYCynulliad) cyn iddo gadarnhau ei fod yn barod i dderbyn ceisiadau i’w cyfieithu. 30 | 31 | Os agorwch chi eich porwr a mynd at [http://localhost:8008](http://localhost:8008), dylai ffurflen syml ymddangos er mwyn i chi wirio a yw’r peiriant yn gweithio ai peidio. 32 | 33 | 34 | # Gosod a rhedeg o GitHub 35 | 36 | Mae modd i chi llwytho i lawr o GitHub ac addasu'r adnoddau hyn: 37 | 38 | ```sh 39 | $ git clone https://github.com/porthtechnolegauiaith/moses-smt 40 | $ cd moses-smt 41 | $ make 42 | ``` 43 | 44 | ac yna i redeg peiriant cyfieithu parod: 45 | 46 | ```sh 47 | $ make run 48 | $ python moses.py start -e moses-smt-cofnodycynulliad -e CofnodYCynulliad -s en -t cy 49 | ``` 50 | 51 | Bydd y rhith weinydd Docker yn ymateb i geisiadau JSON ar borth 8008 yn ogystal i XMLRPC ar borth 8080. 52 | 53 | # Hyfforddi Modelau Cyfieithu Newydd 54 | 55 | Mae modd hyfforddi peiriannau cyfieithu Moses-SMT eich hunain una ai gan ddefnyddio data gan yr Uned Technolegau Iaith, neu gyda'ch eich data eich hun. 56 | 57 | Gall hyfforddi eich peiriant eich hun gynnig y cyfle i greu peiriant cyfieithu sy'n adlewyrchu eich anghenion arbenigol chi o fewn pau benodol. Er enghraifft, os ydych yn gweithio ym maes cyllid, byddai'n bosib hyfforddi'r peiriant i fod yn arbennig o effeithiol wrth gyfieithu cywair yn y maes hwn, gan gynnwys termau a chystrawen arbenigol y maes. 58 | 59 | Gweler [Creu Peiriannau Moses-SMT](https://github.com/PorthTechnolegauIaith/moses-smt/blob/master/docs/Hyfforddi.md) 60 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # Moses-SMT with Docker 2 | 3 | If you have Docker installed on your computer [Get Started with Docker](https://docs.docker.com/windows/) then there is a very easy method by which you can install and use machine translation engines locally. There are only two commands involved. 4 | 5 | ### Command 1 : Installing Moses-SMT 6 | 7 | ``` 8 | $ docker pull techiaith/moses-smt 9 | ``` 10 | 11 | This will download and install a Moses machine translation system within your Docker environment. 12 | 13 | 14 | ### Command 2 : Start a Machine Translation Engine of your Choice 15 | 16 | The Language Technologies Unit have created machine translation engines that have been trained from bilingual data that we have collected from open and public sources, such as the Proceedings of the Welsh Assembly, the UK and Welsh Legislature website, as well as localisations of open source software. Each in turn provide domain specific machine translation capabilities. Each one is identified according to its Welsh name. Thus: 17 | 18 | - CofnodYCynulliad : as trained from the Welsh Assembly Proceedings 19 | - Deddfwriaeth : as trained from UK and Welsh legislature 20 | - Meddalwedd : as trained from localisations of various open source software projects. 21 | 22 | These names can be used in the second Docker command that will start (and fetch is necessary from the Welsh National Language Technologies Portal) an engine for a desired source and target language pairing: 23 | 24 | ``` 25 | $ docker run --name moses-smt-cofnodycynulliad-en-cy -p 8080:8080 -p 8008:8008 techiaith/moses-smt start -e CofnodYCynulliad -s en -t cy 26 | ``` 27 | 28 | In the case of CofnodYCynulliad, the engine may be a very large download - about 3Gb. 29 | 30 | Open your browser and browse to [http://localhost:8008](http://localhost:8008), where you should see a simple demo form that will help you check if the engine is working or not. 31 | 32 | # Installing and Running from GitHub 33 | 34 | To download and install from GitHub: 35 | 36 | ```sh 37 | $ git clone https://github.com/porthtechnolegauiaith/moses-smt 38 | $ cd moses-smt 39 | $ make 40 | ``` 41 | 42 | and then: 43 | 44 | ```sh 45 | $ make run 46 | $ python moses.py start -e CofnodYCynulliad -s en -t cy 47 | ``` 48 | 49 | The running Docker container will respond to JSON requests on port 8008 as well as XMLRPC on port 8080. 50 | 51 | # Train New Machine Translation Engines 52 | 53 | It's possible to train your onw Moses-SMT translation engines with data by the Language Technologies Unit or your own. 54 | 55 | Training your own translation machine could be an opportunity to create a machine that can reflect your specific needs within the field in which you work. For example, if you worked in finance, it would be possible to train your machine to be particularly effective at translating the register of this domain, including the field's own particular terminology and syntax. 56 | 57 | See [Create Moses-SMT Engines](https://github.com/PorthTechnolegauIaith/moses-smt/blob/master/docs/Training.md) 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/APIArlein.md: -------------------------------------------------------------------------------- 1 | 2 | [Scroll down for english](#moses_smt_machine_translation_online_api) 3 | 4 | # API Cyfieithu Peirianyddol Moses-SMT 5 | 6 | Mae swyddogaethau peiriannau cyfieithu y storfa hon ar gael dros y we o'r Canolfan APIs [gweler https://api.techiaith.org](https://api.techiaith.org). Mae'r API yn gweithio dros HTTPS GET, felly gellir defnyddio unrhyw iaith/meddalwedd HTTP er mwyn cysylltu at yr API. 7 | 8 | ## Tiwtorialau 9 | 10 | Mae [Twitorial 1](demo1.md) yn enghraifft o sut i ddefnyddio API cyfieithu peirianyddol i gyfieithu testun 11 | 12 | 13 | ## Fersiwn Cyfredol 14 | 15 | Mae un fersiwn o API Cyfieithu Peiraianyddol ar gael. v1 neu 'fersiwn 1'. 16 | Dychwelir enw'r fersiwn sy'n cael ei ddefnyddio yn y canlyniadau JSON. 17 | 18 | Mi fydd yr URL yn newid ar gyfer pob fersiwn newydd o'r API. Ar hyn o bryd, dylid defnyddio `/v1` ar gyfer fersiwn 1. 19 | 20 | ## Sgema 21 | 22 | Mae cysylltu â'r API yn gweithio dros HTTPS yn unig, gan ddefnyddio'r parth `api.techiaith.org/translate`. Mae'r holl ddata sy'n cael ei dderbyn/anfon yn cael ei drosglwyddo ar ffurf JSON ([unicode-escaped ASCII](http://tools.ietf.org/html/rfc5137)). 23 | 24 | ## Paramedrau 25 | 26 | | Paramedr | Disgrifiad | Sylwadau | 27 | |--------------|------------|----------| 28 | | `api_key` | Eich allwedd API, ar gael o'r Canolfan APIs (https://api.techiaith.org) | angenrheidiol | 29 | | `q` | Y testun i'w gyfieithu. Wedi ei fformatio yn ôl RFC 3986 (percent-encoded) | angenrheidiol | 30 | | `engine` | Enw'r peiriant rydych am defnyddio i gyfieithu'n peirianyddol. Y dewis yw : CofnodYCynulliad neu Deddfwriaeth | angenrheidiol | 31 | | `source` | Iaith y testun ffynhonnell. Dewis o `en` neu `cy`. | angenrheidiol | 32 | | `target` | Iaith ar gyfer y cyfieithiad sy'n cael ei ddychwelyd yn ôl. Dewis o `en` neu `cy`. | angenrheidiol | 33 | | `callback` | Enw 'function' ar gyfer unrhyw callback JSON-P (gweler isod) | dewisiol | 34 | 35 | ### Enghriafft 36 | 37 | ``` 38 | $ curl https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 39 | 40 | { 41 | "success": true, 42 | "translations": [ 43 | {"translatedText": "A wnaiff y Gweinidog wneud datganiad"}, 44 | ], 45 | "version": 1 46 | } 47 | ``` 48 | 49 | ## JSON-P Callbacks 50 | 51 | Gellir defnyddio'r API gyda JSON-P callbacks trwy ychwanegu'r paramedr `callback` i'ch galwad: 52 | 53 | ``` 54 | $ curl https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy&callback=foo 55 | foo({ 56 | "success": true, 57 | "translations": [ 58 | {"translatedText": "A wnaiff y Gweinidog wneud datganiad"}, 59 | ], 60 | "version": 1 61 | }); 62 | ``` 63 | 64 | 65 | ## Cyfyngu nifer yr alwadau yr awr 66 | 67 | Mae gan yr API gyfyngiad ar nifer yr alwadau y gellir eu gwneud mewn awr. 68 | 69 | Os ydych eisiau cynyddu nifer y galwadau at yr API sydd gennych, cysylltwch â ni. 70 | 71 | Gellir gweld cyfanswm nifer eich galwadau ar unrhyw adeg drwy edrych ar y 'HTTP headers' yn eich galwad API: 72 | 73 | ``` 74 | $ curl -i https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 75 | 76 | HTTP/1.1 200 OK 77 | Date: Mon, 17 Nov 2014 14:41:21 GMT 78 | Content-Type: application/json 79 | Content-Language: cy 80 | X-Ratelimit-Remaining: 276 81 | X-Ratelimit-Limit: 24 82 | X-Ratelimit-Reset: 1416237399 83 | ``` 84 | 85 | Mae'r headers yn cynnwys yr holl wybodaeth sydd ei angen: 86 | 87 | | Enw'r Header | Disgrifiad | 88 | |--------------|------------| 89 | | X-RateLimit-Limit | Y nifer mwyaf o alwadau allwch chi eu gwneud mewn awr | 90 | | X-RateLimit-Remaining | Y nifer o alwadau sydd gennych ar ôl yn y 'blwch' cyfyngu presennol | 91 | | X-RateLimit-Reset | Yr amser y bydd y 'blwch' cyfyngu presennol yn cael ei ail-osod, mewn [eiliadau epoch UTC](http://en.wikipedia.org/wiki/Unix_time) | 92 | 93 | Os ydych chi angen yr amser mewn fformat gwahanol, gellir gwneud hyn gydag unrhyw iaith raglennu modern. Er engraifft, gellir gwneud hyn trwy gonsol eich porwr (gyda Javascript) a dychwelych gwrthrych 'Javascript Date'. 94 | 95 | 96 | ```javascript 97 | new Date(1416237399 * 1000) 98 | Date 2014-11-17T15:16:39.000Z 99 | ``` 100 | 101 | Ar ôl i chi fynd dros eich nifer mwyaf o alwadau yr awr, byddwch yn derbyn gwall gan y gweinydd (403 Forbidden): 102 | 103 | 104 | ``` 105 | curl -i https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 106 | 107 | HTTP/1.1 200 OK 108 | Date: Tue, 18 Nov 2014 10:45:10 GMT 109 | Content-Type: application/json 110 | X-Ratelimit-Limit: 300 111 | X-Ratelimit-Remaining: 0 112 | Content-Language: cy 113 | X-Ratelimit-Reset: 1416310586 114 | 115 | { 116 | "success": false, 117 | "errors": ["403 Forbidden: Rydych chi wedi mynd dros eich cyfyngiad nifer yr alwadau yr awr"] 118 | } 119 | ``` 120 | 121 | ------ 122 | 123 | # Moses SMT Machine Translation Online API 124 | This repository's machine translation capabilities are available also online from our API Centre [please see: https://api.techiaith.org/](https://api.techiaith.org). 125 | The API works using HTTPS GET, meaning you can use it with any programming language/software package of your choice which works over HTTP. 126 | 127 | ## Tutorials 128 | 129 | [Tutorial 1](demo1.md) is an example of how to use the Moses SMT Machine Translation Online API to translate text. 130 | 131 | 132 | 133 | ## Current Version 134 | 135 | Currently, there is only one version of the Moses SMT Machine Translation Online API available: v1 or 'version 1'. 136 | The version used for the request is returned in the JSON result. 137 | 138 | ## Schema 139 | 140 | The connection to the API is over HTTPS only, from the domain `api.techiaith.org/translate`. All data sent to and received from the API is in JSON ([unicode-escaped ASCII](http://tools.ietf.org/html/rfc5137)) 141 | 142 | ## API Parameters 143 | 144 | | Parameter | Description | Notes | 145 | |--------------|------------|----------| 146 | | `api_key` | Your API key, from the API Centre (https://api.techiaith.org) | required | 147 | | `q` | The text to be translated. Formatted according to RFC 3986 (percent-encoded) | required | 148 | | `engine` | The name of the engine to be used for translation. Choices are : `CofnodYCynulliad` or `Deddfwriaeth` | required | 149 | | `source` | The source text's language. Choices are `en` or `cy`. | required | 150 | | `target` | The target language for any translations returned by the API. Choices are `en` or `cy`. | required | 151 | | `callback` | Name of the function to wrap the response in for a JSON-P callback (see below) | optional | 152 | 153 | 154 | ### Example 155 | 156 | ``` 157 | $ curl https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 158 | 159 | { 160 | "success": true, 161 | "translations": [ 162 | {"translatedText": "A wnaiff y Gweinidog wneud datganiad"}, 163 | ], 164 | "version": 1 165 | } 166 | ``` 167 | 168 | ## JSON-P Callbacks 169 | 170 | You can use the API with JSON-P callbacks by adding the parameter `callback` to your request: 171 | 172 | ``` 173 | $ curl https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy&callback=foo 174 | foo({ 175 | "success": true, 176 | "translations": [ 177 | {"translatedText": "A wnaiff y Gweinidog wneud datganiad"}, 178 | ], 179 | "version": 1 180 | }); 181 | ``` 182 | 183 | 184 | ## Rate Limiting 185 | 186 | The API has a limit on the number of requests you can make per hour, linked to your API key. If you would like to increase the number of requests you can make to the API per hour, use the form within the 'API Centre'. 187 | 188 | You can view the number of requests you have made/have remaining at any time by looking at the 'HTTP headers' of any response to the API: 189 | 190 | ``` 191 | $ curl -i https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 192 | 193 | HTTP/1.1 200 OK 194 | Date: Mon, 17 Nov 2014 14:41:21 GMT 195 | Content-Type: application/json 196 | Content-Language: cy 197 | X-Ratelimit-Remaining: 276 198 | X-Ratelimit-Limit: 300 199 | X-Ratelimit-Reset: 1416237399 200 | ``` 201 | 202 | The headers contain all information you may require: 203 | 204 | | Header Name | Description | 205 | |--------------|------------| 206 | | X-RateLimit-Limit | Maximum number of requests you can make per hour (rate limit) | 207 | | X-RateLimit-Remaining | The number of requests remaining in the current rate limit window | 208 | | X-RateLimit-Reset | The time at which the current rate limit window resets in [UTC epoch seconds](http://en.wikipedia.org/wiki/Unix_time) | 209 | 210 | 211 | If you need the time in a different format, any modern programming language can get the job done. For example, if you open up the console on your web browser, you can easily get the reset time as a JavaScript Date object. 212 | 213 | ```javascript 214 | new Date(1416237399 * 1000) 215 | Date 2014-11-17T15:16:39.000Z 216 | ``` 217 | 218 | Once you go over the rate limit you will receive an error response: 219 | 220 | ``` 221 | $ curl -i curl https://api.techiaith.org/translate/v1/translate?api_key=123&q=Will+the+Minister+make+a+statement&engine=CofnodYCynulliad&source=en&target=cy 222 | 223 | HTTP/1.1 200 OK 224 | Date: Tue, 18 Nov 2014 10:44:37 GMT 225 | Content-Type: application/json 226 | X-Ratelimit-Limit: 300 227 | X-Ratelimit-Remaining: 0 228 | Content-Language: en 229 | X-Ratelimit-Reset: 1416310586 230 | 231 | { 232 | "success": false, 233 | "errors": ["403 Forbidden: You have exceeded your request limit"] 234 | } 235 | ``` 236 | -------------------------------------------------------------------------------- /docs/Docker.md: -------------------------------------------------------------------------------- 1 | # Moses-SMT gyda Docker 2 | 3 | Mae Docker yn dechnoleg pecynnu meddalwedd a hwyluso gosod ar gyfer Linux. 4 | Rydym wedi llwytho ein system Moses-SMT i gronfa docker.com er mwyn ei gwneud 5 | hi'n haws i chi berfformio cyfieithu peirianyddol rhwng y Gymraeg a'r Saesneg. 6 | 7 | Byddwch angen fersiwn mwy diweddar na 1.0.1 o Docker ar eich system Linux. Bydd 8 | hefyd angen Boot2Docker arnoch os hoffwch redeg eich peiriant ar gyfrifiadur 9 | Windows neu Mac OS X. 10 | 11 | ## Moses-SMT o docker.com 12 | Bydd y gorchymyn canlynol yn gosod popeth: 13 | 14 | ```sh 15 | docker pull techiaith/moses-smt 16 | ``` 17 | 18 | A dyma enghraifft o sut mae defnyddio Moses-SMT er mwyn rhedeg peiriant CofnodYCynulliad, sy'n 19 | cyfieithu o'r Saesneg i'r Gymraeg : 20 | 21 | ```sh 22 | docker run --name moses-smt-cofnodycynulliad-en-cy -p 8008:8008 -p 8080:8080 techiaith/moses-smt start -e CofnodYCynulliad -s en -t cy 23 | ``` 24 | 25 | Bydd modd mynd at http://127.0.0.1:8008 er mwyn gweld y peiriant ar waith. 26 | 27 | # Moses-SMT with Docker 28 | 29 | Docker is a software packaging and installation facilitator for Linux. 30 | We have loaded our Moses-SMT to docker.com in order to make it easier for 31 | you to perform machine translation between Welsh and English. 32 | You will need a version of Docker more recent than 1.0.1 on your Linux system. 33 | You will also need Boot2Docker if you want to run your engine on a Windows or 34 | Mac OS X computer. 35 | 36 | ## Moses-SMT from docker.com 37 | The following command will install everything: 38 | 39 | ```sh 40 | docker pull techiaith/moses-smt 41 | ``` 42 | 43 | Here's an example of how to use Moses-SMT to run the engine CofnodYCynulliad, 44 | which is set to translate from English to Welsh : 45 | 46 | ```sh 47 | docker run --name moses-smt-cofnodycynulliad-en-cy -p 8008:8008 -p 8080:8080 techiaith/moses-smt start -e CofnodYCynulliad -s en -t cy 48 | ``` 49 | 50 | To see the engine working, go to http://127.0.0.1:8008 -------------------------------------------------------------------------------- /docs/GosodiadArferol.md: -------------------------------------------------------------------------------- 1 | # Gosod Moses-SMT ar Linux 2 | Bydd y cyfarwyddiadau yma yn eich cynorthwyo i osod meddalwedd cyfieithu peiranyddol 3 | Moses yn uniongyrchol ar systemau Linux; gan gynnwys Debian, Ubuntu, CentOS a RedHat. 4 | 5 | Bydd gosodiad llawn o Moses yn eich caniatau i greu peiriannau cyfieithu eich hunain 6 | ar sail hyfforddi gyda'ch data eich hunain neu corpora a gasglwyd eisoes gan Uned 7 | Technolegau Iaith. Bydd modd hefyd eu rhedeg i'w defnyddio at dibenion eich 8 | gwaith cyfieithu. 9 | 10 | Rydym wedi symleiddio'r broses o osod Moses yn ddau gam syml iawn. 11 | 12 | ## Cam 1: Paratoi'r gweinydd 13 | Mae angen hawliau gweinyddwr ar eich system Linux. Mae'r cyfarwyddiadau canlynol 14 | yn canolbwyntio ar system Debian fel Ubuntu. 15 | Mae angen addasiadau bach ar gyfer CentOS/RedHat. 16 | 17 | Yn gyntaf, bydd rhaid i chi greu defnyddiwr 'moses' ar gyfer eich peiriant. 18 | 19 | ``` sh 20 | $ sudo adduser --home /home/moses moses 21 | $ sudo adduser moses sudo 22 | ``` 23 | All-gofnodwch, ac yna ail-fewngofnodwch fel eich defnyddiwr 'moses' newydd. 24 | 25 | Bydd angen gosod 'curl' er mwyn dechrau'r proses gosod: 26 | 27 | ``` sh 28 | moses@ubuntu:~$ sudo apt-get install curl 29 | ``` 30 | ## Cam 2: Gosod Moses 31 | 32 | Dim ond un gorchymyn syml sydd ei hangen i chi osod Moses : 33 | 34 | ```sh 35 | moses@ubuntu:~$ curl http://techiaith.cymru/moses/get/debian/install.sh | sh 36 | ``` 37 | 38 | Bydd hyn yn achosi i lwyth o negeseuon ymddangos ar y sgrin am gyfnod estynedig o amser. 39 | Ond bydd modd wedyn i chi un ai: 40 | 41 | - [hyfforddi gyda data corpora yr Uned Technolegau Iaith](Hyfforddi.md) 42 | - [hyfforddi gyda eich data corpora cyfochrog eich hunain](Hyfforddi.md#CorporaEichHun) 43 | - [rhedeg peiriant a grëwyd gan yr Uned Technolegau Iaith](RhedegMoses.md) 44 | 45 | ## Welsh English Machine Translation Moses-SMT 46 | 47 | These instructions will allow you to install Moses SMT on Linux systems; including Ubuntu, CentOS, RedHat and Raspberry Pi. They will describe how to 48 | create machines by training them with your own data, or using the corpora already collected by Bangor's Language Technologies Unit. They will also describe how to run your machine, and how to use it for translation. 49 | 50 | We have simplified the installation process into two very easy steps. 51 | 52 | ## Step 1: Preparing the Server 53 | You will need admin priviledges on your Linux system. The following instructions were written for the Debian system; which includes Ubuntu, Raspberry Pi a Mint. Small changes are needed for CentOS/RedHat. 54 | 55 | First, you will need to create a 'moses' user for your machine. 56 | 57 | ``` sh 58 | $ sudo adduser --home /home/moses moses 59 | $ sudo adduser moses sudo 60 | ``` 61 | 62 | Log out, then login again using your new 'moses' user . 63 | 64 | You'll need to enter a 'curl' in order to begin the installation process. 65 | 66 | ``` sh 67 | moses@ubuntu:~$ sudo apt-get install curl 68 | ``` 69 | 70 | Many messages will appear on the screen for a short period. 71 | 72 | ## Installing Moses-SMT 73 | 74 | You only need one simple command to install Moses : 75 | 76 | ```sh 77 | moses@ubuntu:~$ curl http://techiaith.cymru/moses/get/debian/install.sh | sh 78 | ``` 79 | 80 | Many messages will appear on the screen for some time. 81 | But you will then be able to either: 82 | 83 | - [train with the Language Technologies Unit's corpus data](Hyfforddi.md) 84 | - [train with the your own parallel corpus data](Hyfforddi.md#YourOwnCorpora) 85 | - [run a machine created by the Language Technologies Unit](RhedegMoses.md) 86 | -------------------------------------------------------------------------------- /docs/Hyfforddi.md: -------------------------------------------------------------------------------- 1 | # Creu Peiriannau Moses-SMT 2 | 3 | ## Hyfforddi gyda data corpora gan yr Uned Technolegau Iaith 4 | 5 | Os hoffwch chi hyfforddi gan ddefnyddio data gan yr Uned Technolegau Iaith, sy'n dod o ffynonellau megis 6 | Cofnod y Cynulliad a'r corpws Ddeddfwriaeth, yna mae tri gorchymyn syml i'w defnyddio. 7 | 8 | Yn gyntaf mae angen estyn y data rydym yn bwriadu ei ddefnyddio: 9 | 10 | ``` sh 11 | moses@ubuntu:~/moses-smt$ make run 12 | # python moses.py fetchcorpus -e CofnodYCynulliad 13 | ``` 14 | Pwrpas yr ail orchymyn yw hyfforddi a dynodi'r cyfeiriad cyfieithu (e.e. o Gymraeg i Saesneg, neu o Saesneg i'r Gymraeg). 15 | Byddwch angen cyfrifiadur gyda dros 4Gb o gof. Bydd y broses yn cymryd rhai oriau i gwblhau. 16 | 17 | I hyfforddi peiriant cyfieithu gan ddefnyddio data CofnodYCynulliad i gyfieithu o Saesneg i'r Gymraeg: 18 | 19 | ``` sh 20 | # python moses.py train -e CofnodYCynulliad -s en -t cy 21 | ``` 22 | 23 | Ac yna i'w gychwyn: 24 | 25 | ``` sh 26 | # python moses.py start -e CofnodYCynulliad -s en -t cy 27 | ``` 28 | 29 | [Gweler y dudalen canlynol](RhedegMoses.md) am rhagor ar sut gellir rhedeg Moses-SMT 30 | 31 | 32 | ## Hyfforddi gyda data corpora eich hun 33 | 34 | Rhaid gosod eich testun fel ffeiliau testun cyfochrog o fewn is-ffolder sydd 35 | wedi ei henwi ar ôl enw eich peiriant newydd; 36 | 37 | e.e. os yw'r data yn dod o hen gyfieithiadau 'Marchnata', yna defnyddiwch y gorchymyn canlynol : 38 | 39 | ```sh 40 | # mkdir -p moses-models/Marchnata/corpus 41 | ``` 42 | 43 | Rhowch y ffeil Cymraeg o fewn ffeil gyda enw'r corpws a'r estyniad '.cy'. 44 | 45 | Rhowch y data Saesneg o fewn ffeil gyda enw'r corpws a'r estyniad '.en'. 46 | 47 | ```sh 48 | # ls moses-models/Marchnata/corpus 49 | Marchnata.cy Marchnata.en 50 | ``` 51 | 52 | Mae'r broses hyfforddi yn debyg i'r camau ochod, gweler : 53 | 54 | ``` sh 55 | # python moses.py train -e Marchnata -s en -t cy 56 | ``` 57 | 58 | Ac yna i'w chychwyn: 59 | 60 | ``` sh 61 | # python moses.py start -e Marchnata -s en -t cy 62 | ``` 63 | 64 | 65 | -------------------------------------------------------------------------------- /docs/RhedegMoses.md: -------------------------------------------------------------------------------- 1 | # Rhedeg Moses-SMT 2 | Ar ôl i'r broses osod gwblhau'n lwyddianus, bydd modd i chi ddefnyddio Moses-SMT 3 | drwy ddefnyddio un script cyffredinol rydym wedi ei ddatblygu er mwyn hwyluso 4 | defnyddio ei holl nodweddion. Lleolir y script yn ~/moses-smt: 5 | 6 | ``` sh 7 | moses@ubuntu:~$ cd ~/moses-smt 8 | moses@ubuntu:~/moses-smt$ python moses.py 9 | usage: moses.py [-h] {fetchcorpus,fetchengine,train,start} ... 10 | moses.py: error: too few arguments 11 | ``` 12 | 13 | ## Rhedeg Peirianau'r Uned Technolegau Iaith 14 | Un o'r nodweddion y script yw estyn peiriant sydd wedi ei hyfforddi eisoes gan 15 | yr Uned a'i chychwyn. Y peiriannau (a'r data) sydd ar gael gan yr Uned i 16 | gyfieithu'n beirianyddol gyda Moses-SMT yw : 17 | 18 | * CofnodYCynulliad 19 | * Deddfwriaeth 20 | 21 | Felly er mwyn estyn peiriant parod, sy'n cyfieithu o Saesneg i Gymraeg, defnyddiwch 22 | i'r is-orchymun 'fetchengine'. Dyma manylion 'fetchengine' 23 | 24 | ```sh 25 | moses@ubuntu:~/moses-smt$ python moses.py fetchengine 26 | usage: moses.py fetchengine [-h] -e ENGINE_NAME -s SOURCE_LANG -t TARGET_LANG 27 | moses.py fetchengine: error: argument -e/--engine is required 28 | ``` 29 | 30 | Felly, er mwyn estyn peiriant 'CofnodYCynulliad' yr Uned, sy'n cyfieithu o Saesneg 31 | i'r Gymraeg, mae modd rhedeg y canlynol: 32 | 33 | ```sh 34 | moses@ubuntu:~/moses-smt$ python moses.py fetchengine -e CofnodYCynulliad -s en -t cy 35 | ``` 36 | 37 | Ac yna i'w gychwyn: 38 | 39 | ``` sh 40 | moses@ubuntu:~/moses-smt$ python moses.py start -e CofnodYCynulliad -s en -t cy 41 | ``` 42 | 43 | Bydd hyn yn achosi i weinydd Moses-SMT redeg gan ddisgwyl am negeseuon XMLRPC-C ar 44 | borth 8080. 45 | 46 | ## Gweinydd Dirprwyo HTTP 47 | Os rydych angen cyfathrebu gyda'r weinydd Moses-SMT o porwr (ac HTTP) yna bydd 48 | angen cychwyn weinydd dirprwyo: 49 | 50 | ```sh 51 | moses@ubuntu:~/moses-smt$ python server.py 52 | ``` 53 | 54 | Bydd modd mynd at http://127.0.0.1:8008 er mwyn gweld y peiriant ar waith. 55 | 56 | # Running Moses-SMT 57 | Once the installation process has been completed successfully, it will be 58 | possible for you to use Moses-SMT using a single general script that we've 59 | developed in order to make using all of its features easier. 60 | The script is located in ~/moses-smt: 61 | 62 | ``` sh 63 | moses@ubuntu:~$ cd ~/moses-smt 64 | moses@ubuntu:~/moses-smt$ python moses.py 65 | usage: moses.py [-h] {fetchcorpus,fetchengine,train,start} ... 66 | moses.py: error: too few arguments 67 | ``` 68 | ## Running the Unit's machine translation engines 69 | One feature of the script is the ability to fetch and start a machine that 70 | has already been trained by the Unit. The engines (and data) available from 71 | the Unit to machine translate with Moses-SMT are : 72 | 73 | * CofnodYCynulliad 74 | * Deddfwriaeth 75 | 76 | So in order to fetch a ready made engine, which can translate from English to 77 | Welsh, use the sub-command 'fetchengine'. Here are the details for 'fetchengine' 78 | 79 | ```sh 80 | moses@ubuntu:~/moses-smt$ python moses.py fetchengine 81 | usage: moses.py fetchengine [-h] -e ENGINE_NAME -s SOURCE_LANG -t TARGET_LANG 82 | moses.py fetchengine: error: argument -e/--engine is required 83 | ``` 84 | 85 | So, in order to fetch the unit's engine 'CofnodYCynulliad' which translates 86 | from Welsh to English, you can run the following: 87 | 88 | ```sh 89 | moses@ubuntu:~/moses-smt$ python moses.py fetchengine -e CofnodYCynulliad -s en -t cy 90 | ``` 91 | 92 | And to start it: 93 | 94 | ``` sh 95 | moses@ubuntu:~/moses-smt$ python moses.py start -e CofnodYCynulliad -s en -t cy 96 | ``` 97 | 98 | This will cause the Moses-SMT server to run while listening out for messages 99 | from XMLRPC-C on port 8080. 100 | 101 | ## HTTP Delegation Server 102 | If you need to communicate with the Moses-SMT server from a browser (and HTTP) then you will need to start a 103 | delegation server. 104 | 105 | ```sh 106 | moses@ubuntu:~/moses-smt$ python server.py 107 | ``` 108 | 109 | It should be possible to go to http://127.0.0.1:8008 in order to see the machine at work. 110 | -------------------------------------------------------------------------------- /docs/Training.md: -------------------------------------------------------------------------------- 1 | 2 | # Create Moses-SMT machines 3 | 4 | ## Training with data from mthe Language Technology Unit 5 | 6 | The Language Technologies Unit has data from sources like the Proceedings of the National Assembly and the Legislation corpus, that you can use with three simple commands to train your own translation engine: 7 | 8 | First, you need to fetch the data that we will be using: 9 | 10 | ``` sh 11 | moses@ubuntu:~/moses-smt$ make run 12 | # python moses.py fetchcorpus -e CofnodYCynulliad 13 | ``` 14 | 15 | The second command is to train and state the direction that the translation will take (i.e. Welsh to English or English to Welsh). 16 | You will need a computer that has at least more than 4GB of memory. The entire process will take a few hours to complete. 17 | 18 | In order to train a translation machine using the CofnodYCynulliad (the Proceedings of the National Assembly corpus) data that can translate from English to Welsh, you will need: 19 | 20 | ``` sh 21 | # python moses.py train -e CofnodYCynulliad -s en -t cy 22 | ``` 23 | 24 | And then to start it: 25 | 26 | ``` sh 27 | # python moses.py start -e CofnodYCynulliad -s en -t cy 28 | ``` 29 | 30 | [See the following page](RhedegMoses.md) for more on how to run Moses-SMT 31 | 32 | ## Training with your own corpus data 33 | You will need to set out your text as paralell files in a sub-folder which is named after your new machine; 34 | 35 | For example, if you have data from old translations of 'Marketing', then create : 36 | 37 | ```sh 38 | moses@ubuntu:~/moses-smt$ make run 39 | # mkdir -p moses-models/Marchnata/corpus 40 | ``` 41 | 42 | Place the Welsh file in a file with the corpus' name, and the extension'.cy'. Then, place the English data in a file which also has the corpus' name, but with the extension '.en'. 43 | 44 | ```sh 45 | # ls moses-models/Marchnata/corpus 46 | Marchnata.cy Marchnata.en 47 | ``` 48 | 49 | To train your own machine : 50 | 51 | ``` sh 52 | # python moses.py train -e Marchnata -s en -t cy 53 | ``` 54 | 55 | To start it : 56 | 57 | ``` sh 58 | # python moses.py start -e Marchnata -s en -t cy 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/demo1.md: -------------------------------------------------------------------------------- 1 | # Tiwtorial 1 2 | 3 | Cyfieithu testun gyda'r API Cyfieithu Peirianyddol Moses SMT Ar-lein. 4 | 5 | Mae'r ffeil `tut/demo1.py` yn enghraifft o sut i gyfieithu testun gyda'r API Cyfieithu Peirianyddol Moses SMT ar-lein. 6 | Er mwyn rhedeg y tiwtorial, dilynwch y camau isod: 7 | 8 | * `cd tut` 9 | * `python demo1.py -e CofnodYCynulliad -s en -t cy` (mae'n bosib rhoid 'Deddfwriaeth' neu 'Meddalwedd' hefyd ar gyfer -e) 10 | * Dilynwch y negeseuon ar y sgrin 11 | 12 | Gellier cyfieithu ffeil sydd ar eich cyfrifiadur eisioes trwy pasio enw'r ffeil fel -f i'r sgript tiwtorial: 13 | 14 | `python demo1.py -e CofnodYCynulliad -s en -t cy -f ffeil_iw_gwirio.txt` 15 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Prifysgol Bangor University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/moses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from argparse import ArgumentParser 5 | 6 | DEFAULT_TRAINING_DATA_URL = "http://techiaith.cymru/corpws/Moses" 7 | DEFAULT_ENGINES_URL = "http://techiaith.cymru/moses/3.0" 8 | 9 | HOME = os.environ.get('HOME') 10 | MOSES_HOME = os.environ.get('MOSES_HOME', HOME) 11 | MOSESMODELS_HOME = os.path.join(MOSES_HOME, "moses-models") 12 | MTDK_HOME = os.path.join(os.path.dirname(__file__), "mtdk") 13 | SERVER_HOME = os.path.join(os.path.dirname(__file__)) 14 | 15 | # host/port to hose this server (for access thru browsers) 16 | HOST = '0.0.0.0' 17 | PORT = 8008 18 | 19 | # host/post for your Moses XMLRPC server 20 | MOSES_HOST = 'http://0.0.0.0' 21 | MOSES_PORT = 8080 22 | RECASER_PORT = 8081 23 | MOSES_URL = MOSES_HOST + ":" + str(MOSES_PORT) + "/RPC2" 24 | RECASER_URL = MOSES_HOST + ":" + str(RECASER_PORT) + "/RPC2" 25 | 26 | DESCRIPTION = """Sgript Hyfforddi a chychwyn rhedeg system cyfieithu peirianyddol Moses 27 | © Prifysgol Bangor University 28 | """ 29 | 30 | class MosesRunError(Exception): 31 | pass 32 | 33 | def run_commands(cmds): 34 | for cmd in cmds: 35 | cmd = u" ".join(cmd) 36 | print("Rhedeg %s" % cmd) 37 | returncode = os.system(cmd) 38 | if returncode != 0: 39 | exception_str = ["Problem yn rehedeg y gorchymyn:", " %s" % cmd] 40 | raise MosesRunError(u"\n".join(exception_str)) 41 | 42 | def script_path(script): 43 | path = os.path.join(MTDK_HOME, script) 44 | if not os.path.exists(path): 45 | raise MosesRunError("Nid yw'r path '%s' yn bodoli.\nYdych chi wedi gosod y ffeiliau i gyd yn iawn?" % path) 46 | return path 47 | 48 | def fetchcorpus(engine_name, **args): 49 | """Lawrlwytho data corpws / Download corpus data""" 50 | 51 | data_url = os.path.join(DEFAULT_TRAINING_DATA_URL, engine_name, "%s.tar.gz" % engine_name) 52 | 53 | prepare_engine_cmd = [script_path("mtdk-00-prepare-new-engine.sh"), "-h", MOSESMODELS_HOME, "-e", engine_name, "-u", data_url] 54 | #prepare_corpus_cmd = [script_path("mtdk-01-prepare-corpus.sh"), "-m", MOSES_HOME, "-h", MOSESMODELS_HOME, "-e", engine_name] 55 | 56 | run_commands([prepare_engine_cmd]) 57 | 58 | def fetchengine(engine_name, source_lang, target_lang, **args): 59 | """Lawrlwytho peiriant cyfieithu o techiaith.cymru / Download a translation engine from techiaith.cymru""" 60 | 61 | download_engine_cmd = [script_path("mt_download_engine.sh"),"-m", MOSES_HOME, "-h", MOSESMODELS_HOME, "-e", engine_name, "-s", source_lang, "-t", target_lang] 62 | 63 | run_commands([download_engine_cmd]) 64 | 65 | def train(engine_name, ngram_size, source_lang, target_lang, **args): 66 | """Hyfforddi model iaith Moses / Train Moses' language model""" 67 | script_params = ["-m", MOSES_HOME, "-h", MOSESMODELS_HOME, "-e", engine_name] 68 | str_ngram_size = str(ngram_size) 69 | 70 | prepare_corpus_cmd = [script_path("mtdk-01-prepare-corpus.sh"), "-m", MOSES_HOME, "-h", MOSESMODELS_HOME, "-e", engine_name, "-s", source_lang, "-t", target_lang] 71 | train_lang_model_cmd = [script_path("mtdk-02-train-language-model.sh")] + script_params + ["-t", target_lang] 72 | train_recaser_model_cmd = [script_path("mtdk-02-train-recaser-model.sh")] + script_params + ["-t", target_lang] 73 | train_translation_cmd = [script_path("mtdk-03-train-translation-engine.sh")] + script_params + ["-n", str_ngram_size, "-s", source_lang, "-t", target_lang] 74 | compress_translation_cmd = [script_path("mtdk-04-compress-translation-engine-ram.sh")] + script_params + ["-s", source_lang, "-t", target_lang] 75 | 76 | run_commands([prepare_corpus_cmd, train_lang_model_cmd, train_recaser_model_cmd, train_translation_cmd, compress_translation_cmd]) 77 | 78 | def package(engine_name, source_lang, target_lang, **args): 79 | """Pecynnu'r peiriant i ffeil tar.gz er mwyn hwyluso copio i gyfrifiadur arall / Package the an engine to a tar.gz file to make copying to other computers easier""" 80 | script_params = ["-m", MOSES_HOME, "-h", MOSESMODELS_HOME, "-e", engine_name, "-s", source_lang, "-t", target_lang] 81 | 82 | package_cmd = [script_path("mtdk-05-package.sh")] + script_params 83 | 84 | run_commands([package_cmd]) 85 | 86 | def start(engine_name, source_lang, target_lang, **args): 87 | 88 | """Cychwyn y gweinydd Moses / Start the Moses Server""" 89 | moses_recaser_server_cmd = [os.path.join(MOSES_HOME, "mosesdecoder","bin","mosesserver"), \ 90 | "-f", os.path.join(MOSESMODELS_HOME, engine_name, "recaser", target_lang, "moses.ini"),\ 91 | "--server-port",str(RECASER_PORT),\ 92 | "&"] 93 | 94 | source_target_lang = "%s-%s" % (source_lang, target_lang) 95 | moses_server_cmd = [os.path.join(MOSES_HOME, "mosesdecoder", "bin", "mosesserver"), \ 96 | "-f",\ 97 | os.path.join(MOSESMODELS_HOME, engine_name, source_target_lang, "engine", "model", "moses.ini"),\ 98 | "&"] 99 | 100 | python_server_cmd = ["python", os.path.join(SERVER_HOME,"python-server.py"), \ 101 | "-pretty",\ 102 | "-verbose","1", \ 103 | "-port",str(PORT),\ 104 | "-ip",HOST,\ 105 | "-slang",source_lang,\ 106 | "-tlang",target_lang,\ 107 | "-mosesurl","\"" + MOSES_URL + "\"",\ 108 | "-recaserurl","\"" + RECASER_URL + "\"",\ 109 | "-moseshome","\"" + MOSES_HOME + "\"" ] 110 | 111 | run_commands([moses_server_cmd, moses_recaser_server_cmd, python_server_cmd]) 112 | 113 | 114 | if __name__ == "__main__": 115 | 116 | parser = ArgumentParser(description=DESCRIPTION) 117 | subparsers = parser.add_subparsers(title="Is-gorchmynion", description="Is-gorchmynion dilys", help="a ddylid llwytho, hyfforddi ynte cychwyn y peiriant") 118 | fetchparser = subparsers.add_parser('fetchcorpus') 119 | fetchparser.add_argument('-e', '--engine', dest="engine_name", required=True, help="enw i'r peiriant cyfieithu benodol") 120 | fetchparser.set_defaults(func=fetchcorpus) 121 | 122 | fetchengineparser = subparsers.add_parser('fetchengine') 123 | fetchengineparser.add_argument('-e','--engine', dest="engine_name", required=True, help="enw i'r peiriant cyfieithu benodol") 124 | fetchengineparser.add_argument('-s', '--sourcelang', dest="source_lang", required=True, help="iaith ffynhonnell") 125 | fetchengineparser.add_argument('-t', '--targetlang', dest="target_lang", required=True, help="iaith targed") 126 | fetchengineparser.set_defaults(func=fetchengine) 127 | 128 | trainparser = subparsers.add_parser('train') 129 | trainparser.add_argument('-e', '--engine', dest="engine_name", required=True, help="enw i'r peiriant cyfieithu benodol") 130 | trainparser.add_argument('-n', '--ngramsize', dest="ngram_size", default=3, help="maint ngrams - 3,4,5,6 neu 7") 131 | trainparser.add_argument('-s', '--sourcelang', dest="source_lang", required=True, help="iaith ffynhonnell") 132 | trainparser.add_argument('-t', '--targetlang', dest="target_lang", required=True, help="iaith targed") 133 | trainparser.set_defaults(func=train) 134 | 135 | startparser = subparsers.add_parser('start') 136 | startparser.add_argument('-e', '--engine', dest="engine_name", required=True, help="enw i'r peiriant cyfieithu benodol") 137 | startparser.add_argument('-s', '--sourcelang', dest="source_lang", required=True, help="iaith ffynhonnell") 138 | startparser.add_argument('-t', '--targetlang', dest="target_lang", required=True, help="iaith targed") 139 | startparser.set_defaults(func=start) 140 | 141 | packageparser = subparsers.add_parser('package') 142 | packageparser.add_argument('-e', '--engine', dest="engine_name", required=True, help="enw i'r peiriant cyfieithu benodol") 143 | packageparser.add_argument('-s', '--sourcelang', dest="source_lang", required=True, help="iaith ffynhonnell") 144 | packageparser.add_argument('-t', '--targetlang', dest="target_lang", required=True, help="iaith targed") 145 | packageparser.set_defaults(func=package) 146 | 147 | args = parser.parse_args() 148 | try: 149 | args.func(**vars(args)) 150 | except MosesRunError as e: 151 | print("\n**ERROR**\n") 152 | print(e) 153 | -------------------------------------------------------------------------------- /scripts/mtdk/mt_download_engine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -s 7 | -t " 1>&2; exit 1; } 8 | 9 | while getopts ":m:h:e:s:t:" o; do 10 | case "${o}" in 11 | m) 12 | MOSES_HOME=${OPTARG} 13 | ;; 14 | h) 15 | MOSESMODELS_HOME=${OPTARG} 16 | ;; 17 | e) 18 | NAME=${OPTARG} 19 | ;; 20 | s) 21 | SOURCE_LANG=${OPTARG} 22 | ;; 23 | t) 24 | TARGET_LANG=${OPTARG} 25 | ;; 26 | *) 27 | usage 28 | ;; 29 | esac 30 | done 31 | shift $((OPTIND-1)) 32 | 33 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${SOURCE_LANG}" ] || [ -z "${TARGET_LANG}" ]; then 34 | usage 35 | fi 36 | 37 | mkdir -p ${MOSESMODELS_HOME}/${NAME} 38 | cd ${MOSESMODELS_HOME}/${NAME} 39 | 40 | wget --progress=dot:mega -O - http://techiaith.org/moses/3.0/${NAME}-${SOURCE_LANG}-${TARGET_LANG}.tar.gz|tar -zxf - 41 | 42 | cd - 43 | 44 | -------------------------------------------------------------------------------- /scripts/mtdk/mt_filter_for_mixedcase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | file=str(sys.argv[1]) 6 | 7 | with open (file) as f: 8 | for line in f: 9 | sline=line.strip() 10 | if not sline.islower(): 11 | if not sline.isdigit(): 12 | if len(sline) > 1: 13 | if sline[0].isupper(): 14 | print sline 15 | 16 | -------------------------------------------------------------------------------- /scripts/mtdk/mt_update_compress_moses_ini.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | file=str(sys.argv[1]) 6 | file2=str(sys.argv[2]) 7 | 8 | outfile=open(file2,'w') 9 | 10 | with open (file) as f: 11 | for line in f: 12 | if line.startswith('PhraseDictionaryMemory'): 13 | line = line.replace('PhraseDictionaryMemory','PhraseDictionaryCompact') 14 | line = line.replace('table.gz','table.minphr') 15 | if line.startswith('LexicalReordering'): 16 | line = line.replace('bidirectional-fe.gz','bidirectional-fe') 17 | 18 | outfile.write(line) 19 | 20 | outfile.close() 21 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-00-prepare-new-engine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -h 4 | -e 5 | -u &2; exit 1; } 6 | 7 | while getopts ":h:e:u:" o; do 8 | case "${o}" in 9 | h) 10 | MOSESMODELS_HOME=${OPTARG} 11 | echo "MosesModel will be generated in : ${MOSESMODELS_HOME}" 12 | ;; 13 | e) 14 | NAME=${OPTARG} 15 | echo "Name of model/engine : ${NAME}" 16 | ;; 17 | u) 18 | DATA_URL=${OPTARG} 19 | echo "Will fetch parallel corpus data from : ${DATA_URL}" 20 | ;; 21 | *) 22 | usage 23 | ;; 24 | esac 25 | done 26 | shift $((OPTIND-1)) 27 | 28 | if [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${DATA_URL}" ]; then 29 | usage 30 | fi 31 | 32 | mkdir -p ${MOSESMODELS_HOME}/${NAME}/corpus 33 | mkdir -p ${MOSESMODELS_HOME}/${NAME}/lm 34 | 35 | cd ${MOSESMODELS_HOME}/${NAME}/corpus 36 | wget -O - ${DATA_URL} | tar -zxf - 37 | cd - 38 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-01-prepare-corpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -s 7 | -t " 1>&2; exit 1; } 8 | 9 | while getopts ":m:h:e:s:t:" o; do 10 | case "${o}" in 11 | m) MOSES_HOME=${OPTARG} 12 | ;; 13 | h) 14 | MOSESMODELS_HOME=${OPTARG} 15 | ;; 16 | e) 17 | NAME=${OPTARG} 18 | ;; 19 | s) 20 | SOURCE_LANG=${OPTARG} 21 | ;; 22 | t) 23 | TARGET_LANG=${OPTARG} 24 | ;; 25 | *) 26 | usage 27 | ;; 28 | esac 29 | done 30 | shift $((OPTIND-1)) 31 | 32 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${SOURCE_LANG}" ] || [ -z "${TARGET_LANG}" ]; then 33 | usage 34 | fi 35 | 36 | cd ${MOSESMODELS_HOME}/${NAME}/corpus 37 | 38 | echo "##### PREPARING TRAINING CORPUS #####" 39 | mkdir -p temp 40 | 41 | echo "##### LOWER CASING #####" 42 | ${MOSES_HOME}/mosesdecoder/scripts/tokenizer/lowercase.perl -l ${TARGET_LANG} < "${NAME}.${TARGET_LANG}" > "temp/${NAME}.lower.${TARGET_LANG}" 43 | ${MOSES_HOME}/mosesdecoder/scripts/tokenizer/lowercase.perl -l ${SOURCE_LANG} < "${NAME}.${SOURCE_LANG}" > "temp/${NAME}.lower.${SOURCE_LANG}" 44 | 45 | 46 | echo "##### TOKENIZATION ######" 47 | ${MOSES_HOME}/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${TARGET_LANG} < "temp/${NAME}.lower.${TARGET_LANG}" > "temp/${NAME}.tok.${TARGET_LANG}" 48 | ${MOSES_HOME}/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${SOURCE_LANG} < "temp/${NAME}.lower.${SOURCE_LANG}" > "temp/${NAME}.tok.${SOURCE_LANG}" 49 | 50 | #echo "##### CLEANING #####" 51 | ${MOSES_HOME}/mosesdecoder/scripts/training/clean-corpus-n.perl "temp/${NAME}.tok" ${TARGET_LANG} ${SOURCE_LANG} "${NAME}.clean" 1 80 52 | 53 | cd - 54 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-02-train-language-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -t " 1>&2; exit 1; } 7 | 8 | while getopts ":m:h:e:t:" o; do 9 | case "${o}" in 10 | m) MOSES_HOME=${OPTARG} 11 | ;; 12 | h) 13 | MOSESMODELS_HOME=${OPTARG} 14 | ;; 15 | e) 16 | NAME=${OPTARG} 17 | ;; 18 | t) 19 | TARGET_LANG=${OPTARG} 20 | ;; 21 | *) 22 | usage 23 | ;; 24 | esac 25 | done 26 | shift $((OPTIND-1)) 27 | 28 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${TARGET_LANG}" ]; then 29 | usage 30 | fi 31 | 32 | echo "##### CREATING TARGET LANGUAGE MODEL #####" 33 | 34 | export IRSTLM=${MOSES_HOME}/irstlm 35 | cd ${MOSESMODELS_HOME}/${NAME}/lm 36 | 37 | echo "##### INSERT START AND END BOUNDARIES #####" 38 | echo "${MOSES_HOME}/irstlm/bin/add-start-end.sh < ../corpus/${NAME}.clean.${TARGET_LANG} > ${NAME}.sb.${TARGET_LANG}" 39 | ${MOSES_HOME}/irstlm/bin/add-start-end.sh < ../corpus/${NAME}.clean.${TARGET_LANG} > ${NAME}.sb.${TARGET_LANG} 40 | 41 | if ! [ $? -eq 0 ] 42 | then 43 | echo "Failed" 44 | exit 1 45 | fi 46 | 47 | echo "##### BUILD LM DATA #####" 48 | echo "${MOSES_HOME}/irstlm/bin/build-lm.sh -i ${NAME}.sb.${TARGET_LANG} -t ./tmp -p -s improved-kneser-ney -o ${NAME}.lm.${TARGET_LANG}" 49 | ${MOSES_HOME}/irstlm/bin/build-lm.sh -i ${NAME}.sb.${TARGET_LANG} -t ./tmp -p -s improved-kneser-ney -o ${NAME}.lm.${TARGET_LANG} 50 | 51 | if ! [ $? -eq 0 ] 52 | then 53 | echo "Failed" 54 | exit 1 55 | fi 56 | 57 | echo "##### COMPILE LM #####" 58 | echo "${MOSES_HOME}/irstlm/bin/compile-lm --text yes ${NAME}.lm.${TARGET_LANG}.gz ${NAME}.arpa.${TARGET_LANG}" 59 | ${MOSES_HOME}/irstlm/bin/compile-lm --text=yes ${NAME}.lm.${TARGET_LANG}.gz ${NAME}.arpa.${TARGET_LANG} 60 | 61 | if ! [ $? -eq 0 ] 62 | then 63 | echo "Failed" 64 | exit 1 65 | fi 66 | 67 | echo "##### BUILD BINARY LM #####" 68 | echo "${MOSES_HOME}/mosesdecoder/bin/build_binary -i ${NAME}.arpa.${TARGET_LANG} ${NAME}.blm.${TARGET_LANG}" 69 | ${MOSES_HOME}/mosesdecoder/bin/build_binary -i ${NAME}.arpa.${TARGET_LANG} ${NAME}.blm.${TARGET_LANG} 70 | 71 | if ! [ $? -eq 0 ] 72 | then 73 | echo "Failed" 74 | exit 1 75 | fi 76 | 77 | cd - 78 | 79 | exit 0 80 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-02-train-recaser-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -t " 1>&2; exit 1; } 7 | 8 | while getopts ":m:h:e:s:t:" o; do 9 | case "${o}" in 10 | m) MOSES_HOME=${OPTARG} 11 | ;; 12 | h) 13 | MOSESMODELS_HOME=${OPTARG} 14 | ;; 15 | e) 16 | NAME=${OPTARG} 17 | ;; 18 | t) 19 | TARGET_LANG=${OPTARG} 20 | ;; 21 | *) 22 | usage 23 | ;; 24 | esac 25 | done 26 | shift $((OPTIND-1)) 27 | 28 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${TARGET_LANG}" ]; then 29 | usage 30 | fi 31 | 32 | mkdir -p ${MOSESMODELS_HOME}/${NAME}/recaser 33 | cd ${MOSESMODELS_HOME}/${NAME}/recaser 34 | 35 | echo "##### PREPARING RECASER MODEL #####" 36 | 37 | echo "##### FILTER FOR MIXED CASE IN CORPUS #######" 38 | python ${MOSES_HOME}/moses-smt/mtdk/mt_filter_for_mixedcase.py ${MOSESMODELS_HOME}/${NAME}/corpus/${NAME}.${TARGET_LANG} > "${NAME}.mixed.${TARGET_LANG}" 39 | 40 | echo "##### TOKENIZATION ######" 41 | ${MOSES_HOME}/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${TARGET_LANG} < "${NAME}.mixed.${TARGET_LANG}" > "${NAME}.mixed.tok.${TARGET_LANG}" 42 | 43 | echo "##### TRAIN RECASER #####" 44 | ${MOSES_HOME}/mosesdecoder/scripts/recaser/train-recaser.perl --dir ${MOSESMODELS_HOME}/${NAME}/recaser/${TARGET_LANG} --corpus ${NAME}.mixed.tok.${TARGET_LANG} --train-script ${MOSES_HOME}/mosesdecoder/scripts/training/train-model.perl 45 | 46 | cd - 47 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-03-train-translation-engine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -n 7 | -s 8 | -t " 1>&2; exit 1; } 9 | 10 | while getopts ":m:h:e:n:s:t:" o; do 11 | case "${o}" in 12 | m) 13 | MOSES_HOME=${OPTARG} 14 | ;; 15 | h) 16 | MOSESMODELS_HOME=${OPTARG} 17 | ;; 18 | e) 19 | NAME=${OPTARG} 20 | ;; 21 | n) 22 | NGRAMSIZE=${OPTARG} 23 | ;; 24 | s) 25 | SOURCE_LANG=${OPTARG} 26 | ;; 27 | t) 28 | TARGET_LANG=${OPTARG} 29 | ;; 30 | *) 31 | usage 32 | ;; 33 | esac 34 | done 35 | shift $((OPTIND-1)) 36 | 37 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${NGRAMSIZE}" ] || [ -z "${SOURCE_LANG}" ] || [ -z "${TARGET_LANG}" ]; then 38 | usage 39 | fi 40 | 41 | echo "##### TRAINING MACHINE TRANSLATION ENGINE FOR ${SOURCE_LANG} TO ${TARGET_LANG} #####" 42 | echo "##### (this will take a *long time*, like hours!) #####" 43 | 44 | mkdir -p ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG} 45 | cd ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG} 46 | 47 | echo "${MOSES_HOME}/mosesdecoder/scripts/training/train-model.perl -external-bin-dir ${MOSES_HOME}/external-bin-dir -root engine -corpus ${MOSESMODELS_HOME}/${NAME}/corpus/$NAME.clean -f ${SOURCE_LANG} -e ${TARGET_LANG} -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:${NGRAMSIZE}:${MOSESMODELS_HOME}/${NAME}/lm/${NAME}.blm.${TARGET_LANG}:8 > training.out c" 48 | ${MOSES_HOME}/mosesdecoder/scripts/training/train-model.perl -external-bin-dir ${MOSES_HOME}/external-bin-dir -root engine -corpus ${MOSESMODELS_HOME}/${NAME}/corpus/$NAME.clean -f ${SOURCE_LANG} -e ${TARGET_LANG} -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:${NGRAMSIZE}:${MOSESMODELS_HOME}/${NAME}/lm/${NAME}.blm.${TARGET_LANG}:8 > training.out c 49 | 50 | if ! [ $? -eq 0 ] 51 | then 52 | echo "Failed" 53 | exit 1 54 | fi 55 | 56 | cd - 57 | 58 | exit 0 59 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-04-compress-translation-engine-ram.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -s 7 | -t " 1>&2; exit 1; } 8 | 9 | while getopts ":m:h:e:s:t:" o; do 10 | case "${o}" in 11 | m) MOSES_HOME=${OPTARG} 12 | ;; 13 | h) 14 | MOSESMODELS_HOME=${OPTARG} 15 | ;; 16 | e) 17 | NAME=${OPTARG} 18 | ;; 19 | s) 20 | SOURCE_LANG=${OPTARG} 21 | ;; 22 | t) 23 | TARGET_LANG=${OPTARG} 24 | ;; 25 | *) 26 | usage 27 | ;; 28 | esac 29 | done 30 | shift $((OPTIND-1)) 31 | 32 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${SOURCE_LANG}" ] || [ -z "${TARGET_LANG}" ]; then 33 | usage 34 | fi 35 | 36 | cd ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG} 37 | 38 | echo "##### COMPACTING PHRASE TABLE #####" 39 | echo "" 40 | echo "Marcin Junczys-Dowmunt: Phrasal Rank-Encoding: Exploiting Phrase Redundancy and Translational Relations for Phrase Table Compression, Proceedings of the Machine Translation Marathon 2012, The Prague Bulletin of Mathematical Linguistics, vol. 98, pp. 63-74, 2012. (http://ufal.mff.cuni.cz/pbml/98/art-junczys-dowmunt.pdf)" 41 | echo "" 42 | echo "${MOSES_HOME}/mosesdecoder/bin/processPhraseTableMin -in engine/model/phrase-table.gz -nscores 4 -threads 4 -out engine/model/phrase-table" 43 | ${MOSES_HOME}/mosesdecoder/bin/processPhraseTableMin -in engine/model/phrase-table.gz -nscores 4 -threads 4 -out engine/model/phrase-table 44 | 45 | echo "##### REORDERING TABLE #####" 46 | gunzip engine/model/reordering-table.wbe-msd-bidirectional-fe.gz 47 | cat engine/model/reordering-table.wbe-msd-bidirectional-fe | LC_ALL=C sort | ${MOSES_HOME}/mosesdecoder/bin/processLexicalTable -out engine/model/reordering-table.wbe-msd-bidirectional-fe 48 | 49 | 50 | echo "##### UPDATING MOSES.INI #####" 51 | python ${MOSES_HOME}/moses-smt/mtdk/mt_update_compress_moses_ini.py ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG}/engine/model/moses.ini ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG}/engine/model/moses.ini.tmp 52 | 53 | cd - 54 | 55 | cd ${MOSESMODELS_HOME}/${NAME}/${SOURCE_LANG}-${TARGET_LANG}/engine/model 56 | 57 | mv moses.ini moses.ini.orig 58 | mv moses.ini.tmp moses.ini 59 | 60 | cd - 61 | 62 | -------------------------------------------------------------------------------- /scripts/mtdk/mtdk-05-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { echo "Usage: $0 -m 4 | -h 5 | -e 6 | -s 7 | -t " 1>&2; exit 1; } 8 | 9 | while getopts ":m:h:e:s:t:" o; do 10 | case "${o}" in 11 | m) MOSES_HOME=${OPTARG} 12 | ;; 13 | h) 14 | MOSESMODELS_HOME=${OPTARG} 15 | ;; 16 | e) 17 | NAME=${OPTARG} 18 | ;; 19 | s) 20 | SOURCE_LANG=${OPTARG} 21 | ;; 22 | t) 23 | TARGET_LANG=${OPTARG} 24 | ;; 25 | *) 26 | usage 27 | ;; 28 | esac 29 | done 30 | shift $((OPTIND-1)) 31 | 32 | if [ -z "${MOSES_HOME}" ] || [ -z "${MOSESMODELS_HOME}" ] || [ -z "${NAME}" ] || [ -z "${SOURCE_LANG}" ] || [ -z "${TARGET_LANG}" ]; then 33 | usage 34 | fi 35 | 36 | echo ${MOSESMODELS_HOME}/${NAME} 37 | cd ${MOSESMODELS_HOME}/${NAME} 38 | 39 | tar -zcvf ${NAME}-${SOURCE_LANG}-${TARGET_LANG}.tar.gz lm/*.${TARGET_LANG} lm/*.${TARGET_LANG}.gz recaser/${TARGET_LANG}/* ${SOURCE_LANG}-${TARGET_LANG} 40 | 41 | cd - 42 | 43 | -------------------------------------------------------------------------------- /scripts/python-server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import threading 5 | import subprocess 6 | import cherrypy 7 | import json 8 | import itertools 9 | import logging 10 | import time 11 | import re 12 | import xmlrpclib 13 | import math 14 | from threading import Timer 15 | 16 | def popen(cmd): 17 | cmd = cmd.split() 18 | logger = logging.getLogger('translation_log.popen') 19 | logger.info("executing: %s" %(" ".join(cmd))) 20 | return subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) 21 | 22 | def pclose(pipe): 23 | def kill_pipe(): 24 | pipe.kill() 25 | t = Timer(5., kill_pipe) 26 | t.start() 27 | pipe.terminate() 28 | t.cancel() 29 | 30 | def init_log(filename): 31 | logger = logging.getLogger('translation_log') 32 | logger.setLevel(logging.DEBUG) 33 | fh = logging.FileHandler(filename) 34 | fh.setLevel(logging.DEBUG) 35 | logformat = '%(asctime)s %(thread)d - %(filename)s:%(lineno)s: %(message)s' 36 | formatter = logging.Formatter(logformat) 37 | fh.setFormatter(formatter) 38 | logger.addHandler(fh) 39 | 40 | 41 | class Filter(object): 42 | 43 | def __init__(self, remove_newlines=True, collapse_spaces=True): 44 | self.filters = [] 45 | if remove_newlines: 46 | self.filters.append(self.__remove_newlines) 47 | if collapse_spaces: 48 | self.filters.append(self.__collapse_spaces) 49 | 50 | def filter(self, s): 51 | for f in self.filters: 52 | s = f(s) 53 | return s 54 | 55 | def __remove_newlines(self, s): 56 | s = s.replace('\r\n',' ') 57 | s = s.replace('\n',' ') 58 | return s 59 | 60 | def __collapse_spaces(self, s): 61 | s=re.sub('\s\s+', ' ', s) 62 | s=re.sub('\s([\',.])',r'\1',s) 63 | return s 64 | 65 | 66 | def json_error(status, message, traceback, version): 67 | err = {"status":status, "message":message, "traceback":traceback, "version":version} 68 | return json.dumps(err, sort_keys=True, indent=4) 69 | 70 | 71 | class ExternalProcessor(object): 72 | 73 | """ wraps an external script and does utf-8 conversions, is thread-safe """ 74 | def __init__(self, cmd): 75 | self.cmd = cmd 76 | if self.cmd != None: 77 | self.proc = popen(cmd) 78 | self._lock = threading.Lock() 79 | 80 | def process(self, line): 81 | if self.cmd == None: return line 82 | u_string = u"%s\n" %line 83 | u_string = u_string.encode("utf-8") 84 | result = u_string #fallback: return input 85 | with self._lock: 86 | self.proc.stdin.write(u_string) 87 | self.proc.stdin.flush() 88 | result = self.proc.stdout.readline() 89 | return result.decode("utf-8").strip() 90 | # should be rstrip but normalize_punctiation.perl inserts space 91 | # for lines starting with '(' 92 | 93 | 94 | class Root(object): 95 | 96 | def __init__(self, moses_home, moses_url, recaser_url, slang, tlang, pretty=False, verbose=0, timeout=-1): 97 | 98 | self.filter = Filter(remove_newlines=True, collapse_spaces=True) 99 | self.moses_url = moses_url 100 | self.recaser_url = recaser_url 101 | self.pretty = bool(pretty) 102 | self.timeout = timeout 103 | self.verbose = verbose 104 | 105 | tokenizer = ['perl',os.path.join(moses_home,"mosesdecoder","scripts","tokenizer","tokenizer.perl"),"-b","-X","-l",slang,'-a'] 106 | detokenizer = ['perl',os.path.join(moses_home,"mosesdecoder","scripts","tokenizer","detokenizer.perl"),"-b","-l",tlang] 107 | detruecaser = ['perl',os.path.join(moses_home,"mosesdecoder","scripts","recaser","detruecase.perl"),"-b"] 108 | 109 | self._tokenizer = map(ExternalProcessor, [u' '.join(tokenizer)]) 110 | self._detokenizer = map(ExternalProcessor,[u' '.join(detokenizer)]) 111 | self._detruecaser = map(ExternalProcessor,[u' '.join(detruecaser)]) 112 | 113 | self.tokenize = self._exec(self._tokenizer) 114 | self.detokenize = self._exec(self._detokenizer) 115 | self.detruecase = self._exec(self._detruecaser) 116 | 117 | def _exec(self, procs): 118 | def f(line): 119 | for proc in procs: 120 | line = proc.process(line) 121 | return line 122 | return f 123 | 124 | def _timeout_error(self, q, location): 125 | errors = [{"originalquery":q, "location" : location}] 126 | message = "Timeout after %ss" %self.timeout 127 | return {"error": {"errors":errors, "code":400, "message":message}} 128 | 129 | def _dump_json(self, data): 130 | if self.pretty: 131 | return json.dumps(data, indent=2) + "\n" 132 | return json.dumps(data) + "\n" 133 | 134 | def _load_json(self, string): 135 | return json.loads(string) 136 | 137 | def tokenize(self, sentence): 138 | sentence_tokenized = self.tokenize(sentence) 139 | return sentence_tokenized 140 | 141 | def detokenize(self, sentence): 142 | sentence_detokenized = self.detokenize(sentence) 143 | return sentence_detokenized 144 | 145 | def _translate(self, source): 146 | """ wraps the actual translate call to mosesserver via XMLPRC """ 147 | proxy = xmlrpclib.ServerProxy(self.moses_url) 148 | params = {"text":source} 149 | return proxy.translate(params) 150 | 151 | def _recaser(self, sentence): 152 | proxy=xmlrpclib.ServerProxy(self.recaser_url) 153 | params = {"text":sentence} 154 | return proxy.translate(params) 155 | 156 | @cherrypy.expose 157 | def translate(self, **kwargs): 158 | response = cherrypy.response 159 | response.headers['Content-Type'] = 'application/json' 160 | 161 | q = self.filter.filter(kwargs["q"]) 162 | callback = kwargs["callback"] 163 | 164 | raw_src = q 165 | self.log("The server is working on: %s" %repr(raw_src)) 166 | self.log_info("Request before preprocessing: %s" %repr(raw_src)) 167 | translationDict = {"sourceText":raw_src.strip()} 168 | 169 | lower_src = raw_src.lower() 170 | tokenized_src = self.tokenize(lower_src) 171 | 172 | translation = '' 173 | 174 | # query MT engine 175 | self.log_info("Requesting translation for %s" % repr(tokenized_src)) 176 | result = self._translate(tokenized_src) 177 | if 'text' in result: 178 | translation = result['text'] 179 | else: 180 | return self._timeout_error(tokenized_src, 'translation') 181 | self.log_info("Received translation: %s" % repr(translation)) 182 | 183 | # 184 | recased_result = self._recaser(translation) 185 | if 'text' in recased_result: 186 | recased_trans=recased_result['text'] 187 | else: 188 | recased_trans=translation 189 | 190 | detokenized_trans = self.detokenize(recased_trans) 191 | detruecased_trans = self.detruecase(detokenized_trans) 192 | translatedText = self.filter.filter(detruecased_trans) 193 | 194 | translationDict = {"translatedText":translatedText} 195 | 196 | data = {"data" : {"translations" : [translationDict]}} 197 | self.log("The server is returning: %s" %self._dump_json(data)) 198 | 199 | if callback: 200 | return callback + "(" + self._dump_json(data) + ");" 201 | else: 202 | return self._dump_json(data) 203 | 204 | 205 | def log_info(self, message): 206 | if self.verbose > 0: 207 | self.log(message, level=logging.INFO) 208 | 209 | def log(self, message, level=logging.INFO): 210 | logger = logging.getLogger('translation_log.info') 211 | logger.info(message) 212 | 213 | 214 | @cherrypy.expose 215 | def index(self): 216 | return """ 217 | 218 | 219 | 220 | 221 | 245 | 267 | 268 | 269 |
270 |
271 |
272 |
273 | 274 |

DEMO CYFIEITHU PEIRIANYDDOL ~ MACHINE TRANSLATION DEMO

275 | 276 |
277 |
278 | 279 | 282 | 286 |
280 | 281 | 283 | 284 | 285 | 287 |

288 |
289 |
290 |
291 | 292 | """ 293 | 294 | if __name__ == "__main__": 295 | 296 | import argparse 297 | 298 | parser = argparse.ArgumentParser() 299 | parser.add_argument('-ip', help='server ip to bind to, default: localhost', default="127.0.0.1") 300 | parser.add_argument('-port', action='store', help='server port to bind to, default: 8080', type=int, default=8080) 301 | parser.add_argument('-nthreads', help='number of server threads, default: 8', type=int, default=8) 302 | parser.add_argument('-mosesurl', dest="moses_url", action='store', help='url of mosesserver', required=True) 303 | parser.add_argument('-recaserurl', dest="recaser_url", action='store', help='url of moses recaser', required=True) 304 | parser.add_argument('-moseshome', dest="moses_home", action='store', help='path to mosesdecoder installation', required=True) 305 | parser.add_argument('-timeout', help='timeout for call to translation engine, default: unlimited', type=int) 306 | parser.add_argument('-pretty', action='store_true', help='pretty print json') 307 | parser.add_argument('-slang', help='source language code') 308 | parser.add_argument('-tlang', help='target language code') 309 | parser.add_argument('-logprefix', help='logfile prefix, default: write to stderr') 310 | parser.add_argument('-verbose', help='verbosity level, default: 0', type=int, default=0) 311 | 312 | args = parser.parse_args(sys.argv[1:]) 313 | 314 | if args.logprefix: 315 | init_log("%s.trans.log" %args.logprefix) 316 | 317 | cherrypy.config.update({'server.request_queue_size' : 1000, 318 | 'server.socket_port': args.port, 319 | 'server.thread_pool': args.nthreads, 320 | 'server.socket_host': args.ip}) 321 | cherrypy.config.update({'error_page.default': json_error}) 322 | cherrypy.config.update({'log.screen': True}) 323 | 324 | if args.logprefix: 325 | cherrypy.config.update({'log.access_file': "%s.access.log" %args.logprefix, 326 | 'log.error_file': "%s.error.log" %args.logprefix}) 327 | 328 | cherrypy.quickstart(Root(args.moses_home, 329 | args.moses_url, args.recaser_url, 330 | slang = args.slang, tlang = args.tlang, 331 | pretty = args.pretty, 332 | verbose = args.verbose)) 333 | 334 | -------------------------------------------------------------------------------- /tut/demo1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | try: 4 | from urllib import request 5 | from urllib.parse import urlencode 6 | except ImportError: 7 | import urllib2 as request 8 | from urllib import urlencode 9 | 10 | try: 11 | # python2 12 | input = raw_input 13 | except NameError: 14 | # python3 15 | input = input 16 | 17 | import json 18 | 19 | # ============================== 20 | # = GOSODIADAU / USER SETTINGS = 21 | # ============================== 22 | # Eich allwedd API - o https://api.techiaith.org 23 | # Your API Key - from https://api.techiaith.org 24 | # Cewch hefyd gadael hyn yn wag, a cadw'ch allwedd API mewn ffeil 'API_KEY' 25 | # You can also leave this empty and keep your API key in a file called 'API_KEY' 26 | API_KEY = "" 27 | 28 | # Gellir defnyddio 'cy' neu 'en' ar gyfer iaith yr API 29 | # Api lang parameter can be either 'cy' or 'en' 30 | API_LANG = 'cy' 31 | 32 | API_URL = "https://api.techiaith.org/translate/v1/translate/?" 33 | 34 | # ============== 35 | # = Cod / Code = 36 | # ============== 37 | 38 | if not API_KEY: 39 | # ceisio darllen yr API key o ffeil 40 | import os 41 | if os.path.exists("API_KEY"): 42 | with open("API_KEY", 'rb') as a: 43 | API_KEY = a.read().decode('utf-8').strip() 44 | 45 | if API_KEY == "": 46 | print(""" 47 | ================= 48 | ***GWALL/ERROR*** 49 | ================= 50 | 51 | RHAID gosod eich allwedd API in gwiriwr.py yn gyntaf. Gwelwch https://api.techiaith.org 52 | You MUST set your API Key in gwiriwr.py first. See https://api.techiaith.org 53 | """) 54 | import sys 55 | sys.exit(1) 56 | 57 | 58 | 59 | def get_translations(source,engine,sourcelang,targetlang): 60 | """ 61 | Galw'r API ar gyfer cyfieithu un llinell 62 | Call the API to check the spelling for one line 63 | """ 64 | params = { 65 | 'api_key': API_KEY.encode('utf-8'), 66 | 'source': sourcelang.encode('utf-8'), 67 | 'target': targetlang.encode('utf-8'), 68 | 'engine':engine.encode('utf-8'), 69 | 'q': source.encode('utf-8') 70 | } 71 | url = API_URL + urlencode(params) 72 | 73 | response = request.urlopen(url) 74 | response = json.loads(response.read().decode('utf-8')) 75 | if not response['success']: 76 | # Gwall gyda'r galwad API 77 | # something went wrong with the API call 78 | translations = u'\n'.join(response['translations']) 79 | raise ValueError(translations) 80 | 81 | return response['translations'] 82 | 83 | 84 | def cyfieithu_testun(testun, peiriant, iaithffynhonnell, iaithtarged): 85 | """ 86 | Cyfieithu un llinell ar y tro 87 | Translates one line at a time 88 | """ 89 | translations = get_translations(testun, peiriant, iaithffynhonnell, iaithtarged) 90 | 91 | if not len(translations): 92 | return testun 93 | 94 | return translations[0]['translatedText'] 95 | 96 | 97 | if __name__ == '__main__': 98 | 99 | import sys, argparse 100 | 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('-e', help="enw'r peiriant cyfieithu", action='store', dest='engine', required=True) 103 | parser.add_argument('-s', help="iaith ffynhonnell",action='store',dest='sourcelang',required=True) 104 | parser.add_argument('-t', help="iaith targed",action='store',dest='targetlang',required=True) 105 | parser.add_argument('-f', help='ffeil ar gyfer cyfieithu', dest='sourcefile') 106 | 107 | args = parser.parse_args(sys.argv[1:]) 108 | 109 | if args.sourcefile: 110 | with open(sourcefile,'rb') as f: 111 | testun = f.read().decode('utf-8') 112 | else: 113 | testun = "" 114 | while not testun.strip(): 115 | testun = input(u"Ysgrifennwch testun i'w gyfieithu:\n") 116 | if sys.version_info[0]==2: 117 | testun = testun.decode('utf-8') 118 | 119 | llinellau = testun.split(u"\n") 120 | 121 | for llinell in llinellau: 122 | print cyfieithu_testun(llinell, args.engine, args.sourcelang, args.targetlang) 123 | 124 | --------------------------------------------------------------------------------