├── src ├── server │ ├── __init__.py │ ├── core │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── test_config.py │ │ │ └── test_app.py │ │ ├── extensions.py │ │ ├── models │ │ │ ├── translation.py │ │ │ ├── language.py │ │ │ ├── feedback.py │ │ │ └── predict.py │ │ ├── config.py │ │ ├── utils_bucket │ │ │ ├── upload_download.py │ │ │ └── bucket.py │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── resources │ │ │ └── translate.py │ │ └── model_load.py │ ├── nginx │ │ ├── Dockerfile │ │ └── nginx.conf │ ├── .env.dev │ ├── entrypoint.sh │ ├── entrypoint.prod.sh │ ├── Dockerfile │ ├── requirements.txt │ ├── Dockerfile.prod │ ├── manage.py │ └── README.md ├── client │ ├── public │ │ ├── robots.txt │ │ ├── favico.png │ │ ├── favicon.ico │ │ ├── e5b14e8b30296b86b78d06886aa5a458.png │ │ ├── manifest.json │ │ ├── bundle.js.LICENSE.txt │ │ ├── index.html │ │ └── 217.bundle.js │ ├── src │ │ ├── images │ │ │ ├── logo1.png │ │ │ ├── logo2.png │ │ │ ├── favico.png │ │ │ ├── favicon.ico │ │ │ ├── masakhane.png │ │ │ ├── masakhane_bg.png │ │ │ ├── masakhane_bg2.png │ │ │ └── masakhane-border.png │ │ ├── App.test.js │ │ ├── components │ │ │ ├── step3.test.js │ │ │ ├── multiStepForm.test.js │ │ │ ├── translateCard.test.js │ │ │ ├── step1.test.js │ │ │ ├── common │ │ │ │ ├── radioButton.js │ │ │ │ └── radioButton.test.js │ │ │ ├── step2.test.js │ │ │ ├── step3.js │ │ │ ├── multiStepForm.js │ │ │ ├── terms.js │ │ │ ├── step1.js │ │ │ ├── step2.js │ │ │ └── translateCard.js │ │ ├── setupTests.js │ │ ├── setupProxy.js │ │ ├── index.css │ │ ├── reportWebVitals.js │ │ ├── index.js │ │ ├── pages │ │ │ ├── Home.js │ │ │ ├── Faq.js │ │ │ └── About.js │ │ ├── logo.svg │ │ └── App.js │ ├── README.md │ ├── Dockerfile │ ├── package.json │ └── webpack.config.js ├── m_to_m_models │ ├── kubernetes │ │ ├── volume_claim.yaml │ │ ├── volume.yaml │ │ ├── secret.yaml │ │ ├── triton-deployment.yaml │ │ └── deployment.yaml │ ├── app.py │ ├── main.py │ ├── model_handlers.py │ ├── Dockerfile │ └── 
requirements.txt └── torchserve │ ├── setup_config.json │ ├── Download_Transformer_models.py │ └── transformer_handler.py ├── .python-version ├── .dockerignore ├── entrypoint.sh ├── kubernetes ├── ingress-def.yml └── sample-server.yaml ├── .github └── ISSUE_TEMPLATE │ └── dsfsi-standard-template.md ├── docker-compose.prod.yml ├── LICENSE ├── todo.md ├── docker-compose.yml ├── .gitignore ├── docs ├── start_app_prod_doc.md ├── debugging_setup.md ├── project_details.md └── start_app_locally_doc.md ├── requirements-python3.10.txt ├── environment.yaml ├── README.md └── Makefile /src/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.7 2 | -------------------------------------------------------------------------------- /src/server/core/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/client/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /src/client/public/favico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/favico.png -------------------------------------------------------------------------------- /src/client/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/favicon.ico 
-------------------------------------------------------------------------------- /src/client/src/images/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/logo1.png -------------------------------------------------------------------------------- /src/client/src/images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/logo2.png -------------------------------------------------------------------------------- /src/client/src/images/favico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/favico.png -------------------------------------------------------------------------------- /src/client/src/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/favicon.ico -------------------------------------------------------------------------------- /src/client/src/images/masakhane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane.png -------------------------------------------------------------------------------- /src/client/src/images/masakhane_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane_bg.png -------------------------------------------------------------------------------- /src/client/src/images/masakhane_bg2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane_bg2.png -------------------------------------------------------------------------------- /src/server/nginx/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx:1.17-alpine 2 | 3 | RUN rm /etc/nginx/conf.d/default.conf 4 | COPY nginx.conf /etc/nginx/conf.d -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | env 2 | .dockerignore 3 | Dockerfile-dev 4 | Dockerfile-prod 5 | 6 | src/server/models/joeynmt 7 | src/server/core/models/joeynmt -------------------------------------------------------------------------------- /src/client/src/images/masakhane-border.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane-border.png -------------------------------------------------------------------------------- /src/client/public/e5b14e8b30296b86b78d06886aa5a458.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/e5b14e8b30296b86b78d06886aa5a458.png -------------------------------------------------------------------------------- /src/server/core/extensions.py: -------------------------------------------------------------------------------- 1 | from flask_sqlalchemy import SQLAlchemy 2 | from flask_migrate import Migrate 3 | 4 | import os, sqlite3 5 | 6 | db = SQLAlchemy() 7 | migrate = Migrate() 8 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Waiting for postgres ..." 4 | 5 | while ! 
nc -z users-db 5432; do 6 | sleep 0.1 7 | done 8 | echo "PostgreSQL started" 9 | 10 | python app.py -------------------------------------------------------------------------------- /src/client/src/App.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import App from './App'; 3 | 4 | describe('App', () => { 5 | test('renders App component', () => { 6 | render(); 7 | }); 8 | }); 9 | -------------------------------------------------------------------------------- /src/client/src/components/step3.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import Step3 from './step3'; 3 | 4 | describe('Step3', () => { 5 | test('renders Step3 component', () => { 6 | render(); 7 | }); 8 | }); -------------------------------------------------------------------------------- /src/server/.env.dev: -------------------------------------------------------------------------------- 1 | FLASK_APP=core/__init__.py 2 | FLASK_ENV=development 3 | DATABASE_URL=postgresql://masakhane:masakhane@db:5432/masakhane 4 | SQL_HOST=db 5 | SQL_PORT=5432 6 | DATABASE=postgres 7 | SECRET_KEY=secret-key 8 | MODEL=./models/joeynmt/ 9 | FLASK_DEBUG=1 10 | -------------------------------------------------------------------------------- /src/client/src/setupTests.js: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 
2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom'; 6 | -------------------------------------------------------------------------------- /src/client/src/components/multiStepForm.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import MultiStepForm from './multiStepForm'; 3 | 4 | describe('MultiStepForm', () => { 5 | test('renders MultiStepForm component', () => { 6 | render(); 7 | }); 8 | }); -------------------------------------------------------------------------------- /src/client/src/components/translateCard.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import TranslateCard from './translateCard'; 3 | 4 | describe('TranslateCard', () => { 5 | test('renders TranslateCard component', () => { 6 | render(); 7 | }); 8 | }); -------------------------------------------------------------------------------- /src/m_to_m_models/kubernetes/volume_claim.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: masakhane-model-cache-volume-claim 5 | namespace: masakhane 6 | spec: 7 | storageClassName: manual 8 | accessModes: 9 | - ReadWriteOnce 10 | resources: 11 | requests: 12 | storage: 8Gi 13 | -------------------------------------------------------------------------------- /src/client/src/setupProxy.js: -------------------------------------------------------------------------------- 1 | // const { createProxyMiddleware } = require('http-proxy-middleware'); 2 | 3 | // module.exports = function(app) { 4 | // app.use( 5 | // '/translate', 6 | // createProxyMiddleware({ 7 | // target: 'http://localhost:5000', 8 | // 
changeOrigin: true, 9 | // }) 10 | // ); 11 | // }; -------------------------------------------------------------------------------- /src/server/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$DATABASE" = "postgres" ] 4 | then 5 | echo "Waiting for postgres..." 6 | 7 | while ! nc -z $SQL_HOST $SQL_PORT; do 8 | sleep 0.1 9 | done 10 | 11 | echo "PostgreSQL started" 12 | fi 13 | 14 | python manage.py create_db 15 | python manage.py add_language en-sw-JW300 16 | exec "$@" -------------------------------------------------------------------------------- /src/torchserve/setup_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "masakhane/m2m100_418M_en_swa_rel_news", 3 | "mode": "text_generation", 4 | "do_lower_case":false, 5 | "num_labels":"0", 6 | "save_mode":"pretrained", 7 | "max_length":"150", 8 | "captum_explanation":true, 9 | "embedding_name": "bert", 10 | "FasterTransformer":false, 11 | "BetterTransformer":false, 12 | "model_parallel":false 13 | } 14 | -------------------------------------------------------------------------------- /src/m_to_m_models/kubernetes/volume.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: translation-volume-storage 5 | namespace: masakhane 6 | labels: 7 | type: local 8 | spec: 9 | storageClassName: manual 10 | accessModes: 11 | - ReadWriteOnce 12 | capacity: 13 | storage: 10Gi 14 | hostPath: 15 | path: /models_datastore # the host on the minikube vm 16 | -------------------------------------------------------------------------------- /kubernetes/ingress-def.yml: -------------------------------------------------------------------------------- 1 | apiVersion: networking.k8s.io/v1 2 | kind: Ingress 3 | metadata: 4 | name: seldon-ingress 5 | namespace: seldon 6 | spec: 7 | rules: 8 | - host: 
seldon-ingress.com 9 | http: 10 | paths: 11 | - path: "/" 12 | pathType: Prefix 13 | backend: 14 | service: 15 | name: iris-model-sklearn-iris-predictor 16 | port: 17 | number: 8000 18 | -------------------------------------------------------------------------------- /src/client/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New', 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /src/client/src/reportWebVitals.js: -------------------------------------------------------------------------------- 1 | const reportWebVitals = onPerfEntry => { 2 | if (onPerfEntry && onPerfEntry instanceof Function) { 3 | import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => { 4 | getCLS(onPerfEntry); 5 | getFID(onPerfEntry); 6 | getFCP(onPerfEntry); 7 | getLCP(onPerfEntry); 8 | getTTFB(onPerfEntry); 9 | }); 10 | } 11 | }; 12 | 13 | export default reportWebVitals; 14 | -------------------------------------------------------------------------------- /src/m_to_m_models/kubernetes/secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: masakhane-container-secret 5 | namespace: masakhane 6 | type: Opaque 7 | stringData: 8 | RCLONE_CONFIG_S3_TYPE: s3 9 | RCLONE_CONFIG_S3_PROVIDER: minio 10 | RCLONE_CONFIG_S3_ENV_AUTH: "false" 11 | RCLONE_CONFIG_S3_ACCESS_KEY_ID: minioadmin 12 | RCLONE_CONFIG_S3_SECRET_ACCESS_KEY: minioadmin 13 | RCLONE_CONFIG_S3_ENDPOINT: http://minio.minio-system.svc.cluster.local:9000 14 
| -------------------------------------------------------------------------------- /src/client/src/components/step1.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import Step1 from './step1'; 3 | 4 | describe('Step1', () => { 5 | test('renders Step1 component', () => { 6 | const props = { 7 | src_lang: "none", 8 | tgt_lang: "none", 9 | setForm: () => {}, 10 | formData: {}, 11 | navigation: {}, 12 | handleSubmitFeedback: () => {} 13 | }; 14 | render(); 15 | }); 16 | }); 17 | -------------------------------------------------------------------------------- /src/server/core/tests/base.py: -------------------------------------------------------------------------------- 1 | from flask_testing import TestCase 2 | from core.extensions import db 3 | from core import masakhane, load_model 4 | 5 | 6 | class BaseTestCase(TestCase): 7 | def create_app(self): 8 | masakhane.config.from_object('core.config.Config') 9 | return masakhane 10 | 11 | def setUp(self): 12 | db.create_all() 13 | db.session.commit() 14 | 15 | def tearDown(self): 16 | db.session.remove() 17 | db.drop_all() -------------------------------------------------------------------------------- /src/client/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "Masakhane Web", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favico.png", 7 | "type": "image/png", 8 | "sizes": "192x192" 9 | }, 10 | { 11 | "src": "favico.png", 12 | "type": "image/png", 13 | "sizes": "512x512" 14 | } 15 | ], 16 | "start_url": ".", 17 | "display": "standalone", 18 | "theme_color": "#000000", 19 | "background_color": "#ffffff" 20 | } 21 | -------------------------------------------------------------------------------- /src/client/src/components/common/radioButton.js: 
-------------------------------------------------------------------------------- 1 | import { Form } from 'react-bootstrap'; 2 | import React from 'react'; 3 | 4 | const RadioButton = ({ value, label, selected, ...otherProps }) => { 5 | return( 6 |
7 | {label} 8 | 9 |
10 | ); 11 | } 12 | 13 | export default RadioButton; -------------------------------------------------------------------------------- /src/client/src/components/step2.test.js: -------------------------------------------------------------------------------- 1 | import { render, screen } from '@testing-library/react'; 2 | import Step2 from './step2'; 3 | 4 | describe('Step2', () => { 5 | test('renders Step2 component', () => { 6 | const props = { 7 | src_lang: "none", 8 | tgt_lang: "none", 9 | text: "", 10 | translation: "", 11 | setForm: () => {}, 12 | formData: {}, 13 | navigation: {}, 14 | handleSubmitFeedback: () => {} 15 | }; 16 | render(); 17 | }); 18 | }); -------------------------------------------------------------------------------- /src/server/core/models/translation.py: -------------------------------------------------------------------------------- 1 | class Translation: 2 | def __init__(self, src_lang, tgt_lang, input, output) -> None: 3 | super().__init__() 4 | self.src_lang = src_lang 5 | self.tgt_lang = tgt_lang 6 | self.input = input 7 | self.output = output 8 | 9 | @property 10 | def data(self): 11 | return { 12 | 'src_lang': self.src_lang, 13 | 'tgt_lang': self.tgt_lang, 14 | 'input': self.input, 15 | 'output': self.output 16 | } -------------------------------------------------------------------------------- /src/server/entrypoint.prod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$DATABASE" = "postgres" ] 4 | then 5 | echo "Waiting for postgres..." 6 | 7 | while ! nc -z $SQL_HOST $SQL_PORT; do 8 | sleep 0.1 9 | done 10 | 11 | echo "PostgreSQL started" 12 | fi 13 | 14 | if [ "$FLASK_ENV" = "development" ] 15 | then 16 | echo "Creating the database tables..." 
17 | python manage.py clean 18 | echo "Tables created" 19 | fi 20 | 21 | python manage.py create_db 22 | 23 | python manage.py add_language en-sw-JW300 24 | 25 | exec "$@" -------------------------------------------------------------------------------- /src/client/src/components/common/radioButton.test.js: -------------------------------------------------------------------------------- 1 | import { 2 | render, 3 | screen, 4 | getByRole, 5 | findByText, 6 | } from '@testing-library/react'; 7 | import RadioButton from './radioButton'; 8 | 9 | describe('RadioButton', () => { 10 | test('renders RadioButton component', () => { 11 | render(); 12 | }); 13 | 14 | // test('should have a radio button input', () => { 15 | 16 | // }) 17 | 18 | // test('should fire an onchange event', () => { 19 | 20 | // }) 21 | 22 | }); 23 | -------------------------------------------------------------------------------- /src/client/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | import reportWebVitals from './reportWebVitals'; 5 | import "core-js/stable"; 6 | import "regenerator-runtime/runtime"; 7 | 8 | ReactDOM.render( 9 | , 10 | document.getElementById('root') 11 | ); 12 | 13 | // If you want to start measuring performance in your app, pass a function 14 | // to log results (for example: reportWebVitals(console.log)) 15 | // or send to an analytics endpoint. 
Learn more: https://bit.ly/CRA-vitals 16 | reportWebVitals(); 17 | -------------------------------------------------------------------------------- /src/m_to_m_models/kubernetes/triton-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: triton-masakhane 5 | namespace: masakhane 6 | spec: 7 | name: default 8 | predictors: 9 | - graph: 10 | implementation: TRITON_SERVER 11 | logger: 12 | mode: all 13 | modelUri: s3://language-models/onnx-m2m100/1 14 | envSecretRefName: masakhane-container-secret 15 | name: triston-masakhane-predictor 16 | type: MODEL 17 | name: default 18 | replicas: 1 19 | protocol: kfserving 20 | -------------------------------------------------------------------------------- /src/client/README.md: -------------------------------------------------------------------------------- 1 | # The Frontend 2 | 3 | **NOTE** I know next to nothing about this frontend so update needed 4 | 5 | The client is running on http://localhost:3000 6 | 7 | It consists of 8 | - ReactJS 9 | - Webpack 10 | 11 | # Available npm scripts: 12 | 13 | | Command | Executes | 14 | | ------- | -------- | 15 | | `npm run develop` | `webpack-dev-server --host 0.0.0.0` | 16 | | `npm run start-api` | `cd ../server && python app.py` | 17 | | `npm run build` | `react-scripts build` | 18 | | `npm run test` | `react-scripts test` | 19 | | `npm run eject` | `react-scripts eject` | -------------------------------------------------------------------------------- /src/client/Dockerfile: -------------------------------------------------------------------------------- 1 | # Building the application 2 | FROM node:lts-buster as build 3 | 4 | WORKDIR /app 5 | 6 | ENV PATH /app/node_modules/.bin:$PATH 7 | # Increate node max memory, the default memory limit is too low for building 8 | ENV NODE_OPTIONS --max-old-space-size=8192 9 | 10 | # add dependencies 11 | 
COPY package.json package-lock.json ./ 12 | # install dependencies 13 | RUN npm install --legacy-peer-deps 14 | RUN npm i webpack webpack-cli --legacy-peer-deps 15 | RUN npm i @babel/core @babel/preset-env @babel/preset-react babel-loader --legacy-peer-deps 16 | 17 | # add app 18 | COPY . ./ 19 | 20 | # RUN npm command 21 | CMD ["npm", "run", "develop"] 22 | -------------------------------------------------------------------------------- /src/client/src/components/step3.js: -------------------------------------------------------------------------------- 1 | import { Button } from 'react-bootstrap'; 2 | import React from 'react'; 3 | 4 | const Step3 = ({ setShow }) => { 5 | const handleShow = () => setShow(false); 6 | 7 | return ( 8 |
9 |
THANK YOU!
10 | {/*

We appreciate your feedback and your contribution which help us make translations better.

*/} 11 |
12 | 13 |
14 |
15 | ) 16 | } 17 | 18 | export default Step3; 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/dsfsi-standard-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: DSFSI Standard Template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | #### Description 11 | A clear and concise description of what the issue is about. 12 | 13 | #### Screenshots 14 | ![Downhill Windmills](http://i.giphy.com/KO8AG2EByqkFi.gif) 15 | 16 | #### Files 17 | A list of relevant files for this issue. This will help people navigate the project and offer some clues of where to start. 18 | 19 | #### To Reproduce 20 | If this issue is describing a bug, include some steps to reproduce the behavior. 21 | 22 | #### Tasks 23 | Include specific tasks in the order they need to be done in. Include links to specific lines of code where the task should happen at. 
24 | - [ ] Task 1 25 | - [ ] Task 2 26 | - [ ] Task 3 27 | -------------------------------------------------------------------------------- /kubernetes/sample-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: iris-model 5 | namespace: seldon 6 | spec: 7 | name: iris 8 | annotations: 9 | prometheus.io/scrape: "false" 10 | predictors: 11 | - componentSpecs: 12 | - spec: 13 | containers: 14 | - env: 15 | - name: SELDON_LOG_LEVEL 16 | value: DEBUG 17 | - name: SELDON_DEBUG 18 | value: 'True' 19 | - name: FLASK_DEBUG 20 | value: 'True' 21 | image: seldonio/sklearn-iris:0.3 22 | imagePullPolicy: IfNotPresent 23 | name: sklearn-iris-classifier 24 | graph: 25 | endpoint: 26 | type: REST 27 | name: sklearn-iris-classifier 28 | type: MODEL 29 | name: sklearn-iris-predictor 30 | replicas: 1 31 | -------------------------------------------------------------------------------- /src/m_to_m_models/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, request 2 | from flask_cors import CORS 3 | import logging 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def create_app(model_handler): 9 | app = Flask(__name__, static_url_path="") 10 | CORS(app) 11 | 12 | @app.route("/predict", methods=["GET", "POST"]) 13 | def predict(): 14 | request_data = request.get_json() 15 | logger.debug("REST Request: %s", request) 16 | response = model_handler.predict_raw(request_data) 17 | 18 | json_response = jsonify(response) 19 | if ( 20 | isinstance(response, dict) 21 | and "status" in response 22 | and "code" in response["status"] 23 | ): 24 | json_response.status_code = response["status"]["code"] 25 | 26 | logger.debug("REST Response: %s", response) 27 | return json_response 28 | 29 | return app 30 | -------------------------------------------------------------------------------- 
/src/server/nginx/nginx.conf: -------------------------------------------------------------------------------- 1 | upstream masakhane-web { 2 | server api:5000; 3 | } 4 | 5 | upstream masakhane-web-client { 6 | server client:3000; 7 | } 8 | 9 | server { 10 | 11 | listen 80; 12 | 13 | root /images/; 14 | 15 | location / { 16 | proxy_pass http://masakhane-web-client; 17 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 18 | proxy_set_header Host $host; 19 | proxy_redirect off; 20 | } 21 | 22 | location /translate { 23 | proxy_pass http://masakhane-web; 24 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 25 | proxy_set_header Host $host; 26 | proxy_redirect off; 27 | } 28 | 29 | location /save { 30 | proxy_pass http://masakhane-web; 31 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 32 | proxy_set_header Host $host; 33 | proxy_redirect off; 34 | } 35 | } -------------------------------------------------------------------------------- /docker-compose.prod.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | api: 5 | build : 6 | context: ./src/server 7 | dockerfile: Dockerfile.prod 8 | command: gunicorn --bind 0.0.0.0:5000 manage:masakhane 9 | ports: 10 | - 5000:5000 11 | # expose: 12 | # - 5000 13 | env_file: 14 | - ./.env.prod 15 | depends_on: 16 | - db 17 | 18 | nginx: 19 | build: ./src/server/nginx 20 | ports: 21 | - 80:80 22 | depends_on: 23 | - api 24 | 25 | db: 26 | image: postgres:12-alpine 27 | volumes: 28 | - postgres_data:/var/lib/postgresql/data/ 29 | env_file: 30 | - ./.env.prod.db 31 | 32 | client: 33 | build : 34 | context: ./src/client 35 | dockerfile: Dockerfile 36 | # command: curl --location --request GET 'http://0.0.0.0:5000/update' --data-raw '' 37 | volumes: 38 | - './src/client:/usr/src/app' 39 | ports: 40 | - 3000:3000 41 | 42 | depends_on: 43 | - api 44 | 45 | volumes: 46 | postgres_data: 
-------------------------------------------------------------------------------- /src/server/core/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | basedir = os.path.abspath(os.path.dirname(__file__)) 4 | 5 | 6 | class Config: 7 | DEBUG = False 8 | SQLALCHEMY_DATABASE_URI = os.getenv("DATABASE_URL", "sqlite:///masakhane.db") 9 | 10 | SQLALCHEMY_TRACK_MODIFICATIONS = False 11 | MODEL = os.getenv("MODEL", "./models/joeynmt/") 12 | TEMP = "./temp/" 13 | MODEL_ALL_FILE = "./available_models.tsv" 14 | JSON = "./languages.json" 15 | 16 | 17 | class DevelopmentConfig(Config): 18 | DEBUG = True 19 | SECRET_KEY = 'super-secret-key' 20 | basedir = os.path.abspath(os.path.dirname(__file__)) 21 | FLASK_DEBUG=1 22 | 23 | 24 | class StagingConfig(Config): 25 | """ 26 | This is an imitation of the production environment for 27 | testing purpose. 28 | """ 29 | DEBUG = True 30 | TESTING = True 31 | SECRET_KEY = os.getenv('SECRET_KEY', "key_testing") 32 | # MODEL = os.getenv('MODEL', "./") 33 | 34 | 35 | class ProductionConfig(Config): 36 | SECRET_KEY = os.getenv('SECRET_KEY', "key_production") 37 | # MODEL = os.getenv('MODEL', "./") 38 | -------------------------------------------------------------------------------- /src/server/core/utils_bucket/upload_download.py: -------------------------------------------------------------------------------- 1 | from os import name, path 2 | from google.cloud.storage import Blob 3 | from google.cloud import storage 4 | 5 | 6 | client = storage.Client(project="dsfsi-232208") 7 | bucket = client.get_bucket("maskhane-web-test") 8 | encryption_key = "c7f32af42e45e85b9848a6a14dd2a8f6" 9 | 10 | # blob = Blob("secure-data", bucket, encryption_key=encryption_key) 11 | blob = Blob("secure-data", bucket) 12 | 13 | 14 | 15 | # Download 16 | # blob.upload_from_string("my secret message.") 17 | # with open("/tmp/my-secure-file", "wb") as file_obj: 18 | # client.download_to_file(blob, file_obj) 
19 | 20 | if __name__ == "__main__": 21 | path_to_file_for_upload = "../../data/external/available_models.tsv" 22 | # if (path.exists(path_to_file_for_upload)): 23 | # # Upload 24 | # with open(path_to_file_for_upload, "rb") as my_file: 25 | # print("yes") 26 | # blob.upload_from_file(my_file) 27 | 28 | where_to_download = "../../data/" 29 | with open(where_to_download, "wb") as file_obj: 30 | client.download_to_file(blob, file_obj) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Data Science for Social Impact @ University of Pretoria 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /todo.md: -------------------------------------------------------------------------------- 1 | - should not put the model in a docker container, use the file storage instead and make it available as a volume to the container 2 | - use a model registry to store models, build one with mlflow. 3 | - Run different services for each model, and use a load balancer to route the requests to the right model. 4 | 5 | 6 | 7 | torch-model-archiver --model-name MasaknaneEnSwaRelNews \ 8 | --version 1.0 \ 9 | --serialized-file src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/pytorch_model.bin \ 10 | --handler src/torchserve/transformer_handler.py \ 11 | --extra-files "src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/config.json, 12 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/special_tokens_map.json, 13 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/tokenizer_config.json, 14 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/vocab.json, 15 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/generation_config.json, 16 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/sentencepiece.bpe.model" 17 | -------------------------------------------------------------------------------- /src/server/core/utils_bucket/bucket.py: -------------------------------------------------------------------------------- 1 | from google.cloud import storage 2 | from google.oauth2 import service_account 3 | import pathlib, io, ipdb 4 | 5 | # credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE) 6 | 7 | client = storage.Client( 8 | project="dsfsi-232208", 9 | # credentials=credentials 10 | ) 11 | 12 | 13 | from google.cloud import storage 14 | from zipfile import ZipFile, ZipInfo 15 | 16 | def upload(): 17 | source_dir 
= pathlib.Path("../../models/joeynmt/en-lua/") 18 | 19 | archive = io.BytesIO() 20 | with ZipFile(archive, 'w') as zip_archive: 21 | for file_path in source_dir.iterdir(): 22 | # ipdb.set_trace() 23 | with open(file_path, 'r') as file: 24 | zip_entry_name = file_path.name 25 | zip_file = ZipInfo(zip_entry_name) 26 | zip_archive.writestr(zip_file, file.read()) 27 | 28 | ipdb.set_trace() 29 | archive.seek(0) 30 | 31 | object_name = 'super-important-data-v1' 32 | bucket = client.bucket("maskhane-web-test") 33 | 34 | blob = storage.Blob(object_name, bucket) 35 | blob.upload_from_file(archive, content_type='application/zip') 36 | 37 | upload() -------------------------------------------------------------------------------- /src/client/src/pages/Home.js: -------------------------------------------------------------------------------- 1 | import { Navbar, Nav, Container, Jumbotron, Image, Row, Col } from 'react-bootstrap' 2 | import React from 'react'; 3 | import TranslateCard from '../components/translateCard'; 4 | import image from '../images/masakhane-border.png'; 5 | 6 | function Home() { 7 | return ( 8 |
9 | 10 | 11 |
12 |
13 | 14 |
15 |

This is a community research project and as such, this service is not a production system. Therefore, it should not be used for official translations. Don't see your language and interested in training one up yourself? Go here to learn how to contribute a model!

16 |

The models are powered by JoeyNMT🐨; a minimalist machine translation toolkit based on pytorch.

17 |
18 |
19 | ); 20 | } 21 | 22 | export default Home; 23 | -------------------------------------------------------------------------------- /src/server/Dockerfile: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM python:3.6.9 3 | 4 | # set working directory 5 | WORKDIR /usr/src/app 6 | 7 | # set environment variables 8 | ENV PYTHONDONTWRITEBYTECODE 1 9 | ENV PYTHONUNBUFFERED 1 10 | 11 | # install system dependencies 12 | RUN apt-get update && apt-get install -y netcat 13 | RUN apt-get update 14 | RUN apt-get install -y gnupg lsb-release wget 15 | 16 | RUN lsb_release -c -s > /tmp/lsb_release 17 | RUN GCSFUSE_REPO=$(cat /tmp/lsb_release); echo "deb http://packages.cloud.google.com/apt gcsfuse-$GCSFUSE_REPO main" | tee /etc/apt/sources.list.d/gcsfuse.list 18 | RUN wget -O - https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - 19 | 20 | RUN apt-get update 21 | RUN apt-get install -y gcsfuse 22 | 23 | 24 | # add and 25 | COPY ./requirements.txt /usr/src/app/requirements.txt 26 | # RUN pip install to install requirements 27 | RUN pip install --upgrade pip 28 | RUN pip install -r requirements.txt 29 | 30 | # add entrypoint.sh 31 | COPY ./entrypoint.sh /usr/src/app/entrypoint.sh 32 | 33 | # add app 34 | COPY . 
/usr/src/app 35 | 36 | 37 | # run server (https://github.com/testdrivenio/testdriven-app/issues/25) 38 | CMD ["sh","-c","chmod 777 /usr/src/app/entrypoint.sh"] 39 | ENTRYPOINT ["/usr/src/app/entrypoint.sh"] 40 | -------------------------------------------------------------------------------- /src/m_to_m_models/main.py: -------------------------------------------------------------------------------- 1 | from src.seldon_core_components.app import create_app 2 | from typing import Tuple, List 3 | from pydoc import locate 4 | import argparse 5 | 6 | def parse_args() -> Tuple[argparse.Namespace, List[str]]: 7 | """parse the following arguments 8 | --model_handler : the path to the class of the model handler 9 | --model_path : the path to the model 10 | --src_lang : the source language 11 | --trg_lang : the target language 12 | Returns: 13 | Tuple[argparse.Namespace, List[str]]: _description_ 14 | """ 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model_handler", type=str, required=True) 17 | parser.add_argument("--model_path", type=str, required=True) 18 | parser.add_argument("--src_lang", type=str, required=True) 19 | parser.add_argument("--trg_lang", type=str, required=True) 20 | args, unknown = parser.parse_known_args() 21 | return args, unknown 22 | 23 | 24 | def main(): 25 | args, _ = parse_args() 26 | ModelHandleClass = locate(args.model_handler) 27 | model_handler = ModelHandleClass(args.model_path, args.src_lang, args.trg_lang) 28 | app = create_app(model_handler) 29 | app.run() 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /src/server/core/models/language.py: -------------------------------------------------------------------------------- 1 | from enum import unique 2 | 3 | from flask_sqlalchemy import SQLAlchemy 4 | 5 | from core.extensions import db 6 | 7 | class Language(db.Model): 8 | __tablename__ = 'language' 9 | # id = db.Column(db.Integer, 
primary_key=True) 10 | src_tgt_dmn = db.Column(db.String(50), primary_key=True) 11 | source_target_domain = db.Column(db.String(50), nullable=True) 12 | 13 | created_at = db.Column(db.DateTime(), nullable=False,\ 14 | server_default=db.func.now()) 15 | update_at = db.Column(db.DateTime(), nullable=False,\ 16 | server_default=db.func.now(), onupdate=db.func.now()) 17 | 18 | def __init__(self, src_tgt_dmn, source_target_domain="") : 19 | super().__init__() 20 | self.src_tgt_dmn = src_tgt_dmn 21 | self.source_target_domain = source_target_domain 22 | 23 | def save(self): 24 | db.session.add(self) 25 | db.session.commit() 26 | 27 | def to_json(self): 28 | source, target, domain = self.src_tgt_dmn.split('-') 29 | return { 30 | 'source': source, 31 | 'target': target, 32 | 'src-tgt_domn' : self.source_target_domain, 33 | 'domain': domain 34 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | 3 | services: 4 | 5 | server: 6 | # container_name: flask-api 7 | build : 8 | context: ./src/server 9 | dockerfile: Dockerfile 10 | command: python manage.py run -h 0.0.0.0 11 | volumes: 12 | - './src/server:/usr/src/app' 13 | - './models/joeynmt:/usr/src/app/models/joeynmt' 14 | ports: 15 | - 5000:5000 16 | 17 | env_file: 18 | - ./src/server/.env.dev 19 | 20 | depends_on: 21 | - db 22 | 23 | db: 24 | image: postgres:12-alpine 25 | volumes: 26 | - postgres_data:/var/lib/postgresql/data/ 27 | environment: 28 | - POSTGRES_USER=masakhane 29 | - POSTGRES_PASSWORD=masakhane 30 | - POSTGRES_DB=masakhane 31 | 32 | client: 33 | build : 34 | context: ./src/client 35 | dockerfile: Dockerfile 36 | # command: curl --location --request GET 'http://0.0.0.0:5000/update' --data-raw '' 37 | volumes: 38 | - './src/client:/usr/src/app' 39 | ports: 40 | - 3000:3000 41 | 42 | depends_on: 43 | - server 44 | 45 | # To persist the data beyond the life 
of the container 46 | # we configured a volume. This config will bind 47 | # postgres_data to the "/var/lib/postgresql/data/" directory in the container. 48 | volumes: 49 | postgres_data: 50 | -------------------------------------------------------------------------------- /src/client/public/bundle.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /* 2 | object-assign 3 | (c) Sindre Sorhus 4 | @license MIT 5 | */ 6 | 7 | /*! 8 | Copyright (c) 2017 Jed Watson. 9 | Licensed under the MIT License (MIT), see 10 | http://jedwatson.github.io/classnames 11 | */ 12 | 13 | /** @license React v0.20.1 14 | * scheduler.production.min.js 15 | * 16 | * Copyright (c) Facebook, Inc. and its affiliates. 17 | * 18 | * This source code is licensed under the MIT license found in the 19 | * LICENSE file in the root directory of this source tree. 20 | */ 21 | 22 | /** @license React v16.13.1 23 | * react-is.production.min.js 24 | * 25 | * Copyright (c) Facebook, Inc. and its affiliates. 26 | * 27 | * This source code is licensed under the MIT license found in the 28 | * LICENSE file in the root directory of this source tree. 29 | */ 30 | 31 | /** @license React v17.0.1 32 | * react-dom.production.min.js 33 | * 34 | * Copyright (c) Facebook, Inc. and its affiliates. 35 | * 36 | * This source code is licensed under the MIT license found in the 37 | * LICENSE file in the root directory of this source tree. 38 | */ 39 | 40 | /** @license React v17.0.1 41 | * react.production.min.js 42 | * 43 | * Copyright (c) Facebook, Inc. and its affiliates. 44 | * 45 | * This source code is licensed under the MIT license found in the 46 | * LICENSE file in the root directory of this source tree. 
47 | */ 48 | -------------------------------------------------------------------------------- /src/m_to_m_models/model_handlers.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from optimum.onnxruntime import ORTModelForSeq2SeqLM 3 | from optimum.pipelines import pipeline 4 | from pathlib import Path 5 | 6 | 7 | class OptimizedM100Model: 8 | def __init__(self, model_path, src_lang, tgt_lang): 9 | model_path = Path(model_path) 10 | assert model_path.exists(), "Model path does not exist" 11 | print("start loading the model........") 12 | self._model = ORTModelForSeq2SeqLM.from_pretrained(model_path) 13 | print("Model loaded successfully!") 14 | self._tokenizer = AutoTokenizer.from_pretrained(model_path) 15 | print("Tokenizer loaded successfully") 16 | self.pipeline = pipeline(f"translation_{src_lang}_to_{tgt_lang}", model=self._model, tokenizer=self._tokenizer) 17 | print("Pipeline created successfully") 18 | 19 | def predict_raw(self, X): 20 | data_to_translate = X.get("data") 21 | output = self.pipeline(data_to_translate) 22 | return output 23 | 24 | def health_status(self): 25 | text_to_translate = {"data": "Hello, my name is Espoir Murhabazi, I am a Software Engineer from Congo DRC but living in UK"} 26 | translation = self.predict_raw(text_to_translate) 27 | assert len(translation) == 1, "health check returning bad translation" 28 | assert translation[0].get("translation_text") is not None, "health check returning bad translation" 29 | return translation[0].get("translation_text") 30 | -------------------------------------------------------------------------------- /src/server/core/models/feedback.py: -------------------------------------------------------------------------------- 1 | from enum import unique 2 | 3 | from flask_sqlalchemy import SQLAlchemy 4 | 5 | from core.extensions import db 6 | 7 | class Feedback(db.Model): 8 | __tablename__ = 'feedback' 9 | id = 
db.Column(db.Integer, primary_key=True) 10 | 11 | src_lang = db.Column(db.String(20), nullable=False) 12 | tgt_lang = db.Column(db.String(20), nullable=False) 13 | accurate_translation = db.Column(db.String(800), nullable=False) 14 | know_src_lang = db.Column(db.String(50), nullable=False) 15 | know_tgt_lang = db.Column(db.String(50), nullable=False) 16 | own_translation = db.Column(db.String(800), nullable=True) 17 | translation = db.Column(db.String(800), nullable=False) 18 | text = db.Column(db.String(800), nullable=False) 19 | understand_translation = db.Column(db.String(50), nullable=False) 20 | feedbackToken = db.Column(db.String(100), nullable=False) 21 | 22 | 23 | created_at = db.Column(db.DateTime(), nullable=False,\ 24 | server_default=db.func.now()) 25 | update_at = db.Column(db.DateTime(), nullable=False,\ 26 | server_default=db.func.now(), onupdate=db.func.now()) 27 | 28 | # TODO We need to decide how we deal with duplicate on the review saving 29 | # __table_args__ = ( 30 | # # this can be db.PrimaryKeyConstraint if you want it to be a primary key 31 | # db.UniqueConstraint('input', 'review', 'stars'),) 32 | 33 | 34 | def save(self): 35 | db.session.add(self) 36 | db.session.commit() 37 | -------------------------------------------------------------------------------- /src/server/core/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from flask import current_app 5 | from flask_testing import TestCase 6 | 7 | from core import masakhane 8 | 9 | 10 | class TestDevelopmentConfig(TestCase): 11 | def create_app(self): 12 | masakhane.config.from_object('core.config.DevelopmentConfig') 13 | return masakhane 14 | 15 | def test_app_is_development(self): 16 | self.assertTrue(masakhane.config['SECRET_KEY'] == "super-secret-key") 17 | self.assertFalse(current_app is None) 18 | self.assertTrue( 19 | masakhane.config['SQLALCHEMY_DATABASE_URI'] == 20 | 
os.getenv('DATABASE_TEST_URL', "sqlite:///masakhane.db") 21 | ) 22 | 23 | class TestTestingConfig(TestCase): 24 | def create_app(self): 25 | masakhane.config.from_object('core.config.StagingConfig') 26 | return masakhane 27 | 28 | def test_app_is_testing(self): 29 | self.assertTrue(masakhane.config['SECRET_KEY'] == "key_testing") 30 | self.assertTrue(masakhane.config['TESTING']) 31 | self.assertTrue( 32 | masakhane.config['SQLALCHEMY_DATABASE_URI'] == 33 | os.getenv('DATABASE_TEST_URL', "sqlite:///masakhane.db") 34 | ) 35 | 36 | class TestProductionConfig(TestCase): 37 | def create_app(self): 38 | masakhane.config.from_object('core.config.ProductionConfig') 39 | return masakhane 40 | 41 | def test_app_is_production(self): 42 | self.assertTrue(masakhane.config['SECRET_KEY'] == "key_production") 43 | self.assertFalse(masakhane.config['TESTING']) 44 | 45 | if __name__ == '__main__': 46 | unittest.main() -------------------------------------------------------------------------------- /src/m_to_m_models/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 as base 2 | LABEL maintainer="Espoir Murhabazi" 3 | 4 | 5 | # Never prompt the user for choices on installation/configuration of packages 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV PYTHONUNBUFFERED=1 \ 8 | PORT=9000 \ 9 | PYTHONDONTWRITEBYTECODE=1 \ 10 | PIP_NO_CACHE_DIR=off \ 11 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 12 | PIP_DEFAULT_TIMEOUT=100 13 | 14 | 15 | FROM base AS python-deps 16 | RUN apt-get update \ 17 | && apt-get install --no-install-recommends -y \ 18 | curl \ 19 | build-essential\ 20 | software-properties-common 21 | 22 | RUN python -m venv /opt/venv 23 | # Make sure we use the virtualenv: 24 | ENV PATH="/opt/venv/bin:$PATH" 25 | 26 | # Install pip 27 | COPY requirements.txt ./ 28 | RUN pip install --upgrade pip 29 | RUN pip install -r requirements.txt 30 | 31 | 32 | 33 | FROM base AS runtime 34 | # copy nltk data 35 | COPY 
--from=python-deps /opt/venv /opt/venv 36 | 37 | 38 | RUN useradd --create-home masakhane 39 | RUN usermod -aG sudo masakhane 40 | RUN mkdir /home/masakhane/translation_app/ 41 | ENV WORKING_DIR=/home/masakhane/translation_app/ 42 | ENV PATH="${WORKING_DIR}:$PATH" 43 | ENV PATH="/opt/venv/bin:$PATH" 44 | ENV PYTHONPATH="/opt/venv/bin:$PYTHONPATH" 45 | ENV PYTHONPATH="${PYTHONPATH}:${WORKING_DIR}" 46 | 47 | ENV MODEL_NAME model_handlers.OptimizedM100Model 48 | 49 | ENV SERVICE_TYPE MODEL 50 | 51 | COPY model_handlers.py ${WORKING_DIR} 52 | WORKDIR ${WORKING_DIR} 53 | RUN chown -R masakhane:masakhane ${WORKING_DIR} 54 | RUN chmod -R 777 ${WORKING_DIR} 55 | USER masakhane 56 | EXPOSE 9000 5000 57 | 58 | CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE 59 | -------------------------------------------------------------------------------- /src/client/src/components/multiStepForm.js: -------------------------------------------------------------------------------- 1 | import { useForm, useStep } from "react-hooks-helper"; 2 | import React from 'react'; 3 | 4 | import Terms from "./terms"; 5 | import Step1 from "./step1"; 6 | import Step2 from "./step2"; 7 | import Step3 from "./step3"; 8 | 9 | const steps = [ 10 | { id: "terms" }, 11 | { id: "step1" }, 12 | { id: "step2" }, 13 | { id: "step3" }, 14 | ]; 15 | 16 | const defaultData = { 17 | know_src_lang: "little", 18 | know_tgt_lang: "little", 19 | understand_translation: "none", 20 | accurate_translation: "nonsense", 21 | own_translation: "" 22 | }; 23 | 24 | const MultiStepForm = ({ src_lang, tgt_lang, text, translation, setShow, submitFeedBack, setFeedbackToken, feedbackToken}) => { 25 | const [formData, setForm] = useForm({...defaultData, src_lang, tgt_lang, text, translation, feedbackToken}); 26 | const { step, navigation } = useStep({ initialStep: 0, steps }); 27 | const { id } = step; 28 | 29 | const handleSubmitFeedback = () => { 30 | console.log({formData}); 31 | // set formData to be 
feedback form 32 | submitFeedBack(formData); 33 | } 34 | 35 | const props = { src_lang, tgt_lang, text, translation, setShow, formData, setForm, navigation, handleSubmitFeedback, setFeedbackToken, feedbackToken}; 36 | 37 | switch (id) { 38 | case "terms": 39 | return ; 40 | case "step1": 41 | return ; 42 | case "step2": 43 | return ; 44 | case "step3": 45 | return ; 46 | 47 | default: 48 | return null; 49 | } 50 | } 51 | 52 | export default MultiStepForm; 53 | -------------------------------------------------------------------------------- /src/server/core/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | # external imports 3 | from flask import Flask 4 | from flask_migrate import Migrate 5 | from flask_restful import Api 6 | from flask_cors import CORS 7 | # internal imports 8 | from core.resources.translate import TranslateResource, AddResource, SaveResource, HomeResource 9 | from core.extensions import db 10 | from core.config import Config, DevelopmentConfig, ProductionConfig, StagingConfig 11 | 12 | 13 | #application factory 14 | def create_app(saved_models): 15 | """Flask application factory to config and init app""" 16 | env = os.environ.get('ENV', 'Development') 17 | if env == 'Production': 18 | config_str = ProductionConfig() 19 | elif env == 'Staging': 20 | config_str = StagingConfig() 21 | else: 22 | config_str = DevelopmentConfig() 23 | 24 | app = Flask(__name__) 25 | CORS(app) 26 | app.config.from_object(config_str) 27 | # database init 28 | register_extensions(app) 29 | # api init 30 | register_resources(app, saved_models) 31 | 32 | return app 33 | 34 | 35 | def register_extensions(app): 36 | db.init_app(app) 37 | migrate = Migrate(app, db) 38 | 39 | 40 | def register_resources(app, saved_models): 41 | api = Api(app) 42 | api.add_resource(HomeResource, '/') 43 | api.add_resource(TranslateResource, '/translate', resource_class_kwargs={'saved_models': saved_models}) 44 | # TODO need to find a 
better way to update the current app information without exposing it to the public
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # Mac OS-specific storage files 82 | .DS_Store 83 | 84 | # vim 85 | *.swp 86 | *.swo 87 | 88 | # Mypy cache 89 | .mypy_cache/ 90 | 91 | src/back-end/joeynmt/models/ 92 | models/joeynmt/ 93 | 94 | # node modules 95 | node_modules/ 96 | 97 | #cache 98 | .eslintcache 99 | 100 | src/server/models/joeynmt 101 | src/server/core/models/joeynmt 102 | 103 | .env.prod 104 | 105 | *.sqlite 106 | 107 | ### ignore model export 108 | 109 | onnx/ 110 | *.onnx 111 | model_store/ 112 | logs/ 113 | -------------------------------------------------------------------------------- /src/client/src/pages/Faq.js: -------------------------------------------------------------------------------- 1 | import { Container, Card } from 'react-bootstrap' 2 | import React from 'react'; 3 | 4 | export default function FAQPage() { 5 | return( 6 |
7 | 8 | 9 | 10 | FAQ 11 | {/* Enter subtitle here */} 12 |
13 | 14 | 1. I was not happy with the translation I got from the service. 15 | 16 |
17 |
18 |
19 | 20 | Thank you for trying this service. The Masakhane NLP Translation project built the models used to do the translation. 21 | This website provides a way for us to be able to test how well these models work. This service is still a work in progress and we expect the models to be improved every few months as we get more feedback from users such as yourself. 22 | Please do provide feedback by writing where there is a mistake in the translation so we can provide this information to the researchers. 23 | As such, this service is not a production system (should not be used for official translations). 24 | 25 |
26 |
27 |
28 |
29 |
30 |
31 | ) 32 | } -------------------------------------------------------------------------------- /src/server/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | alembic==1.5.4 3 | aniso8601==8.1.1 4 | astroid==2.4.2 5 | backcall==0.2.0 6 | cachetools==4.2.1 7 | chardet==4.0.0 8 | click==7.1.2 9 | cycler==0.10.0 10 | decorator==4.4.2 11 | Flask==1.1.2 12 | Flask-Cors==3.0.10 13 | Flask-Migrate==2.6.0 14 | Flask-RESTful==0.3.8 15 | Flask-SQLAlchemy==2.4.4 16 | future==0.18.2 17 | google-auth==1.26.1 18 | google-auth-oauthlib==0.4.2 19 | grpcio==1.35.0 20 | gdown==4.6.0 21 | idna==2.10 22 | importlib-metadata==3.4.0 23 | ipdb==0.13.4 24 | ipython==7.16.1 25 | ipython-genutils==0.2.0 26 | isort==5.7.0 27 | itsdangerous==1.1.0 28 | jedi==0.18.0 29 | Jinja2==2.11.3 30 | joeynmt==1.2 31 | kiwisolver==1.3.1 32 | lazy-object-proxy==1.4.3 33 | Mako==1.1.4 34 | Markdown==3.3.3 35 | MarkupSafe==1.1.1 36 | matplotlib==3.3.4 37 | mccabe==0.6.1 38 | Morfessor==2.0.6 39 | numpy==1.18.5 40 | oauthlib==3.1.0 41 | pandas==1.1.5 42 | parso==0.8.1 43 | pexpect==4.8.0 44 | pickleshare==0.7.5 45 | Pillow==8.1.0 46 | polyglot==16.7.4 47 | portalocker==2.2.1 48 | prompt-toolkit==3.0.16 49 | protobuf==3.14.0 50 | psycopg2-binary==2.8.6 51 | ptyprocess==0.7.0 52 | pyasn1==0.4.8 53 | pyasn1-modules==0.2.8 54 | pycld2==0.41 55 | pyglot==0.1.1 56 | Pygments==2.7.4 57 | PyICU==2.6 58 | pylint==2.6.0 59 | pyparsing==2.4.7 60 | python-dateutil==2.8.1 61 | python-editor==1.0.4 62 | pytz==2021.1 63 | PyYAML==5.4.1 64 | requests==2.25.1 65 | requests-oauthlib==1.3.0 66 | rsa==4.7 67 | sacrebleu==1.5.0 68 | scipy==1.5.4 69 | seaborn==0.11.1 70 | simplejson==3.17.2 71 | six==1.12.0 72 | SQLAlchemy==1.3.23 73 | subword-nmt==0.3.7 74 | tensorboard==2.4.1 75 | tensorboard-plugin-wit==1.8.0 76 | toml==0.10.2 77 | torch==1.7.1 78 | tqdm==4.56.2 79 | traitlets==4.3.3 80 | typed-ast==1.4.2 81 | typing-extensions==3.7.4.3 82 | urllib3==1.26.3 
83 | wcwidth==0.2.5 84 | Werkzeug==0.16.1 85 | wrapt==1.11.1 86 | zipp==3.4.0 87 | sacremoses==0.0.43 88 | # https://gunicorn.org/#deployment 89 | gunicorn==20.0.4 90 | Flask-Testing==0.6.2 -------------------------------------------------------------------------------- /src/m_to_m_models/kubernetes/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: machinelearning.seldon.io/v1 2 | kind: SeldonDeployment 3 | metadata: 4 | name: translation-deployment 5 | namespace: masakhane 6 | spec: 7 | name: translation-worker 8 | predictors: 9 | - componentSpecs: 10 | - spec: 11 | containers: 12 | - image: masakhane/translation:alpha 13 | name: translation-container 14 | imagePullPolicy: IfNotPresent 15 | env: 16 | - name: TRANSFORMERS_CACHE 17 | value: "/models_datastore/.cache" 18 | - name: GUNICORN_WORKERS 19 | value: '1' 20 | - name: GRPC_WORKERS 21 | value: '0' 22 | - name: SELDON_LOG_LEVEL 23 | value: DEBUG 24 | - name: SELDON_DEBUG 25 | value: 'True' 26 | - name: FLASK_DEBUG 27 | value: 'True' 28 | volumeMounts: 29 | - mountPath: "/models_datastore/" # mount the cache volume here 30 | name: translation-volume-storage 31 | resources: 32 | requests: 33 | memory: 8Gi 34 | cpu: 3 35 | limits: 36 | memory: 9Gi 37 | cpu: 4 38 | terminationGracePeriodSeconds: 1 39 | volumes: 40 | - name: translation-volume-storage 41 | persistentVolumeClaim: 42 | claimName: masakhane-model-cache-volume-claim 43 | graph: 44 | envSecretRefName: masakhane-container-secret 45 | children: [] 46 | endpoint: 47 | type: REST 48 | name: translation-container 49 | type: MODEL 50 | parameters: 51 | - name: model_path 52 | type: STRING 53 | value: "/models_datastore/" # this should come form volume. 
54 | - name: src_lang 55 | type: STRING 56 | value: "en" 57 | - name: tgt_lang 58 | type: STRING 59 | value: "sw" 60 | labels: 61 | version: v1 62 | name: translation-predictor 63 | replicas: 1 64 | -------------------------------------------------------------------------------- /src/server/core/utils.py: -------------------------------------------------------------------------------- 1 | from torchtext import data 2 | from torchtext.datasets import TranslationDataset 3 | 4 | 5 | from joeynmt.constants import UNK_TOKEN, EOS_TOKEN, BOS_TOKEN, PAD_TOKEN 6 | 7 | 8 | class MonoLineDataset(TranslationDataset): 9 | def __init__(self, line, field, **kwargs): 10 | examples = [] 11 | line = line.strip() 12 | fields = [('src', field)] 13 | examples.append(data.Example.fromlist([line], fields)) 14 | super(TranslationDataset, self).__init__(examples, fields, **kwargs) 15 | 16 | 17 | def load_line_as_data(line, level, lowercase, src_vocab, trg_vocab): 18 | """ 19 | Create a data set from one line. 20 | Workaround for the usual torchtext data handling. 21 | 22 | :param line: The input line to process. 23 | :param level: "char", "bpe" or "word". Determines segmentation of the input. 24 | :param lowercase: If True, lowercases inputs and outputs. 25 | :param src_vocab: Path to source vocabulary. 26 | :param trg_vocab: Path to target vocabulary. 
:return: tuple of (dataset built from the line, src_vocab, trg_vocab)
"webpack-dev-server --host 0.0.0.0", 36 | "start-api": "cd ../server && python app.py", 37 | "build": "react-scripts build", 38 | "test": "react-scripts test", 39 | "eject": "react-scripts eject" 40 | }, 41 | "eslintConfig": { 42 | "plugins": [ 43 | "testing-library" 44 | ], 45 | "rules": { 46 | "testing-library/await-async-query": "error", 47 | "testing-library/no-await-sync-query": "error", 48 | "testing-library/no-debug": "warn" 49 | } 50 | }, 51 | "browserslist": { 52 | "production": [ 53 | ">0.2%", 54 | "not dead", 55 | "not op_mini all" 56 | ], 57 | "development": [ 58 | "last 1 chrome version", 59 | "last 1 firefox version", 60 | "last 1 safari version" 61 | ] 62 | }, 63 | "devDependencies": { 64 | "eslint": "^7.18.0", 65 | "eslint-plugin-testing-library": "^3.10.1", 66 | "webpack-dev-server": "^3.11.2" 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/client/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | 3 | module.exports = { 4 | entry: path.resolve(__dirname, 'src', 'index.js'), 5 | output: { 6 | path: path.resolve(__dirname, 'public'), 7 | filename: 'bundle.js', 8 | publicPath: '/' 9 | }, 10 | devServer: { 11 | contentBase: path.resolve(__dirname, 'public'), 12 | open: true, 13 | clientLogLevel: 'silent', 14 | host: '0.0.0.0', 15 | port: 3000, 16 | historyApiFallback: true, 17 | compress: true, 18 | public: 'translate.masakhane.io:80', 19 | // proxy: { 20 | // '/': { 21 | // target: 'http://localhost:5000', 22 | // pathRewrite: { '^/api': '' }, 23 | // }, 24 | // "changeOrigin":true 25 | // } 26 | proxy: { 27 | '/': { 28 | // target: 'http://[::1]:5000', 29 | // todo: make the ip a configuration environment variable 30 | target: 'http://45.147.99.147:5000', 31 | // target: 'http://127.0.0.1:5000', 32 | bypass: function (req, res, proxyOptions) { 33 | if (req.headers.accept.indexOf('html') !== -1) { 34 | 
console.log('Skipping proxy for browser request.'); 35 | return '/index.html'; 36 | } 37 | }, 38 | }, 39 | }, 40 | }, 41 | module: { 42 | rules: [ 43 | { 44 | test: /\.(jsx|js)$/, 45 | include: path.resolve(__dirname, 'src'), 46 | exclude: /node_modules/, 47 | use: [{ 48 | loader: 'babel-loader', 49 | options: { 50 | presets: [ 51 | ['@babel/preset-env', { 52 | "targets": "defaults" 53 | }], 54 | '@babel/preset-react' 55 | ] 56 | } 57 | }] 58 | }, 59 | { 60 | test: /\.(jpg|png|svg)$/, 61 | include: path.resolve(__dirname, 'src'), 62 | exclude: /node_modules/, 63 | loader: 'url-loader', 64 | options: { 65 | limit: 25000, 66 | performance: { 67 | hints: false, 68 | maxEntrypointSize: 512000, 69 | maxAssetSize: 512000 70 | } 71 | }, 72 | 73 | } 74 | 75 | ] 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/server/Dockerfile.prod: -------------------------------------------------------------------------------- 1 | ########### 2 | # BUILDER # 3 | ########### 4 | 5 | # pull official base image 6 | FROM python:3.6.9 as builder 7 | 8 | 9 | # set working directory 10 | WORKDIR /usr/src/app 11 | 12 | 13 | # set environment variables 14 | ENV PYTHONDONTWRITEBYTECODE 1 15 | ENV PYTHONUNBUFFERED 1 16 | 17 | # install system dependencies 18 | RUN apt-get update && apt-get install -y netcat && \ 19 | apt-get install -y --no-install-recommends gcc 20 | 21 | 22 | RUN apt-get update 23 | RUN apt-get install -y gnupg lsb-release wget 24 | 25 | RUN lsb_release -c -s > /tmp/lsb_release 26 | RUN GCSFUSE_REPO=$(cat /tmp/lsb_release); echo "deb http://packages.cloud.google.com/apt gcsfuse-$GCSFUSE_REPO main" | tee /etc/apt/sources.list.d/gcsfuse.list 27 | RUN wget -O - https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - 28 | 29 | RUN apt-get update 30 | RUN apt-get install -y gcsfuse 31 | 32 | 33 | # lint 34 | RUN pip install --upgrade pip 35 | RUN pip install flake8 36 | RUN flake8 --ignore=E501,F401 . 
37 | 38 | # add and install requirements 39 | COPY ./requirements.txt /usr/src/app/requirements.txt 40 | # RUN pip install -r requirements.txt 41 | RUN pip wheel --no-cache-dir --no-deps --wheel-dir /usr/src/app/wheels -r requirements.txt 42 | 43 | 44 | ######### 45 | # FINAL # 46 | ######### 47 | 48 | FROM python:3.6.9 49 | 50 | # create directory for the app user 51 | RUN mkdir -p /home/app 52 | 53 | # create the app user 54 | RUN addgroup --system app && adduser --system --group app 55 | 56 | # create the appropriate directories 57 | ENV HOME=/home/app 58 | ENV APP_HOME=/home/app/web 59 | RUN mkdir $APP_HOME 60 | WORKDIR $APP_HOME 61 | 62 | # install dependencies 63 | RUN apt-get update && apt-get install -y --no-install-recommends netcat 64 | COPY --from=builder /usr/src/app/wheels /wheels 65 | COPY --from=builder /usr/src/app/requirements.txt . 66 | RUN pip install --upgrade pip 67 | RUN pip install --no-cache /wheels/* 68 | 69 | # copy entrypoint-prod.sh 70 | COPY ./entrypoint.prod.sh $APP_HOME 71 | 72 | 73 | # copy project 74 | COPY . 
$APP_HOME 75 | 76 | # chown all the files to the app user 77 | RUN chown -R app:app $APP_HOME 78 | 79 | # change to the app user 80 | USER app 81 | 82 | # run entrypoint.prod.sh 83 | ENTRYPOINT ["/home/app/web/entrypoint.prod.sh"] -------------------------------------------------------------------------------- /docs/start_app_prod_doc.md: -------------------------------------------------------------------------------- 1 | # **Running the App In Production** 2 | To run the app locally, see [here](start_app_locally_doc.md) 3 | 4 | ## **Table of Contents** 5 | - [**Docker Setup**](#docker-setup) 6 | - [**Running the app**](#running-the-app) 7 | - [**Building the App**](#building-the-app) 8 | - [**Shut down the app**](#shut-down-the-app) 9 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages) 10 | - [**Running tests**](#running-tests) 11 | 12 | 13 | ## **Docker Setup** 14 | 15 | Ensure you have `docker` & `docker-compose` installed on your computer, you can check with the following commands: 16 | ```bash 17 | docker --version 18 | docker-compose --version 19 | ``` 20 | 21 | If the above commands return an error, please install [Docker](https://docs.docker.com/engine/install/) and [Docker-compose](https://docs.docker.com/compose/install/). 
22 | 23 | ## **Running the app** 24 | ### **Building the App** 25 | To build the app, from the root project directory, run the following command: 26 | ```bash 27 | docker-compose -f docker-compose.prod.yml up -d --build 28 | ``` 29 | 30 | ### **Shut down the app** 31 | To shut down the app, run the following command to remove the docker container: 32 | ```bash 33 | docker-compose -f docker-compose.prod.yml down 34 | ``` 35 | 36 | ### **Add, Update, & Delete Languages** 37 | **Add a Language** 38 | ```bash 39 | docker-compose -f docker-compose.prod.yml exec api python manage.py add_language en-sw-JW300 40 | ``` 41 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform} 42 | So `en-sw-JW300` represents English-Swahili using JW300 shortform 43 | **Note** - A code parameter example without shortform is `en-tiv-` 44 | 45 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv) 46 | 47 | **Update Languages** 48 | ```bash 49 | curl --request GET 'http://127.0.0.1:5000/update' 50 | ``` 51 | 52 | **Check available languages** 53 | ```bash 54 | docker-compose -f docker-compose.prod.yml exec api python manage.py all_languages 55 | ``` 56 | 57 | **Remove a language** 58 | ```bash 59 | docker-compose -f docker-compose.prod.yml exec api python manage.py remove_language en-sw-JW300 60 | ``` 61 | 62 | ### **Running tests** 63 | ```bash 64 | docker-compose -f docker-compose.prod.yml exec api python manage.py tests 65 | ``` -------------------------------------------------------------------------------- /src/server/core/tests/test_app.py: -------------------------------------------------------------------------------- 1 | # test_hello.py 2 | # from app import create_app 3 | from flask import json, jsonify 4 | 5 | import os 6 | import unittest 7 | 8 | from flask import current_app 9 | from flask_testing import TestCase 10 | from core import masakhane, load_model, create_app 11 | 12 | # from core
import masakhane 13 | from core.tests.base import BaseTestCase 14 | 15 | class TestAppService(BaseTestCase): 16 | 17 | def test_home_page(self): 18 | "Test the home endpoint" 19 | app = masakhane 20 | response = app.test_client().get('/') 21 | 22 | data = response.get_json() 23 | 24 | assert response.status_code == 200 25 | 26 | assert data['message'] == "welcome Masakhane Web" 27 | 28 | # TODO We will need to have a dump database to check this 29 | # def test_translation(self): 30 | # app = masakhane 31 | # response = app.test_client().post( 32 | # '/translate', 33 | # data = json.dumps({ 34 | # "src_lang":"English", 35 | # "tgt_lang":"swahili", 36 | # "input":"My name is Salomon" 37 | # }), 38 | # content_type='application/json', 39 | # ) 40 | 41 | # data = response.get_json() 42 | 43 | # # assert response.status_code == 201 # created 44 | 45 | # # Givent that we can't know exactly the output of the translation 46 | # # we can test that some result are return 47 | # print(data) 48 | # assert data['output'] != "" 49 | 50 | # def test_save(): 51 | # """ 52 | # Test the save endpoint by checking the status code 53 | # and the responce message. 
54 | # """ 55 | # app = create_app() 56 | # response = app.test_client().post( 57 | # '/save', 58 | # data = json.dumps({ 59 | # "src_lang":"en", 60 | # "tgt_lang":"sw", 61 | # "input":"How are you doing today ?", 62 | # "review":"Test Saving", 63 | # "stars":"5", 64 | # "token":"ww2wki&idjj11yyy"}), 65 | # content_type='application/json', 66 | # ) 67 | 68 | 69 | # assert response.status_code == 201 70 | 71 | # assert b"Review saved" in response.data 72 | 73 | if __name__=='__main__': 74 | unittest.main() -------------------------------------------------------------------------------- /src/client/src/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-python3.10.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | appnope==0.1.3 4 | asttokens==2.2.1 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | backcall==0.2.0 8 | captum==0.6.0 9 | certifi==2022.12.7 10 | cffi==1.15.1 11 | charset-normalizer==3.0.1 12 | click==8.0.4 13 | coloredlogs==15.0.1 14 | comm==0.1.2 15 | contourpy==1.0.7 16 | cryptography==3.4.8 17 | cycler==0.11.0 18 | datasets==2.10.0 19 | debugpy==1.6.6 20 | decorator==5.1.1 21 | dill==0.3.6 22 | enum-compat==0.0.3 23 | executing==1.2.0 24 | filelock==3.9.0 25 | Flask==2.2.3 26 | Flask-Cors==3.0.10 27 | Flask-OpenTracing==1.1.0 28 | flatbuffers==1.12 29 | fonttools==4.38.0 30 | frozenlist==1.3.3 31 | fsspec==2023.1.0 32 | grpcio==1.51.3 33 | grpcio-opentracing==1.1.4 34 | grpcio-reflection==1.34.1 35 | gunicorn==20.1.0 36 | huggingface-hub==0.12.1 37 | humanfriendly==10.0 38 | idna==3.4 39 | ipykernel==6.21.2 40 | ipython==8.10.0 41 | itsdangerous==2.1.2 42 | jaeger-client==4.4.0 43 | jedi==0.18.2 44 | Jinja2==3.1.2 45 | jsonschema==3.2.0 46 | jupyter_client==8.0.3 47 | jupyter_core==5.2.0 48 | kiwisolver==1.4.4 49 | MarkupSafe==2.1.2 50 | 
matplotlib==3.7.0 51 | matplotlib-inline==0.1.6 52 | mpmath==1.2.1 53 | multidict==6.0.4 54 | multiprocess==0.70.14 55 | nest-asyncio==1.5.6 56 | numpy==1.23.5 57 | onnx==1.13.1 58 | onnxruntime==1.13.1 59 | onnxruntime-tools==1.7.0 60 | opentracing==2.4.0 61 | optimum==1.6.4 62 | ort-nightly==1.11.0.dev20220320001 63 | packaging==23.0 64 | pandas==1.5.3 65 | parso==0.8.3 66 | pexpect==4.8.0 67 | pickleshare==0.7.5 68 | Pillow==9.4.0 69 | platformdirs==3.0.0 70 | prometheus-client==0.8.0 71 | prompt-toolkit==3.0.37 72 | protobuf==3.20.3 73 | psutil==5.9.4 74 | ptyprocess==0.7.0 75 | pure-eval==0.2.2 76 | py-cpuinfo==9.0.0 77 | py3nvml==0.2.7 78 | pyarrow==11.0.0 79 | pycodestyle==2.10.0 80 | pycparser==2.21 81 | Pygments==2.14.0 82 | pyparsing==3.0.9 83 | pyrsistent==0.19.3 84 | python-dateutil==2.8.2 85 | pytz==2022.7.1 86 | PyYAML==5.4.1 87 | pyzmq==25.0.0 88 | regex==2022.10.31 89 | requests==2.28.2 90 | responses==0.18.0 91 | seldon-core==1.15.0 92 | sentencepiece==0.1.97 93 | six==1.16.0 94 | stack-data==0.6.2 95 | sympy==1.11.1 96 | threadloop==1.0.2 97 | thrift==0.16.0 98 | tokenizers==0.13.2 99 | torch==1.13.1 100 | torch-model-archiver==0.7.1 101 | torch-workflow-archiver==0.2.7 102 | torchserve==0.7.1 103 | tornado==6.2 104 | tqdm==4.64.1 105 | traitlets==5.9.0 106 | transformers==4.26.1 107 | typing_extensions==4.5.0 108 | urllib3==1.26.14 109 | wcwidth==0.2.6 110 | Werkzeug==2.2.3 111 | xmltodict==0.13.0 112 | xxhash==3.2.0 113 | yarl==1.8.2 114 | -------------------------------------------------------------------------------- /src/m_to_m_models/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | appnope==0.1.3 4 | asttokens==2.2.1 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | backcall==0.2.0 8 | captum==0.6.0 9 | certifi==2022.12.7 10 | cffi==1.15.1 11 | charset-normalizer==3.0.1 12 | click==8.0.4 13 | coloredlogs==15.0.1 14 | comm==0.1.2 15 | contourpy==1.0.7 
16 | cryptography==3.4.8 17 | cycler==0.11.0 18 | datasets==2.10.0 19 | debugpy==1.6.6 20 | decorator==5.1.1 21 | dill==0.3.6 22 | enum-compat==0.0.3 23 | executing==1.2.0 24 | filelock==3.9.0 25 | Flask==2.2.3 26 | Flask-Cors==3.0.10 27 | Flask-OpenTracing==1.1.0 28 | flatbuffers==1.12 29 | fonttools==4.38.0 30 | frozenlist==1.3.3 31 | fsspec==2023.1.0 32 | grpcio==1.51.3 33 | grpcio-opentracing==1.1.4 34 | grpcio-reflection==1.34.1 35 | gunicorn==20.1.0 36 | huggingface-hub==0.12.1 37 | humanfriendly==10.0 38 | idna==3.4 39 | ipykernel==6.21.2 40 | ipython==8.10.0 41 | itsdangerous==2.1.2 42 | jaeger-client==4.4.0 43 | jedi==0.18.2 44 | Jinja2==3.1.2 45 | jsonschema==3.2.0 46 | jupyter_client==8.0.3 47 | jupyter_core==5.2.0 48 | kiwisolver==1.4.4 49 | MarkupSafe==2.1.2 50 | matplotlib==3.7.0 51 | matplotlib-inline==0.1.6 52 | mpmath==1.2.1 53 | multidict==6.0.4 54 | multiprocess==0.70.14 55 | nest-asyncio==1.5.6 56 | numpy==1.23.5 57 | onnx==1.13.1 58 | onnxruntime==1.13.1 59 | onnxruntime-tools==1.7.0 60 | opentracing==2.4.0 61 | optimum==1.6.4 62 | ort-nightly==1.11.0.dev20220320001 63 | packaging==23.0 64 | pandas==1.5.3 65 | parso==0.8.3 66 | pexpect==4.8.0 67 | pickleshare==0.7.5 68 | Pillow==9.4.0 69 | platformdirs==3.0.0 70 | prometheus-client==0.8.0 71 | prompt-toolkit==3.0.37 72 | protobuf 73 | psutil==5.9.4 74 | ptyprocess==0.7.0 75 | pure-eval==0.2.2 76 | py-cpuinfo==9.0.0 77 | py3nvml==0.2.7 78 | pyarrow==11.0.0 79 | pycodestyle==2.10.0 80 | pycparser==2.21 81 | Pygments==2.14.0 82 | pyparsing==3.0.9 83 | pyrsistent==0.19.3 84 | python-dateutil==2.8.2 85 | pytz==2022.7.1 86 | PyYAML==5.4.1 87 | pyzmq==25.0.0 88 | regex==2022.10.31 89 | requests==2.28.2 90 | responses==0.18.0 91 | seldon-core==1.15.0 92 | sentencepiece==0.1.97 93 | six==1.16.0 94 | stack-data==0.6.2 95 | sympy==1.11.1 96 | threadloop==1.0.2 97 | thrift==0.16.0 98 | tokenizers==0.13.2 99 | torch==1.13.1 100 | torch-model-archiver==0.7.1 101 | torch-workflow-archiver==0.2.7 102 | 
torchserve==0.7.1 103 | tornado==6.2 104 | tqdm==4.64.1 105 | traitlets==5.9.0 106 | transformers==4.26.1 107 | typing_extensions==4.5.0 108 | urllib3==1.26.14 109 | wcwidth==0.2.6 110 | Werkzeug==2.2.3 111 | xmltodict==0.13.0 112 | xxhash==3.2.0 113 | yarl==1.8.2 114 | -------------------------------------------------------------------------------- /src/client/src/App.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import { 3 | BrowserRouter as Router, 4 | Switch, 5 | Route 6 | } from "react-router-dom"; 7 | import { Navbar, Nav, Container, Jumbotron, Image, Row, Col } from 'react-bootstrap' 8 | 9 | import Home from './pages/Home'; 10 | import About from './pages/About'; 11 | import FAQPage from './pages/Faq'; 12 | import image from './images/masakhane-border.png'; 13 | 14 | 15 | function App() { 16 | return ( 17 | 18 |
19 | 20 | Masakhane 21 | 22 | 23 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |

Masakhane

35 |

Machine translation service for African languages

36 |
37 |
38 |
39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | {/* 52 |
53 |
54 | 55 |
56 |

This is a community research project. Read more about it here

57 |
*/} 58 |
59 |
60 | ); 61 | } 62 | 63 | export default App; 64 | -------------------------------------------------------------------------------- /src/client/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 13 | 14 | 15 | 16 | 17 | 18 | 22 | 23 | 24 | 30 | 31 | 32 | 36 | 37 | 46 | Masakhane Web 47 | 48 | 49 | 50 |
51 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/client/public/217.bundle.js: -------------------------------------------------------------------------------- 1 | (self.webpackChunkmasakhane=self.webpackChunkmasakhane||[]).push([[217],{217:function(t,n,e){"use strict";e.r(n),e.d(n,{getCLS:function(){return m},getFCP:function(){return g},getFID:function(){return h},getLCP:function(){return y},getTTFB:function(){return F}});var i,a,r=function(){return"".concat(Date.now(),"-").concat(Math.floor(8999999999999*Math.random())+1e12)},o=function(t){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:-1;return{name:t,value:n,delta:0,entries:[],id:r(),isFinal:!1}},u=function(t,n){try{if(PerformanceObserver.supportedEntryTypes.includes(t)){var e=new PerformanceObserver((function(t){return t.getEntries().map(n)}));return e.observe({type:t,buffered:!0}),e}}catch(t){}},s=!1,c=!1,f=function(t){s=!t.persisted},l=function(){addEventListener("pagehide",f),addEventListener("beforeunload",(function(){}))},p=function(t){var n=arguments.length>1&&void 0!==arguments[1]&&arguments[1];c||(l(),c=!0),addEventListener("visibilitychange",(function(n){var e=n.timeStamp;"hidden"===document.visibilityState&&t({timeStamp:e,isUnloading:s})}),{capture:!0,once:n})},d=function(t,n,e,i){var a;return function(){e&&n.isFinal&&e.disconnect(),n.value>=0&&(i||n.isFinal||"hidden"===document.visibilityState)&&(n.delta=n.value-(a||0),(n.delta||n.isFinal||void 0===a)&&(t(n),a=n.value))}},m=function(t){var n,e=arguments.length>1&&void 0!==arguments[1]&&arguments[1],i=o("CLS",0),a=function(t){t.hadRecentInput||(i.value+=t.value,i.entries.push(t),n())},r=u("layout-shift",a);r&&(n=d(t,i,r,e),p((function(t){var e=t.isUnloading;r.takeRecords().map(a),e&&(i.isFinal=!0),n()})))},v=function(){return void 0===i&&(i="hidden"===document.visibilityState?0:1/0,p((function(t){var n=t.timeStamp;return i=n}),!0)),{get timeStamp(){return 
i}}},g=function(t){var n,e=o("FCP"),i=v(),a=u("paint",(function(t){"first-contentful-paint"===t.name&&t.startTime1&&void 0!==arguments[1]&&arguments[1],i=o("LCP"),a=v(),r=function(t){var e=t.startTime;e 25 | 26 | Endpoint Description Returns (on success) 27 | 28 | 29 | 30 | `/` 31 | 32 | The base endpoint 33 | 34 | 35 | ```json 36 | { 37 | "message": "welcome Masakhane Web" 38 | } 39 | ``` 40 | 41 | 42 | 43 | 44 | 45 | `/translate` 46 | 47 | Lists the saved models 48 | 49 | 50 | ```json 51 | [ 52 | { 53 | "type": "source", 54 | "name": "English", 55 | "value": "en", 56 | "targets": [ 57 | { 58 | "name": "Swahili", 59 | "value": "sw" 60 | } 61 | ] 62 | } 63 | ] 64 | ``` 65 | 66 | 67 | 68 | 69 | 70 | 71 | `/update` 72 | 73 | Updates the local database with the newly loaded models 74 | 75 | 76 | ```json 77 | { 78 | "message": "models updated" 79 | } 80 | ``` 81 | 82 | 83 | 84 | 85 | 86 | ### **POST** 87 | 88 | 89 | 90 | 91 | 92 | 93 | 98 | 109 | 121 | 122 | 123 | 128 | 142 | 151 | 152 | 153 |
Endpoint Description Example Body Returns (on success)
94 | 95 | `/translate` 96 | 97 | Returns the translated text 99 | 100 | ```json 101 | { 102 | "src_lang": "english", 103 | "tgt_lang": "swahili", 104 | "input": "how are you?" 105 | } 106 | ``` 107 | 108 | 110 | 111 | ```json 112 | { 113 | "src_lang": "english", 114 | "tgt_lang": "swahili", 115 | "input": "Hello, how are you?", 116 | "output": "kwa ukunjufu" 117 | } 118 | ``` 119 | 120 |
124 | 125 | `/save` 126 | 127 | Saves the translation feedback 129 | 130 | ```json 131 | { 132 | "srcX_lang": "english", 133 | "tgt_lang": "swahili", 134 | "input": "Hello, how are you?", 135 | "review": "translation correction", 136 | "stars": "translation confidence", 137 | "token": "user auth (bool)" 138 | } 139 | ``` 140 | 141 | 143 | 144 | ```json 145 | { 146 | "message": "Review saved", 147 | } 148 | ``` 149 | 150 |
154 | 155 | # Manage CLI 156 | There is a cli program for managing the server - it is in [src/server/manage.py]() 157 | 158 | The command format is: 159 | ```bash 160 | python manage.py command optional_parameter 161 | ``` 162 | 163 | | Command | Parameter | Description | 164 | | ------- | --------- | ----------- | 165 | | `create_db` | none | Creates database tables for the db models Language & Feedback 166 | | `all_languages` | none | Lists the model info stored in the Language table 167 | | `add_language` | `name_tag` | Adds a language with a given name_tag, ie - `en-sw-JW300 OR en-tiv-`| 168 | | `remove_language` | `name_tag`| Removes a language with a given name_tag | 169 | | `clean` | none | Deletes and recreates an empty database | 170 | | `tests` | none | Runs the backend tests | 171 | 172 | # Tests 173 | 174 | **TODO** -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Masakhane WEB - A Machine Translation Web Platform for African Languages 2 | 3 |
4 | 5 |
6 | 7 | 8 | [**Masakhane**](https://www.masakhane.io/) meaning ‘we build together’, is a research effort for machine translation for African languages which is open source and online. So far, the community has built translation models based on [Joey NMT](https://github.com/joeynmt/joeynmt) for over 38 African languages. As such, **Masakhane Web** is a platform that aims to host the already trained models from the community and allow contributions from users to create new data for retraining. The objective of this web application is to provide access to an open-source platform that makes available relatively accurate translations for languages across Africa. If you can't find your language and/or would like to train your own machine translation model in your language, see https://github.com/masakhane-io/masakhane-mt on how you can contribute. 9 | 10 | 11 | **Disclaimer:** This system is for research purposes only and should be taken as work in progress. None of the trained models are suitable for production usage. 
12 | 13 | ## Table of contents 14 | - [Running The App](#running-the-app) 15 | - [Contributing](#contributing) 16 | - [Options](#options) 17 | - [Submitting Changes\[Pull Request\]](#submitting-changespull-request) 18 | - [Contributors](#contributors) 19 | - [Contact Us](#contact-us) 20 | - [License](#license) 21 | - [Citing the project](#citing-the-project) 22 | - [Acknowledgements](#acknowledgements) 23 | 24 | 25 | # Running The App 26 | To run the app locally, see [here](/docs/start_app_locally_doc.md#running-the-app-locally) 27 | To run the app in production, see [here](/docs/start_app_prod_doc.md#running-the-app-in-production) 28 | 29 | # Contributing 30 | 31 | 32 | ## Options 33 | 34 | - *Can't see your language as one of the supported languages: Visit [Masakhane:Building your first machine translation model](https://github.com/masakhane-io/masakhane-mt#building-your-first-machine-translation-model) to learn more about how you can train a model for your language.* 35 | 36 | - *I have an idea or a new feature: Create a new issue first, assign it to yourself and then fork the repo* 37 | 38 | - *I want to help in improving the accuracy of the models: Check out below on how you can reach out to us* 39 | 40 | 41 | 42 | ## Submitting Changes[Pull Request] 43 | 44 | - See [https://opensource.com/article/19/7/create-pull-request-github](https://opensource.com/article/19/7/create-pull-request-github) 45 | 46 | 47 | 48 | # Contributors 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | Made with [contributors-img](https://contrib.rocks). 
59 | 60 | 61 | 62 | 63 | # Contact Us 64 | 65 | - Vukosi Marivate - vukosi.marivate@cs.up.ac.za 66 | 67 | - Abiodun Modupe - abiodun.modupe@cs.up.ac.za 68 | 69 | - Salomon Kabongo - skabenamualu@aimsammi.org 70 | 71 | - Catherine Gitau - cgitau@aimsammi.org 72 | 73 | 74 | 75 | # License 76 | 77 | [MIT](https://mit-license.org/) 78 | 79 | 80 | 81 | ## Citing the project 82 | 83 | **On a visualisation/notebook/webapp:** 84 | 85 | > Data Science for Social Impact Research Group @ University of Pretoria, Masakhane NLP, *Masakhane WEB - A Machine Translation Web Platform for African Languages* Available on: [https://github.com/dsfsi/masakhane-web](https://github.com/dsfsi/masakhane-web). 86 | 87 | **In a publication** 88 | Software 89 | 90 | > @software { marivate_vukosi_2021_4745501, 91 | > author = {Marivate, Vukosi and Gitau, Catherine and Kabenamualu, Salomon and Modupe, Abiodun and Masakhane NLP}, 92 | > title = {{Masakhane WEB - A Machine Translation Web Platform for African Languages}}, 93 | > month = may, year = 2021, 94 | > publisher = {Zenodo}, 95 | > version = {0.9}, 96 | > doi = {10.5281/zenodo.4745501}, 97 | > url = {[https://doi.org/10.5281/zenodo.4745501](https://doi.org/10.5281/zenodo.4745501)} 98 | > } 99 | 100 | 101 | 102 | # Acknowledgements 103 | 104 | 105 | 106 | We want to acknowledge support from the following organisations 107 | 108 | - [Mozilla](https://www.mozilla.org/en-US/moss/) 109 | 110 | - [Google Cloud Platform](https://cloud.google.com/) -------------------------------------------------------------------------------- /src/client/src/components/terms.js: -------------------------------------------------------------------------------- 1 | import { Row, Card, Button } from 'react-bootstrap'; 2 | import { v4 as uuidv4 } from 'uuid'; 3 | import React from 'react'; 4 | 5 | const Terms = ({ setShow, navigation, setFeedbackToken, feedbackToken}) => { 6 | const { next } = navigation; 7 | 8 | const accept = () => { 9 | if(feedbackToken !== '') { 10
| next(); 11 | } else { 12 | // generate token 13 | const token = uuidv4(); 14 | // set token 15 | localStorage.setItem('feedbackToken', token); 16 | setFeedbackToken(token); 17 | // proceed 18 | next(); 19 | } 20 | } 21 | 22 | const handleDecline = () => { 23 | // close modal 24 | setShow(false); 25 | } 26 | 27 | return ( 28 |
29 | 30 | 31 | Terms & Conditions 32 | Dear Sir/Madam, 33 |
34 | 35 | I am Dr Vukosi Marivate, principal investigator of the Data Science for Social Impact research group at the Department of Computer Science at the University of Pretoria. 36 | The research project is titled Masakhane Web Feedback Analysis for African Language Task Models. 37 | The study aims to understand the challenges in automated translation models for African languages. 38 | The models themselves are sourced from the Masakhane project (our collaborators) and are all a work in progress. By better providing feedback to model designers, we can work to improve the models and conduct research on African Language Natural Language Processing. 39 | The purpose of this questionnaire/feedback form is to collect information on the quality of the translations that are on the Masakhane Web system currently. 40 | The user participation is voluntary, and you can withdraw at any time without penalty. 41 | 42 |
43 |
44 |
45 | 46 | Throughout the feedback from the participants, their privacy remains confidential. 47 | Hence, we only collect the following information: 48 | 49 | 1. The user has the option to accept or reject to participate in the feedback survey, 50 | 51 | 52 | 2. The participants are required to indicate their level of proficiencies of the languages translated by the model, 53 | 54 | 55 | 3. and your submitted feedback to the translations is stored on our server. No personal information is collected. 56 | 57 | 58 |
59 |
60 |
61 | 62 | If you agree to participate, please complete the survey that follows this cover letter. 63 | It should take about 5 minutes of your time at the most for feedback on each translation. 64 | By completing the survey, you indicate your willingness to participate in this research. 65 | 66 | If you have any concerns, please contact me with the details provided below. 67 |
68 | Dr. Vukosi Marivate 69 |
70 | vukosi.marivate@cs.up.ac.za 71 |
72 |
73 |
74 |
75 | 76 |
77 | 78 |
79 |
80 | 81 |
82 |
83 |
84 | ) 85 | } 86 | 87 | export default Terms; 88 | -------------------------------------------------------------------------------- /src/client/src/pages/About.js: -------------------------------------------------------------------------------- 1 | import { Container, Card } from 'react-bootstrap' 2 | import React from 'react'; 3 | 4 | export default function About() { 5 | return( 6 |
7 | 8 | 9 | 10 | About 11 | Masakhane Web 12 |
13 | 14 | Masakhane Web is an open source online machine translation service for solely African languages. 15 | This project is in line with the works of the Masakhane community . Masakhane meaning ‘we build together’, 16 | is a research effort whose mission is to strengthen and spur NLP research for African languages which is open source and online. 17 | So far, the community has trained translation models for over 38 African languages. As such, this platform aims at hosting the already trained machine translation models from the Masakhane community and allows contributions 18 | from users to create new data for retraining and improving the models.
19 |
20 |
21 |
22 | 23 |
24 | The Masakhane Web project is led by Data Science for Social Impact research group at the Department of Computer Science, University of Pretoria, South Africa. 25 |
26 |
27 |
28 |
29 | 30 | The feedback mechanism of this project has been approved by the University of Pretoria Faculty of Engineering, Built Environment and Information Technology(EBIT) Research Ethics Committee. 31 | 32 |
33 | 34 |
35 |
36 | 37 | If you would like to contribute to this project, train a model in your language or want to collaborate and work with Masakhane, find out how in https://github.com/dsfsi/masakhane-web or reach out to any of the Masakhane Web contributors in the following ways: 38 | 39 |
40 |
41 | 42 |
43 |
44 | Dr. Vukosi Marivate 45 |
46 | vukosi.marivate@cs.up.ac.za 47 |
48 | @vukosi 49 |
50 |
51 | Abiodun Modupe 52 |
53 | abiodun.modupe@cs.up.ac.za 54 |
55 |
56 | Salomon Kabongo 57 |
58 | skabenamualu@aimsammi.org 59 |
60 | @SalomonKabongo 61 |
62 |
63 | Catherine Gitau 64 |
65 | cgitau@aimsammi.org 66 |
67 | @categitau_ 68 |
69 | 70 | 71 |
72 |
73 |
74 |
75 |
76 |
77 |
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3

#################################################################################
# GLOBALS                                                                       #
#################################################################################

PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = mit-808-starter
PYTHON_INTERPRETER = python3

# Detect whether a conda executable is available on PATH.
ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py

#################################################################################
# PROJECT RULES                                                                 #
#################################################################################



#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

.DEFAULT_GOAL := help

# The sed program below pairs each "## " doc comment with the target name that
# follows it ("target---description"); awk then prints the target in a fixed
# cyan column and word-wraps the description to the terminal width.
# Separate -e expressions are necessary because sed labels cannot be delimited
# by a semicolon.
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
25 |
26 |
Part 1/2
27 |
28 | 29 |
30 |

How well do you know {src_lang}?

31 |
32 | 33 | 34 | 41 | 42 | 43 | 50 | 51 | 52 | 59 | 60 | 61 | 68 | 69 | 70 |
71 |
72 | 73 |
74 |
75 |
76 | 77 |
78 |

How well do you know {tgt_lang}?

79 |
80 | 81 | 82 | 89 | 90 | 91 | 98 | 99 | 100 | 107 | 108 | 109 | 116 | 117 | 118 |
119 |
120 | 121 |
122 |
123 |
124 | 125 |
126 | 127 |
128 |
"""Download Hugging Face transformer models and save them for TorchServe.

Reads a JSON settings file (``setup_config.json`` by default, or the file
named by the first CLI argument) and saves either the pretrained weights +
tokenizer or a TorchScript trace under ``transformer_models/<model_name>/``.
"""
import json
import os
import sys
from pathlib import Path

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    set_seed,
)

print("Transformers version", transformers.__version__)
set_seed(1)
# Trace/run on GPU when one is available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def transformers_model_downloader(
    mode,
    pretrained_model_name,
    num_labels,
    do_lower_case,
    max_length,
    torchscript,
    save_mode="pretrained",
):
    """Download a transformer model and save it for serving.

    Saves the checkpoint and config file along with the tokenizer config and
    vocab files of the chosen transformer model.

    Args:
        mode: Task type; one of "sequence_classification",
            "question_answering", "token_classification", "text_generation"
            or "translation" (the mode added for the Masakhane models).
        pretrained_model_name: Hugging Face model-hub name or local path.
        num_labels: Label count for classification heads (ignored otherwise).
        do_lower_case: Tokenizer lower-casing flag.
        max_length: Sequence length used for the TorchScript trace input.
        torchscript: Whether the model config should enable torchscript.
        save_mode: "pretrained" to save weights + tokenizer, or
            "torchscript" to save a traced model. FIX: this used to be read
            from a global defined only under ``__main__``, so importing and
            calling this function raised NameError; it is now an explicit,
            backward-compatible keyword parameter.

    Raises:
        ValueError: If ``mode`` is not one of the supported task types.
    """
    print("Download model and tokenizer", pretrained_model_name)
    # Load the pre-trained model and tokenizer for the requested task type.
    if mode == "sequence_classification":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "question_answering":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, torchscript=torchscript
        )
        model = AutoModelForQuestionAnswering.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "token_classification":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "text_generation":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "translation":
        # New mode created to handle the Masakhane translation models.
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
    else:
        # FIX: an unrecognised mode previously fell through and crashed later
        # with NameError on `model`/`tokenizer`; fail fast with a clear error.
        raise ValueError(f"Unsupported mode: {mode!r}")

    # NOTE: for demonstration purposes, we do not go through fine-tuning here.
    # A fine-tuning process based on your needs can be added; an example of a
    # fine-tuned model has been provided in the README.

    NEW_DIR = Path(__file__).parent.joinpath("transformer_models", pretrained_model_name)
    NEW_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Successfully created directory {NEW_DIR.__str__()} ")

    print(
        "Save model and tokenizer/ Torchscript model based on the setting from setup_config",
        pretrained_model_name,
        "in directory",
        NEW_DIR,
    )
    if save_mode == "pretrained":
        model.save_pretrained(NEW_DIR)
        tokenizer.save_pretrained(NEW_DIR)
    elif save_mode == "torchscript":
        # Trace the model on a dummy padded input and save the TorchScript
        # artifact next to the pretrained files.
        dummy_input = "This is a dummy input for torch jit trace"
        inputs = tokenizer.encode_plus(
            dummy_input,
            max_length=int(max_length),
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        model.to(device).eval()
        traced_model = torch.jit.trace(model, (input_ids, attention_mask))
        torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt"))
    return


if __name__ == "__main__":
    dirname = os.path.dirname(__file__)
    if len(sys.argv) > 1:
        filename = os.path.join(dirname, sys.argv[1])
    else:
        filename = os.path.join(dirname, "setup_config.json")
    # FIX: use a context manager; the original left the settings file open.
    with open(filename) as f:
        settings = json.load(f)
    mode = settings["mode"]
    model_name = settings["model_name"]
    num_labels = int(settings["num_labels"])
    do_lower_case = settings["do_lower_case"]
    max_length = settings["max_length"]
    save_mode = settings["save_mode"]
    torchscript = save_mode == "torchscript"

    transformers_model_downloader(
        mode,
        model_name,
        num_labels,
        do_lower_case,
        max_length,
        torchscript,
        save_mode=save_mode,
    )
-------------------------------------------------------------------------------- /docs/debugging_setup.md: -------------------------------------------------------------------------------- 1 | # Common SetUp errors and Debugging 2 | 3 | ## Table of Contents 4 | - [**Errors during setup**](#errors-during-setup) 5 | - [**Errors with Docker**](#errors-with-docker) 6 | - [**gcsfuse** - Noted on Mac M1 (Dec 2022)](#gcsfuse---noted-on-mac-m1-dec-2022) 7 | - [**failed to solve** - Noted on Mac M1 (Dec 2022)](#failed-to-solve---noted-on-mac-m1-dec-2022) 8 | - [**Errors with stand alone setup**](#errors-with-stand-alone-setup) 9 | - [**PyICU/Polyglot** - Noted on Linux/Ubuntu (Jun 2022)](#pyicupolyglot---noted-on-linuxubuntu-jun-2022) 10 | - [**Checking the client, server/api \& database**](#checking-the-client-serverapi--database) 11 | - [**Check the client**](#check-the-client) 12 | - [**Check the api**](#check-the-api) 13 | - [**Notable API endpoints to test using GET:**](#notable-api-endpoints-to-test-using-get) 14 | - [**Notable API endpoints to test using POST:**](#notable-api-endpoints-to-test-using-post) 15 | - [**Check the database**](#check-the-database) 16 | - [**With Docker**](#with-docker) 17 | - [**With Stand alone backend**](#with-stand-alone-backend) 18 | 19 | 20 | # **Errors during setup** 21 | 22 | ## **Errors with Docker** 23 | ### **gcsfuse** - Noted on Mac M1 (Dec 2022) 24 | Seems to be a architecture issue, resolved by running the command: 25 | ```bash 26 | export DOCKER_DEFAULT_PLATFORM=linux/amd64 27 | ``` 28 | [solution reference](https://github.com/GoogleCloudPlatform/gcsfuse/issues/586) 29 | 30 | ### **failed to solve** - Noted on Mac M1 (Dec 2022) 31 | Full err message: 32 | ``` 33 | failed to solve: rpc error: code = Unknown desc = failed to solve with frontend dockerfile.v0: failed to create LLB definition: failed to authorize: rpc error: code = Unknown desc = failed to fetch anonymous token: Get 
"https://auth.docker.io/token?scope=repository%3Alibrary%2Fnode%3Apull&service=registry.docker.io": dial tcp: lookup auth.docker.io on 192.168.0.1:53: no such host 34 | ``` 35 | 36 | This is a ad-hoc error, possible solutions: 37 | - Sign in to docker hub and docker cli ```docker signin``` 38 | - Within `Docker hub>Settings>Docker Engine`,set `buildkit` to `false` 39 | - Instead of `docker-compose`, try `docker compose` 40 | - Lost all hope? Go make a cup of coffee, sometimes it works if you just give it a minute... 41 | 42 | [solution signin reference](https://stackoverflow.com/questions/65361083/docker-build-failed-to-fetch-oauth-token-for-openjdk) | [solution buildkit reference](https://stackoverflow.com/questions/64221861/an-error-failed-to-solve-with-frontend-dockerfile-v0) 43 | 44 | **Note** Running these commands is not advisable: 45 | ```bash 46 | export DOCKER_BUILDKIT=0 47 | export COMPOSE_DOCKER_CLI_BUILD=0 48 | ``` 49 | This will invalidate the GCSFuse fix for Mac M1. 50 | 51 | ## **Errors with stand alone setup** 52 | 53 | ### **PyICU/Polyglot** - Noted on Linux/Ubuntu (Jun 2022) 54 | 55 | Resolved by running the commands: 56 | ```bash 57 | apt-get update 58 | ``` 59 | 60 | Then either - from apt directly : https://packages.debian.org/source/stable/pyicu: 61 | ```bash 62 | apt-get install python3-icu 63 | ``` 64 | OR - from source: 65 | ```bash 66 | apt-get install pkg-config libicu-dev 67 | pip install --no-binary=:pyicu: pyicu 68 | ``` 69 | 70 | # **Checking the client, server/api & database** 71 | ## **Check the client** 72 | The client should be running on http://localhost:3000. 73 | 74 | Check the terminal (standalone), inspect the webpage or view the docker logs for error output. 75 | ## **Check the api** 76 | The API should be running on http://localhost:5000 and return the following output: 77 | ```json 78 | { 79 | "message": "welcome Masakhane Web" 80 | } 81 | ``` 82 | Check the terminal (standalone) or view the docker logs for error output. 
83 | 84 | ### **Notable API endpoints to test using GET:** 85 | Make get requests by going to the web endpoint in your browser 86 | | Endpoint | Description | 87 | | -------- | ----------- | 88 | | http://localhost:5000/update | Updates the local database with the newly loaded models | 89 | | http://localhost:5000/translate | Lists the saved models | 90 | 91 | 92 | 93 | ### **Notable API endpoints to test using POST:** 94 | Use a developer tool such as [Postman](https://www.postman.com/) to make POST requests 95 | | Endpoint | Description | Example Body | 96 | | ------ | --------- | --------- | 97 | | http://localhost:5000/translate | Returns the translated text |
{
"src_lang": "english",
"tgt_lang": "swahili",
"input": "Hello, how are you?"
}
| 98 | 99 | ## **Check the database** 100 | Docker makes use of a postgreSQL database 101 | The stand alone app uses sqlite, so there is an different method for access. 102 | 103 | ### **With Docker** 104 | The 'db-1' image in docker contains the database using PostgreSQL, you can access the DB system running on the image with the command: 105 | ``` 106 | docker-compose -f docker-compose.yml exec db psql --username=masakhane --dbname=masakhane 107 | ``` 108 | 109 | List all databases: 110 | ``` 111 | \l 112 | ``` 113 | 114 | Connect to the masakhane database: 115 | ``` 116 | \c masakhane 117 | ``` 118 | 119 | List relations 120 | ``` 121 | \dt 122 | ``` 123 | 124 | See saved information in a relation: 125 | ``` 126 | select * from language; 127 | ``` 128 | 129 | Quit the database: 130 | ``` 131 | \q 132 | ``` 133 | 134 | ### **With Stand alone backend** 135 | 136 | Within the `src/server/core/` directory, run this command to start the python interpreter: 137 | ``` 138 | python 139 | ``` 140 | 141 | Use the code below to check what is saved in the database 142 | 143 | ```python 144 | import sqlite3, os 145 | 146 | conn = sqlite3.connect("masakhane.db") 147 | c = conn.cursor() 148 | 149 | for row in c.execute('SELECT * FROM feedback'): 150 | print(row) 151 | 152 | for row in c.execute('SELECT * FROM language'): 153 | print(row) 154 | ``` 155 | -------------------------------------------------------------------------------- /docs/project_details.md: -------------------------------------------------------------------------------- 1 | # **Project Details** 2 | The requirements of Masakhane Web is to faciliate translations for African languages using different machine translation models. There is also an feauture to provide feedback and correction to inaccurate translations. 
3 | 4 | ## **Table of Contents** 5 | - [**Tech Stack**](#tech-stack) 6 | - [**Frontend**](#frontend) 7 | - [**React**](#react) 8 | - [**Webpack**](#webpack) 9 | - [**Backend**](#backend) 10 | - [**Python**](#python) 11 | - [**Database**](#database) 12 | - [**Flask**](#flask) 13 | - [**File Structure**](#file-structure) 14 | 15 | 16 | 17 | # **Tech Stack** 18 | 19 | ## **Frontend** 20 | Review the [client readme](../../src/client/README.md) for more information. 21 | 22 | ### **React** 23 | The frontend is written using [React](https://reactjs.org/). 24 | 25 | ### **Webpack** 26 | The frontend also makes use of [Webpack](https://webpack.js.org/), a static module bundler for modern JavaScript applications. 27 | 28 | - **Webpack DevServer & Proxy** 29 | The [devServer](https://webpack.js.org/configuration/dev-server/) runs on http://translate.masakhane.io:80. 30 | The [proxy](https://webpack.js.org/configuration/dev-server/#devserverproxy) allows you to send requests to http://translate.masakhane.io/translate and have it hit the backend at http://localhost:5000/translate. 31 | 32 | 33 | ## **Backend** 34 | Review the [server readme](../../src/server/README.md) for more information 35 | 36 | ### **Python** 37 | The backend is written using [Python](https://www.python.org/) 38 | 39 | ### **Database** 40 | The backend database is predominantly PostgreSQL on Docker, but there is an option to use SQLite when running a stand-alone backend. 41 | 42 | ### **Flask** 43 | The backend also makes use of [Flask](https://flask.palletsprojects.com/en/2.2.x/), which is for web development in Python. 44 | 45 | - **App** 46 | Masakhane Web makes use of the Flask [application factory](https://flask.palletsprojects.com/en/2.2.x/patterns/appfactories/) pattern in `src/core/__init__.py` 47 | 48 | - **API** 49 | The API uses [flask_restful](https://flask-restful.readthedocs.io/en/latest/quickstart.html#resourceful-routing) and is defined in `src/core/resources/translate.py`. 
50 | It is initialised along with the app in `src/core/__init__.py`. 51 | 52 | - **Database** 53 | The application interacts with the database using [flask_sqlalchemy](https://flask-sqlalchemy.palletsprojects.com/en/3.0.x/) and is defined in `src/core/extensions.py`. 54 | It is initialised along with the app in `src/core/__init__.py`. (Note the `.env.dev` for database config) 55 | 56 | # **File Structure** 57 | 58 | ``` 59 | .masakhane-web 60 | |-- docker-compose.yml # Docker compose for local instance 61 | |-- docker-compose.prod.yml # Docker compose for production instance 62 | |-- entrypoint.sh 63 | |-- environment.yaml 64 | `-- src 65 | |-- client # IDK much about the frontend, update required 66 | | |-- Dockerfile 67 | | |-- package-lock.json 68 | | |-- package.json 69 | | |-- public 70 | | |-- src 71 | | | |-- App.js 72 | | | |-- App.test.js 73 | | | |-- components 74 | | | | |-- translateCard.js 75 | | | | `-- *others* 76 | | | |-- images 77 | | | |-- index.css 78 | | | |-- index.js 79 | | | |-- logo.svg 80 | | | |-- pages 81 | | | | |-- About.js 82 | | | | |-- Faq.js 83 | | | | `-- Home.js 84 | | | |-- reportWebVitals.js 85 | | | |-- setupProxy.js 86 | | | `-- setupTests.js 87 | | `-- webpack.config.js 88 | `-- server 89 | |-- __init__.py 90 | |-- available_models.tsv # TSV file containing available models 91 | |-- languages.json # JSON file containing language information (names, etc) 92 | |-- Dockerfile 93 | |-- entrypoint.sh # Docker entrypoint for Dockerfile 94 | |-- Dockerfile.prod 95 | |-- entrypoint.prod.sh # Docker entrypoint for Dockerfile.prod 96 | |-- requirements.txt # Python dependencies 97 | |-- manage.py # Manage CLI 98 | |-- core 99 | | |-- __init__.py # Flask app factory & init 100 | | |-- resources 101 | | | `-- translate.py # Flask API 102 | | |-- extensions.py # Flask_SQLAlchemy init 103 | | |-- models 104 | | | |-- feedback.py # Feedback DB Model 105 | | | |-- language.py # Language DB Model 106 | | | |-- predict.py # I think this is in 
the wrong place, does the translation 107 | | | `-- translation.py # Translation object 108 | | |-- model_load.py # Class to manage the download and loading of different translation models 109 | | |-- config.py # Different config states for dev enviroments 110 | | |-- languages.json # Duplicate of ../languages.json 111 | | |-- tests 112 | | | |-- __init__.py 113 | | | |-- base.py # Test create app 114 | | | |-- test_app.py # Test API 115 | | | `-- test_config.py # Dev tests 116 | | |-- utils.py 117 | | `-- utils_bucket 118 | | |-- bucket.py 119 | | `-- upload_download.py 120 | |-- models # Translation models are stored here 121 | | `-- joeynmt 122 | | |-- en-sw-JW300 # File struct of a complete model for English to Swahili 123 | | | |-- config.yaml 124 | | | |-- config_orig.yaml 125 | | | |-- model.ckpt 126 | | | |-- src.bpe.model 127 | | | |-- src_vocab.txt 128 | | | |-- trg.bpe.model 129 | | | `-- trg_vocab.txt 130 | `-- nginx 131 | |-- Dockerfile 132 | `-- nginx.conf 133 | ``` -------------------------------------------------------------------------------- /docs/start_app_locally_doc.md: -------------------------------------------------------------------------------- 1 | # **Running the App Locally** 2 | 3 | The app can be run as a standalone or using Docker, unless you are working on an machine running linux/ubuntu, it is adviseable to use Docker. 4 | 5 | To run the app in production, see [here](start_app_prod_doc.md). 6 | 7 | For any errors during setup, please see the [debugging doc](debugging_setup.md). 8 | 9 | Review the [project details doc](project_details.md) for more information on the technology stack. 10 | Take note of the [Client](../../src/client/README.md) and [Server](../../src/server/README.md) README's. 
11 | 12 | ## **Table of Contents** 13 | - [**Using Docker ( Preferred )**](#using-docker--preferred-) 14 | - [**Docker Setup**](#docker-setup) 15 | - [**Running the app**](#running-the-app) 16 | - [**Building the App**](#building-the-app) 17 | - [**Shut down the app**](#shut-down-the-app) 18 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages) 19 | - [**Running tests**](#running-tests) 20 | - [**The Database**](#the-database) 21 | - [**As a stand-alone app**](#as-a-stand-alone-app) 22 | - [**Backend Setup**](#backend-setup) 23 | - [**Run the server:**](#run-the-server) 24 | - [**The Database**](#the-database-1) 25 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages-1) 26 | - [**Running tests**](#running-tests-1) 27 | - [**Frontend Setup**](#frontend-setup) 28 | - [**Run the client:**](#run-the-client) 29 | - [**Errors during setup**](#errors-during-setup) 30 | 31 | 32 | # **Using Docker ( Preferred )** 33 | 34 | The better/easier way to run the app is to use Docker, which will build both the frontend and the backend with the correct enviroment setup. 35 | 36 | ## **Docker Setup** 37 | 38 | Ensure you have `docker` & `docker-compose` installed on your computer, you can check with the following commands: 39 | ```bash 40 | docker --version 41 | docker-compose --version 42 | ``` 43 | 44 | If the above commands return an error, please install [Docker](https://docs.docker.com/engine/install/) and [Docker-compose](https://docs.docker.com/compose/install/). 45 | 46 | ## **Running the app** 47 | ### **Building the App** 48 | To build the app, from the root project directory, run the following command: 49 | ```bash 50 | docker-compose -f docker-compose.yml up -d --build 51 | ``` 52 | 53 | Docker should create a container named 'masakhane-web' with the images 'db-1', 'server-1', and 'client-1'. 
54 | The server should be active on http://localhost:5000 and the client on http://localhost:3000 55 | Look [here](debugging_setup.md#checking-the-client-serverapi--database) for checking these services manually. 56 | 57 | ### **Shut down the app** 58 | To shut down the app, run the following command to remove the docker container: 59 | ```bash 60 | docker-compose -f docker-compose.yml down 61 | ``` 62 | 63 | ### **Add, Update, & Delete Languages** 64 | **Add a Language** 65 | ```bash 66 | docker-compose -f docker-compose.yml exec server python manage.py add_language en-sw-JW300 67 | ``` 68 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform} 69 | So `en-sw-JW300` represents English-Swahili using JW300 shortform 70 | **Note** - A code parameter example without shortform is `en-tiv-` 71 | 72 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv) 73 | 74 | **Update Langugaes** 75 | ```bash 76 | curl --request GET 'http://127.0.0.1:5000/update' 77 | ``` 78 | 79 | **Check available languages** 80 | ```bash 81 | docker-compose -f docker-compose.yml exec server python manage.py all_languages 82 | ``` 83 | 84 | **Remove a language** 85 | ```bash 86 | docker-compose -f docker-compose.yml exec server python manage.py remove_language en-sw-JW300 87 | ``` 88 | 89 | ### **Running tests** 90 | ```bash 91 | docker-compose -f docker-compose.yml exec server python manage.py tests 92 | ``` 93 | 94 | ### **The Database** 95 | Look [here](debugging_setup.md#with-docker) for more information about accessing the database 96 | 97 | # **As a stand-alone app** 98 | In order to run the app, we need to set up the backend and frontend seperately. 99 | **Note** It is advisable to be working on an linux/ubuntu machine. 
100 | 101 | ## **Backend Setup** 102 | 103 | First, ensure you are running [Python 3.6.9](https://www.python.org/downloads/release/python-369/) 104 | 105 | Within the `src/server` directory of the project 106 | 107 | **Install required packages:** 108 | ```bash 109 | pip install -r requirements.txt 110 | ``` 111 | 112 | **Run the following commands:** 113 | ```bash 114 | export FLASK_APP=core/__init__.py 115 | export FLASK_ENV=development 116 | ``` 117 | 118 | ## **Run the server:** 119 | To start the API and database services, run the command: 120 | ```bash 121 | python manage.py run 122 | ``` 123 | 124 | ### **The Database** 125 | Look [here](debugging_setup.md#with-stand-alone-backend) for more information about accessing the database 126 | 127 | ### **Add, Update, & Delete Languages** 128 | **Add a Language** 129 | ```bash 130 | python manage.py add_language en-sw-JW300 131 | ``` 132 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform} 133 | So `en-sw-JW300` represents English-Swahili using JW300 shortform 134 | **Note** - A code parameter example without shortform is `en-tiv-` 135 | 136 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv) 137 | 138 | **Update Langugaes** 139 | ```bash 140 | curl --request GET 'http://127.0.0.1:5000/update' 141 | ``` 142 | **Check available languages** 143 | ```bash 144 | python manage.py all_languages 145 | ``` 146 | 147 | **Remove a language** 148 | ```bash 149 | python manage.py remove_language en-sw-JW300 150 | ``` 151 | 152 | ### **Running tests** 153 | ```bash 154 | python manage.py tests 155 | ``` 156 | 157 | The API is available at `http://localhost:5000`, see notable API endpoints [here](debugging_setup.md#check-the-api) 158 | 159 | ## **Frontend Setup** 160 | 161 | Ensure you have [node.js](https://nodejs.org/en/) and [yarn](https://classic.yarnpkg.com/en/docs/install) installed 162 | 163 | Within the `src/client/` 
directory of the project: 164 | **Install required packages:** 165 | ```bash 166 | npm install --legacy-peer-deps 167 | ``` 168 | 169 | **Run the following commands:** 170 | ```bash 171 | npm i webpack webpack-cli --legacy-peer-deps 172 | npm i @babel/core @babel/preset-env @babel/preset-react babel-loader --legacy-peer-deps 173 | ``` 174 | 175 | ## **Run the client:** 176 | To start the client , run the command: 177 | ```bash 178 | npm run develop 179 | ``` 180 | 181 | The client is available at `http://localhost:3000` 182 | 183 | # **Errors during setup** 184 | If there was a problem during setup, review [this doc](debugging_setup.md) for possible errors and solutions. 185 | 186 | -------------------------------------------------------------------------------- /src/client/src/components/step2.js: -------------------------------------------------------------------------------- 1 | import { Row, Col, Form, Button } from 'react-bootstrap'; 2 | import RadioButton from './common/radioButton'; 3 | import React from 'react'; 4 | 5 | const Step2 = ({ src_lang, tgt_lang, text, translation, setForm, formData, navigation, handleSubmitFeedback }) => { 6 | 7 | const { understand_translation, accurate_translation, own_translation } = formData; 8 | const { next } = navigation; 9 | 10 | const handleSubmit = () => { 11 | // submit form 12 | handleSubmitFeedback(); 13 | // then navigate to next page 14 | next(); 15 | } 16 | return ( 17 |
18 |
19 |
Part 2/2
20 |
21 | 22 |
23 | 24 | 25 |

{!!src_lang && src_lang.toUpperCase()}

26 |

{text}

27 | 28 | 29 | 30 |

{!!tgt_lang && tgt_lang.toUpperCase()}

31 |

{!!translation && translation}

32 | 33 |
34 |
35 | 36 |
37 |
38 |
39 | 40 |
41 |

Did you understand the translation? / Did it make sense?

42 |
43 | 44 | 45 | 52 | 53 | 54 | 61 | 62 | 63 | 70 | 71 | 72 | 79 | 80 | 81 |
82 |
83 | 84 |
85 |
86 |
87 | 88 |
89 |

How accurate was the translation?

90 |
91 | 92 | 93 | 100 | 101 | 102 | 109 | 110 | 111 | 118 | 119 | 120 | 127 | 128 | 129 |
130 |
131 | 132 |
133 |
134 |
135 | 136 |
137 |

How would you have translated this? (Optional)

138 |
139 |
140 | 141 | 150 | 151 |
152 |
153 |
154 | 155 |
156 |
157 |
158 | 159 |
160 | 161 |
162 |
#External modules
from flask_restful import Resource
from flask import request
from http import HTTPStatus
from collections import defaultdict
import os, json
#Internal modules
from core.model_load import MasakhaneModelLoader
from core.models.predict import Predicter
from core.models.feedback import Feedback
from core.models.language import Language
from core.models.translation import Translation

from pathlib import Path


class TranslateResource(Resource):
    """ TranslateResource
    -----------------
    #### User-Defined Flask API Resource accepting GET & POST\n
    GET - Lists available models\\
    POST - Performs translation from src lang to tgt lang; review the server ReadMe for more info.
    """
    def __init__(self, saved_models):
        # saved_models maps "<src>-<tgt>" short-code pairs to loaded model bundles.
        self.models = saved_models

        # languages.json binds short codes ('sw') to full English names
        # ('swahili'); the path is overridable via the JSON env var.
        json_file = os.environ.get('JSON', './languages.json')
        with open(json_file, 'r') as f:
            distros_dict = json.load(f)

        # Bidirectional lookup tables, keys/values all lowercase.
        # Example: languages_short_to_full['sw'] == 'swahili'
        # Example: languages_full_to_short['swahili'] == 'sw'
        self.languages_short_to_full = {}
        self.languages_full_to_short = {}
        for distro in distros_dict:
            short_name = distro['language_short'].lower()
            full_name = distro['language_en'].lower()
            self.languages_short_to_full[short_name] = full_name
            self.languages_full_to_short[full_name] = short_name

    def post(self):
        """POST method to translate a given input
        ---

        ### Request Body
        ```json
        {
            "src_lang" : "src_lang_full",
            "tgt_lang" : "tgt_lang_full",
            "input": "input_text",
        }
        ```
        ### Returns a Translation Object defined in `src/server/core/models/translation.py`
        ```json
        {
            "src_lang" : "src_lang_full",
            "tgt_lang" : "tgt_lang_full",
            "input": "input_text",
            "output": "translation_result"
        }
        ```
        Returns 404 with `{"message": "model not found"}` when the language
        pair is unknown or its model is not loaded.
        """
        # Get req body
        data = request.get_json()
        source_language = data['src_lang'].lower()
        target_language = data['tgt_lang'].lower()

        # FIX: unknown language names previously raised KeyError (HTTP 500);
        # report them as a clean 404 instead.
        source_language_short = self.languages_full_to_short.get(source_language)
        target_language_short = self.languages_full_to_short.get(target_language)
        if source_language_short is None or target_language_short is None:
            return {'message': 'model not found'}, HTTPStatus.NOT_FOUND

        # Model key providing the translation, e.g. "en-sw".
        input_model = source_language_short + '-' + target_language_short
        if input_model not in self.models:
            return {'message': 'model not found'}, HTTPStatus.NOT_FOUND

        # Hoist the bundle lookup instead of re-indexing self.models per kwarg.
        bundle = self.models[input_model]
        translation_result = Predicter().translate(
            data['input'],
            model=bundle['model'],
            src_vocab=bundle['src_vocab'],
            trg_vocab=bundle['trg_vocab'],
            preprocess=bundle['preprocess'],
            postprocess=bundle['postprocess'],
            logger=bundle['logger'],
            beam_size=bundle['beam_size'],
            beam_alpha=bundle['beam_alpha'],
            level=bundle['level'],
            lowercase=bundle['lowercase'],
            max_output_length=bundle['max_output_length'],
            use_cuda=bundle['use_cuda'],
        )

        trans = Translation(src_lang=data['src_lang'],
                            tgt_lang=data['tgt_lang'],
                            input=data['input'],
                            output=translation_result)

        return trans.data, HTTPStatus.CREATED

    def get(self):
        """GET Method to list available models in memory
        ---

        Returns a json list, ie
        ```json
        [
            {
                "type": "source",
                "name": "src_lang_full",
                "value": "src_lang_short",
                "targets": [
                    {
                        "name": "tgt_lang_full",
                        "value": "tgt_lang_short"
                    }
                ]
            }
        ]
        ```
        """
        # Group loaded "<src>-<tgt>" pairs by their source language.
        dict_output = defaultdict(list)
        for couple in list(self.models.keys()):
            src, tgt = couple.split("-")
            dict_output[src].append(
                {
                    'name': self.languages_short_to_full[tgt].capitalize(),
                    'value': tgt
                }
            )

        output = []
        for source in dict_output:
            output.append(
                {
                    "type": "source",
                    "name": self.languages_short_to_full[source].capitalize(),
                    "value": source,
                    'targets': dict_output[source]
                }
            )

        return output, HTTPStatus.OK
stored in the Language table 154 | """ 155 | def __init__(self, saved_models): 156 | self.models = saved_models 157 | # Load file path to avialable_models.tsv which has all the github & google drive links that store the model files 158 | self.selected_models_file = os.environ.get('MODEL_ALL_FILE', 159 | "./available_models.tsv") 160 | 161 | def get(self): 162 | """GET Method to update the available models 163 | --- 164 | Returns a json Object, ie 165 | ```json 166 | { 167 | "message": "Models updated" 168 | } 169 | ``` 170 | """ 171 | model_loader = MasakhaneModelLoader(available_models_file=os.environ.get('MODEL_ALL_FILE', 172 | './available_models.tsv')) 173 | db_pairs = [] 174 | model_directory = Path.cwd().joinpath('models', 'joeynmt') 175 | downloaded_models = list(model_directory.iterdir()) 176 | #loads model info from the Language table 177 | for lan in Language.query.all(): 178 | language_pair = lan.to_json() 179 | src_language =language_pair['source'] 180 | tgt_language = language_pair['target'] 181 | domain = language_pair['domain'] 182 | db_pair = f"{language_pair['source']}-{language_pair['target']}" 183 | # check if the model is not already loaded in memory 184 | if db_pair not in list(self.models.keys()): 185 | name_tag = src_language+"-"+tgt_language+"-"+domain 186 | # check if the model is not already downloaded 187 | if name_tag not in downloaded_models: 188 | print("Downloading model for "+name_tag) 189 | model_loader.download_model(src_language, tgt_language, domain) 190 | # Attempts to download model and store in self.models 191 | self.models[db_pair] = model_loader.load_model(src_language, tgt_language, domain) 192 | print(f"db_pair : {db_pair} \n now : {list(self.models.keys())}") 193 | 194 | # keep all the pairs in the db 195 | db_pairs.append(db_pair) 196 | 197 | # Remove models from memory that are not listed in the DB Language table 198 | for pair in list(self.models.keys()): 199 | if pair not in db_pairs: 200 | del self.models[pair] 201 | 
202 | return {'message': "Models updated"}, HTTPStatus.OK 203 | 204 | 205 | class SaveResource(Resource): 206 | """ SaveResource 207 | ------------ 208 | #### User-Defined Flask API Resource accepting POST\n 209 | POST - saves feedback/correction information into the Feedback database 210 | """ 211 | def __init__(self): 212 | super().__init__() 213 | 214 | def post(self): 215 | """POST Method to save feeback into the DB Feedback table 216 | --- 217 | ### Request Body 218 | ```json 219 | { 220 | "src_lang" : "src_lang_full", 221 | "tgt_lang" : "tgt_lang_full", 222 | "input": "input_text", 223 | "review": "translation_correction", 224 | "stars": "translation_confidence", 225 | "token": "user_auth(bool)", 226 | } 227 | ``` 228 | ### Returns a Translation Object defined in `src/server/core/models/translation.py 229 | ```json 230 | { 231 | "message": "Review saved" 232 | } 233 | """ 234 | 235 | data = request.get_json() 236 | 237 | feedback = Feedback( 238 | src_lang=data['src_lang'], 239 | tgt_lang=data['tgt_lang'], 240 | accurate_translation=data['accurate_translation'], 241 | know_src_lang=data['know_src_lang'], 242 | know_tgt_lang=data['know_tgt_lang'], 243 | own_translation=data['own_translation'], 244 | text=data['text'], 245 | translation=data['translation'], 246 | understand_translation=data['understand_translation'], 247 | feedbackToken=data['feedbackToken']) 248 | 249 | feedback.save() 250 | 251 | return {'message': "Review saved"}, HTTPStatus.CREATED 252 | 253 | 254 | class HomeResource(Resource): 255 | """ HomeResource 256 | ------------ 257 | User-Defined Flask API Resource accepting GET\n 258 | GET - returns {'message': "welcome Masakhane Web"} 259 | """ 260 | def __init__(self): 261 | super().__init__() 262 | 263 | def get(self): 264 | return {'message': "welcome Masakhane Web"}, HTTPStatus.OK 265 | -------------------------------------------------------------------------------- /src/torchserve/transformer_handler.py: 
import ast
import json
import logging
import os
from abc import ABC

import torch
import transformers
from captum.attr import LayerIntegratedGradients
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)

# Fallback tokenization length when the tokenizer does not define one.
DEFAULT_MAX_LENGTH = 1024


class M2MTranslatorHandler(BaseHandler, ABC):
    """
    Transformer handler for machine translation task using the m2m_100 model.
    """

    def __init__(self):
        super(M2MTranslatorHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """Load the seq2seq model and tokenizer from the model archive.

        Args:
            ctx (context): JSON Object containing information pertaining to
                the model artefact parameters.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        # not used directly — from_pretrained loads from model_dir — but kept
        # to document which serialized file the manifest points at
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        # BUG FIX: preprocess() previously referenced an undefined bare name
        # `max_length`, raising NameError on the first request.  Derive it from
        # the tokenizer, with a sane fallback.
        self.max_length = getattr(self.tokenizer, "model_max_length", None) \
            or DEFAULT_MAX_LENGTH

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
        self.model.eval()

        logger.info("Transformer model from path %s loaded successfully", model_dir)

        self.initialized = True

    def preprocess(self, requests):
        """Tokenize the incoming requests into one batched tensor pair.

        Args:
            requests (list): list of request dicts; the text is under "data"
                or "body", possibly as raw bytes.
        Returns:
            tuple: (input_ids_batch, attention_mask_batch) tensors on
            self.device, one row per request.
        """
        input_ids_batch = None
        attention_mask_batch = None
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode("utf-8")
            logger.info("Received text: '%s'", input_text)

            inputs = self.tokenizer.encode_plus(
                input_text,
                max_length=int(self.max_length),
                # `pad_to_max_length` is deprecated in modern transformers;
                # padding + explicit truncation is the supported spelling.
                padding="max_length",
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            # making a batch out of the received requests
            # attention masks are passed for cases where input tokens are padded.
            if input_ids.shape is not None:
                if input_ids_batch is None:
                    input_ids_batch = input_ids
                    attention_mask_batch = attention_mask
                else:
                    input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
                    attention_mask_batch = torch.cat(
                        (attention_mask_batch, attention_mask), 0
                    )
        return (input_ids_batch, attention_mask_batch)

    def inference(self, input_batch):
        """Generate translations for a batch of tokenized inputs.

        Args:
            input_batch (tuple): (input_ids_batch, attention_mask_batch)
                produced by preprocess().
        Returns:
            list: one decoded output string per input row.
        """
        input_ids_batch, attention_mask_batch = input_batch
        inferences = []
        # TODO: move these generation parameters into a generation configuration.
        # BUG FIX: the attention mask was built in preprocess() but never
        # passed to generate(), so padded positions were attended to.
        outputs = self.model.generate(
            input_ids_batch,
            attention_mask=attention_mask_batch,
            max_length=50,
            do_sample=True,
            top_p=0.95,
            top_k=60,
        )
        for output in outputs:
            inferences.append(
                self.tokenizer.decode(output, skip_special_tokens=True)
            )

        # BUG FIX: the original `logger.info("Generated text", inferences)`
        # passed an argument with no %s placeholder, which triggers a logging
        # formatting error.  Log once, lazily.
        logger.info("Generated text: '%s'", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Pass the decoded strings through unchanged (already JSON-serializable).

        Args:
            inference_output (list): predicted responses from inference().
        Returns:
            list: the same list, in TorchServe-readable format.
        """
        return inference_output

    def get_insights(self, input_batch, text, target):
        """Initialize and call Layer Integrated Gradients for word importances.

        NOTE(review): `self.setup_config` is never assigned anywhere in this
        handler (initialize() does not set it), so calling this method would
        raise AttributeError — confirm whether a setup_config.json loader was
        meant to be ported from the stock HuggingFace handler.

        Args:
            input_batch (int): batches of token IDs of text.
            text (str): the text specified in the input request.
            target (int): target label, at the user's discretion.
        Returns:
            (list): a list with one dict containing the word tokens.
        """
        if self.setup_config["captum_explanation"]:
            embedding_layer = getattr(self.model, self.setup_config["embedding_name"])
            embeddings = embedding_layer.embeddings
            self.lig = LayerIntegratedGradients(captum_sequence_forward, embeddings)
        else:
            logger.warning("Captum Explanation is not chosen and will not be available")

        if isinstance(text, (bytes, bytearray)):
            text = text.decode("utf-8")
        text_target = ast.literal_eval(text)

        if not self.setup_config["mode"] == "question_answering":
            text = text_target["text"]
        self.target = text_target["target"]

        input_ids, ref_input_ids, attention_mask = construct_input_ref(
            text, self.tokenizer, self.device, self.setup_config["mode"]
        )
        all_tokens = get_word_token(input_ids, self.tokenizer)
        response = {}
        response["words"] = all_tokens
        return [response]


def construct_input_ref(text, tokenizer, device, mode):
    """For a given text, create token ids, reference ids and an attention
    mask based on encode, which is faster for captum insights.

    Args:
        text (str): the text specified in the input request.
        tokenizer (AutoTokenizer): word-tokenizes the input text.
        device (cpu or gpu): environment the server runs on.
        mode: unused here; kept for call-site compatibility.
    Returns:
        input_ids (Tensor): tensor of the tokenized input.
        ref_input_ids (Tensor): baseline ids for the attributions
            (pad tokens between cls/sep).
        attention_mask (Tensor): binary tensor marking non-padded positions.
    """
    # NOTE(review): cls_token_id/sep_token_id can be None for some seq2seq
    # tokenizers (e.g. M2M100) — confirm against the deployed tokenizer.
    text_ids = tokenizer.encode(text, add_special_tokens=False)
    # construct input token ids
    logger.info("text_ids %s", text_ids)
    logger.info("[tokenizer.cls_token_id] %s", [tokenizer.cls_token_id])
    input_ids = [tokenizer.cls_token_id] + text_ids + [tokenizer.sep_token_id]
    logger.info("input_ids %s", input_ids)

    input_ids = torch.tensor([input_ids], device=device)
    # construct reference token ids: pad everywhere the real text was
    ref_input_ids = (
        [tokenizer.cls_token_id]
        + [tokenizer.pad_token_id] * len(text_ids)
        + [tokenizer.sep_token_id]
    )
    ref_input_ids = torch.tensor([ref_input_ids], device=device)
    # construct attention mask (all ones: nothing here is padding)
    attention_mask = torch.ones_like(input_ids)
    return input_ids, ref_input_ids, attention_mask


def captum_sequence_forward(inputs, attention_mask=None, position=0, model=None):
    """Forward function used by captum to get model predictions.

    Args:
        inputs (list): input for predictions.
        attention_mask (list, optional): binary mask for padded positions.
        position (int, optional): which element of the model output to return.
        model ([type], optional): the model to run; defaults to None.
    Returns:
        list: prediction outcome at the requested position.
    """
    model.eval()
    model.zero_grad()
    pred = model(inputs, attention_mask=attention_mask)
    pred = pred[position]
    return pred
def summarize_attributions(attributions):
    """Collapse per-embedding attributions and L2-normalise the result.

    Args:
        attributions: attribution tensor from Layer Integrated Gradients,
            shape (1, tokens, embedding_dim).
    Returns:
        1-D tensor of normalised per-token attributions.
    """
    summed = attributions.sum(dim=-1).squeeze(0)
    return summed / torch.norm(summed)


def get_word_token(input_ids, tokenizer):
    """Turn the first row of token ids back into cleaned word tokens.

    Args:
        input_ids (Tensor): batch of token ids; only row 0 is used.
        tokenizer: pre-trained AutoTokenizer object.
    Returns:
        list[str]: word tokens with the BPE space marker removed.
    """
    id_list = input_ids[0].detach().tolist()
    # Strip the unicode space character introduced by the BPE tokenizer.
    return [tok.replace("Ġ", "") for tok in tokenizer.convert_ids_to_tokens(id_list)]
self.load_available_models(available_models_file) 29 | 30 | def load_available_models(self, available_models_file): 31 | """Load a dictonary with available models to download""" 32 | models = {} 33 | with open(available_models_file, 'r') as ofile: 34 | # iterate over file entries 35 | for i, line in enumerate(ofile): 36 | entries = line.strip().split("\t") 37 | # extract headers 38 | if i == 0: 39 | header_keys = [h.__str__() for h in entries] 40 | continue 41 | 42 | # build available model dictionary from the headers & entries: 43 | # https://www.geeksforgeeks.org/python-dictionary-comprehension/ 44 | model = {key:value for key,value in zip(header_keys, entries)} 45 | # don't add incomplete models 46 | if model['complete'] != 'yes': 47 | continue 48 | 49 | models[f"{model['src_language']}-{model['tgt_language']}-{model['domain']}"] = model 50 | 51 | print('Found {} Masakhane models.'.format(len(models))) 52 | 53 | return models 54 | 55 | def download_model(self, src_language, tgt_language, domain): 56 | """ Download model for given trg language. 
""" 57 | model_dir = f"{self._model_dir_prefix}{src_language}-{tgt_language}-{domain}" 58 | 59 | if not os.path.exists(model_dir): 60 | os.system(f'mkdir -p {model_dir}') 61 | 62 | model_files = self.models[f"{src_language}-{tgt_language}-{domain}"] 63 | 64 | # Check if files exist 65 | ckpt_path = os.path.join(model_dir, 'model.ckpt') 66 | src_vocab_path = os.path.join(model_dir, 'src_vocab.txt') 67 | trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt') 68 | config_path = os.path.join(model_dir, 'config_orig.yaml') 69 | src_bpe_path = os.path.join(model_dir, 'src.bpe.model') 70 | trg_bpe_path = os.path.join(model_dir, 'trg.bpe.model') 71 | 72 | if not os.path.exists in [ckpt_path, src_vocab_path, trg_vocab_path, config_path, src_bpe_path, trg_bpe_path]: 73 | URL = "https://zenodo.org/record/7636723/files/" + \ 74 | src_language + "-" + tgt_language 75 | if domain == "": 76 | URL += "-baseline.zip?download=1" 77 | else: 78 | URL += "-" + domain + "-baseline.zip?download=1" 79 | 80 | http_response = urlopen(URL) 81 | zipfile = ZipFile(BytesIO(http_response.read())) 82 | zipfile.extractall(path=model_dir) 83 | 84 | # Rename config file to config_orig.yaml. 85 | os.rename(os.path.join(model_dir, 'config.yaml'), config_path) 86 | 87 | # Adjust config. 88 | config = load_config(config_path) 89 | new_config_file = os.path.join(model_dir, 'config.yaml') 90 | config = self._update_config(config, src_vocab_path, trg_vocab_path, 91 | model_dir, ckpt_path) 92 | with open(new_config_file, 'w') as cfile: 93 | yaml.dump(config, cfile) 94 | 95 | print('Downloaded model for {}-{}.'.format(src_language, tgt_language)) 96 | 97 | def load_model(self, src_language, tgt_language, domain, bpe_src_code=None, tokenize=None): 98 | """ Load model for given trg language. 
""" 99 | model_dir = f"{self._model_dir_prefix}{src_language}-{tgt_language}-{domain}" 100 | 101 | ckpt_path = os.path.join(model_dir, 'model.ckpt') 102 | src_vocab_path = os.path.join(model_dir, 'src_vocab.txt') 103 | trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt') 104 | config_path = os.path.join(model_dir, 'config_orig.yaml') 105 | 106 | # Adjust config. 107 | config = load_config(config_path) 108 | new_config_file = os.path.join(model_dir, 'config.yaml') 109 | config = self._update_config(config, src_vocab_path, trg_vocab_path, 110 | model_dir, ckpt_path) 111 | with open(new_config_file, 'w') as cfile: 112 | yaml.dump(config, cfile) 113 | 114 | print('Loaded model for {}-{}.'.format(src_language, tgt_language)) 115 | 116 | conf = {} 117 | 118 | logger = logging.getLogger(__name__) 119 | conf["logger"] = logger 120 | # load the Joey configuration 121 | cfg = load_config(new_config_file) 122 | # load the checkpoint 123 | if "load_model" in cfg['training'].keys(): 124 | ckpt = cfg['training']["load_model"] 125 | else: 126 | ckpt = get_latest_checkpoint(model_dir) 127 | if ckpt is None: 128 | raise FileNotFoundError("No checkpoint found in directory {}." 
129 | .format(model_dir)) 130 | 131 | # prediction parameters from config 132 | conf["use_cuda"] = cfg["training"].get( 133 | "use_cuda", False) if torch.cuda.is_available() else False 134 | 135 | conf["level"] = cfg["data"]["level"] 136 | conf["max_output_length"] = cfg["training"].get( 137 | "max_output_length", None) 138 | conf["lowercase"] = cfg["data"].get("lowercase", False) 139 | 140 | # load the vocabularies 141 | src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt" 142 | trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt" 143 | 144 | conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file, 145 | dataset=None, max_size=-1, min_freq=0) 146 | conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file, 147 | dataset=None, max_size=-1, min_freq=0) 148 | 149 | # whether to use beam search for decoding, 0: greedy decoding 150 | if "testing" in cfg.keys(): 151 | conf["beam_size"] = cfg["testing"].get("beam_size", 0) 152 | conf["beam_alpha"] = cfg["testing"].get("alpha", -1) 153 | else: 154 | conf["beam_size"] = 1 155 | conf["beam_alpha"] = -1 156 | 157 | # pre-processing 158 | if tokenize is not None: 159 | src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"]) 160 | trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"]) 161 | # tokenize input 162 | def tokenizer(x): return src_tokenizer.tokenize(x, return_str=True) 163 | def detokenizer(x): return trg_tokenizer.detokenize( 164 | x.split(), return_str=True) 165 | else: 166 | def tokenizer(x): return x 167 | def detokenizer(x): return x 168 | 169 | if bpe_src_code is not None and level == "bpe": 170 | # load bpe merge file 171 | merge_file = open(bpe_src_code, "r") 172 | bpe = apply_bpe.BPE(codes=merge_file) 173 | def segmenter(x): return bpe.process_line(x.strip()) 174 | elif conf["level"] == "char": 175 | # split to chars 176 | def segmenter(x): return list(x.strip()) 177 | else: 178 | def segmenter(x): return x.strip() 179 | 180 | conf["preprocess"] = 
[tokenizer, segmenter] 181 | conf["postprocess"] = [detokenizer] 182 | # build model and load parameters into it 183 | model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"]) 184 | model = build_model( 185 | cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"]) 186 | model.load_state_dict(model_checkpoint["model_state"]) 187 | if conf["use_cuda"]: 188 | model.cuda() 189 | conf["model"] = model 190 | print("Joey NMT model loaded successfully.") 191 | 192 | return conf 193 | 194 | def _update_config(self, config, new_src_vocab_path, new_trg_vocab_path, 195 | new_model_dir, new_ckpt_path): 196 | """Overwrite the settings in the given config.""" 197 | config['data']['src_vocab'] = new_src_vocab_path 198 | if config['model'].get('tied_embeddings', False): 199 | config['data']['trg_vocab'] = new_src_vocab_path 200 | else: 201 | config['data']['trg_vocab'] = new_trg_vocab_path 202 | config['training']['model_dir'] = new_model_dir 203 | config['training']['load_model'] = new_ckpt_path 204 | return config 205 | 206 | def _is_lowercase(self, src_vocab_path): 207 | # Infer whether the model is built on lowercased data. 208 | lowercase = True 209 | with open(src_vocab_path, 'r') as ofile: 210 | for line in ofile: 211 | if line != line.lower(): 212 | lowercase = False 213 | break 214 | return lowercase 215 | 216 | # Doesn't look like these functions are ever called... 
217 | 218 | def _download_gdrive_file(self, file_id, destination): 219 | """Download a file from Google Drive and store in local file.""" 220 | download_link = 'https://drive.google.com/uc?id={}'.format(file_id) 221 | os.system(f'gdown -q -O {destination} {download_link}') 222 | 223 | def _download_github_file(self, github_raw_path, destination): 224 | """Download a file from GitHub.""" 225 | os.system(f'wget -q -O {destination} {github_raw_path}') 226 | 227 | def _download(self, url, destination): 228 | """Download file from Github or Googledrive.""" 229 | try: 230 | if 'drive.google.com' in url: 231 | if url.startswith('https://drive.google.com/file'): 232 | file_id = url.split("/")[-1] 233 | elif url.startswith('https://drive.google.com/open?'): 234 | file_id = url.split('id=')[-1] 235 | self._download_gdrive_file(file_id, destination) 236 | else: 237 | self._download_github_file(url, destination) 238 | except: 239 | print("Download failed, didn't recognize url {}.".format(url)) 240 | 241 | -------------------------------------------------------------------------------- /src/server/core/models/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ipdb 3 | import logging 4 | import re 5 | 6 | import pandas as pd 7 | from subword_nmt import apply_bpe 8 | from polyglot.text import Text 9 | from flask import current_app 10 | from subword_nmt import apply_bpe 11 | from sacremoses import MosesTokenizer, MosesDetokenizer 12 | from core.utils import load_line_as_data 13 | from joeynmt.helpers import load_config, get_latest_checkpoint, \ 14 | load_checkpoint 15 | from joeynmt.vocabulary import build_vocab 16 | from joeynmt.model import build_model 17 | from joeynmt.prediction import validate_on_data 18 | 19 | 20 | 21 | 22 | def load_model(model_dir, bpe_src_code=None, tokenize=None): 23 | """ 24 | Start the bot. This means loading the model according to the config file. 
25 | 26 | :param model_dir: Model directory of trained Joey NMT model. 27 | :param bpe_src_code: BPE codes for source side processing (optional). 28 | :param tokenize: If True, tokenize inputs with Moses tokenizer. 29 | :return: 30 | """ 31 | conf = {} 32 | cfg_file = model_dir+"/config.yaml" 33 | 34 | logger = logging.getLogger(__name__) 35 | conf["logger"] = logger 36 | # load the Joey configuration 37 | cfg = load_config(cfg_file) 38 | 39 | # load the checkpoint 40 | if "load_model" in cfg['training'].keys(): 41 | ckpt = cfg['training']["load_model"] 42 | else: 43 | ckpt = get_latest_checkpoint(model_dir) 44 | if ckpt is None: 45 | raise FileNotFoundError("No checkpoint found in directory {}." 46 | .format(model_dir)) 47 | 48 | # prediction parameters from config 49 | conf["use_cuda"] = cfg["training"].get("use_cuda", False) 50 | conf["level"] = cfg["data"]["level"] 51 | conf["max_output_length"] = cfg["training"].get("max_output_length", None) 52 | conf["lowercase"] = cfg["data"].get("lowercase", False) 53 | 54 | # load the vocabularies 55 | src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt" 56 | trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt" 57 | 58 | conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file, 59 | dataset=None, max_size=-1, min_freq=0) 60 | conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file, 61 | dataset=None, max_size=-1, min_freq=0) 62 | 63 | # whether to use beam search for decoding, 0: greedy decoding 64 | if "testing" in cfg.keys(): 65 | conf["beam_size"] = cfg["testing"].get("beam_size", 0) 66 | conf["beam_alpha"] = cfg["testing"].get("alpha", -1) 67 | else: 68 | conf["beam_size"] = 1 69 | conf["beam_alpha"] = -1 70 | 71 | # pre-processing 72 | if tokenize is not None: 73 | src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"]) 74 | trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"]) 75 | # tokenize input 76 | tokenizer = lambda x: src_tokenizer.tokenize(x, 
return_str=True) 77 | detokenizer = lambda x: trg_tokenizer.detokenize( 78 | x.split(), return_str=True) 79 | else: 80 | tokenizer = lambda x: x 81 | detokenizer = lambda x: x 82 | 83 | if bpe_src_code is not None and level == "bpe": 84 | # load bpe merge file 85 | merge_file = open(bpe_src_code, "r") 86 | bpe = apply_bpe.BPE(codes=merge_file) 87 | segmenter = lambda x: bpe.process_line(x.strip()) 88 | elif conf["level"] == "char": 89 | # split to chars 90 | segmenter = lambda x: list(x.strip()) 91 | else: 92 | segmenter = lambda x: x.strip() 93 | 94 | conf["preprocess"] = [tokenizer, segmenter] 95 | conf["postprocess"] = [detokenizer] 96 | # build model and load parameters into it 97 | model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"]) 98 | model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"]) 99 | model.load_state_dict(model_checkpoint["model_state"]) 100 | 101 | if conf["use_cuda"]: 102 | model.cuda() 103 | conf["model"] = model 104 | print("Joey NMT model loaded successfully.") 105 | return conf 106 | 107 | 108 | class Predicter(): 109 | # def __init__(self): 110 | # pass 111 | 112 | def translate(self, message_text, model, src_vocab, trg_vocab, preprocess, postprocess, 113 | logger, beam_size, beam_alpha, level, lowercase, 114 | max_output_length, use_cuda): 115 | """ 116 | Describes how to translate a text message. 117 | 118 | :param message_text: Slack command, could be text. 119 | :param model: The Joey NMT model. 120 | :param src_vocab: Source vocabulary. 121 | :param trg_vocab: Target vocabulary. 122 | :param preprocess: Preprocessing pipeline (a list). 123 | :param postprocess: Postprocessing pipeline (a list). 124 | :param beam_size: Beam size for decoding. 125 | :param beam_alpha: Beam alpha for decoding. 126 | :param level: Segmentation level. 127 | :param lowercase: Lowercasing. 128 | :param max_output_length: Maximum output length. 129 | :param use_cuda: Using CUDA or not. 
130 | :return: 131 | """ 132 | # ipdb.set_trace() 133 | sentence = message_text.strip() 134 | # remove emojis 135 | emoji_pattern = re.compile("\:[a-zA-Z]+\:") 136 | sentence = re.sub(emoji_pattern, "", sentence) 137 | sentence = sentence.strip() 138 | if lowercase: 139 | sentence = sentence.lower() 140 | for p in preprocess: 141 | sentence = p(sentence) 142 | 143 | # load the data which consists only of this sentence 144 | test_data, src_vocab, trg_vocab = load_line_as_data(lowercase=lowercase, 145 | line=sentence, src_vocab=src_vocab, trg_vocab=trg_vocab, level=level) 146 | 147 | # generate outputs 148 | score, loss, ppl, sources, sources_raw, references, hypotheses, \ 149 | hypotheses_raw, attention_scores = validate_on_data( 150 | model, data=test_data, batch_size=1, level=level, 151 | max_output_length=max_output_length, eval_metric=None, 152 | use_cuda=use_cuda, beam_size=beam_size, 153 | beam_alpha=beam_alpha, n_gpu=0) 154 | 155 | # validate_on_data(model: Model, data: Dataset, 156 | # batch_size: int, 157 | # use_cuda: bool, max_output_length: int, 158 | # level: str, eval_metric: Optional[str], 159 | # n_gpu: int, 160 | # batch_class: Batch = Batch, 161 | # compute_loss: bool = False, 162 | # beam_size: int = 1, beam_alpha: int = -1, 163 | # batch_type: str = "sentence", 164 | # postprocess: bool = True, 165 | # bpe_type: str = "subword-nmt", 166 | # sacrebleu: dict = None) \ 167 | 168 | # post-process 169 | if level == "char": 170 | response = "".join(hypotheses) 171 | else: 172 | response = " ".join(hypotheses) 173 | 174 | for p in postprocess: 175 | response = p(response) 176 | 177 | return response 178 | 179 | 180 | def predict_translation(self, source, model_dir, lc): 181 | new_config_path = os.path.join(model_dir, 'config.yaml') 182 | 183 | # joenmt takes as input a file, so for the moment 184 | # I made the code to write the input into a file, ... 
185 | 186 | if not os.path.exists(current_app.config['TEMP']): 187 | os.mkdir(current_app.config['TEMP']) 188 | 189 | path_to_temp = current_app.config['TEMP'] 190 | 191 | # if not os.path.exists("../../data/temps/"): 192 | # os.mkdir("../../data/temps/") 193 | # path_to_temp = "../../data/temps/" 194 | 195 | if not os.path.exists(path_to_temp): 196 | os.mkdir(path_to_temp) 197 | 198 | 199 | src_input_file = 'src_input.bpe.txt' 200 | # src_bpe_path = os.path.join(model_dir, 'src.bpe.model') 201 | 202 | # ted_link = 'https://raw.githubusercontent.com/juliakreutzer/masakhane-eval/master/data/multitarget-ted-filt.en.tsv' 203 | os.system(f'echo {source} > {path_to_temp}input.tsv') 204 | # src_data = SourceData(path_to_temp+'input.tsv', lc, \ 205 | # bpe_path=src_bpe_path, out_file=path_to_temp+src_input_file) 206 | # sources = src_data.get_sources() 207 | # ted_df = src_data.get_df() 208 | 209 | os.system(f"sed 's/@@ //g' {path_to_temp}{src_input_file} > {path_to_temp}src_input.txt") 210 | 211 | # os.system(f'echo {source} > input.txt') 212 | os.system(f'python -m joeynmt translate {new_config_path} < {path_to_temp}src_input.txt > {path_to_temp}trg_output_file') 213 | 214 | targets = post_process(path_to_temp+'trg_output_file', lc) 215 | # 216 | # with open('output.txt', 'r') as file: 217 | # output = file.read().replace('\n', '') 218 | 219 | # with open('trg_output_file', 'r') as file: 220 | # output = file.read().replace('\n', '') 221 | 222 | # return output 223 | 224 | return targets[0] if len(targets)>0 else "" 225 | 226 | 227 | class SourceData(): 228 | def __init__(self, data_link, lc, bpe_path, out_file): 229 | self._src_df = pd.read_csv(data_link, sep='\t', header=None, 230 | names=['source']) 231 | print("Loaded {} lines.".format(len(self._src_df))) 232 | self._bpe_model = self.load_bpe(bpe_path) 233 | self._src_df, self._sources = self.preprocess(out_file, lc) 234 | self.lc = lc 235 | 236 | def get_df(self): 237 | return self._src_df 238 | 239 | def 
get_sources(self): 240 | return self._sources 241 | 242 | def preprocess(self, out_file, lc): 243 | """Tokenize, (lowercase,) sub-word split. 244 | 245 | Using Polyglot since it was used for JW300. 246 | Preprocess the source column of a dataframe object and write to file. 247 | 248 | Pipeline: 249 | - tokenize 250 | - split into sub-words 251 | 252 | Append pre-processed sources to dataframe.""" 253 | tokenized_sentences = [] 254 | bped_sentences = [] 255 | sources = [] 256 | with open(out_file, 'w') as ofile: 257 | for i, row in self._src_df.iterrows(): 258 | sentence_i = Text(row[0]).sentences[0] 259 | tokenized_sentence = "" 260 | bped_sentence = "" 261 | tokenized = " ".join(sentence_i.words) 262 | sources.append(str(sentence_i)) 263 | if lc: 264 | tokenized = tokenized.lower() 265 | tokenized_sentence = tokenized 266 | bped = self._bpe_model.process_line(tokenized) 267 | bped_sentence = bped 268 | ofile.write("{}\n".format(bped)) 269 | tokenized_sentences.append(tokenized_sentence) 270 | bped_sentences.append(bped_sentence) 271 | data = self._src_df.assign( 272 | tokenized_sentences=tokenized_sentences) 273 | data = data.assign( 274 | bped_sentences=bped_sentences) 275 | return data, sources 276 | 277 | def load_bpe(self, bpe_path): 278 | with open(bpe_path, 'r') as ofile: 279 | bpe_model = apply_bpe.BPE(codes=ofile) 280 | return bpe_model 281 | 282 | # Post-processing 283 | def post_process(output_file, lc): 284 | """Load and detokenize translations. 285 | 286 | There is no given Polyglot detokenizer, so we do it by heuristics. 287 | """ 288 | targets = [] 289 | with open(output_file, 'r') as ofile: 290 | for line in ofile: 291 | sent = line.strip() 292 | sent = sent.replace('', '') 293 | sent = re.sub(r'\s+([?.!"-,:’])', r'\1', sent) 294 | sent = sent.replace('( ', '(').replace(' - ', '-').replace(' / ', '/').replace(' /', '/') 295 | if lc: 296 | # Cheap casing restoration... only first character but better than nothing. 
297 | sent = sent[0].upper() + sent[1:] 298 | targets.append(sent) 299 | return targets -------------------------------------------------------------------------------- /src/client/src/components/translateCard.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import { useState, useLayoutEffect,useRef, useEffect} from 'react'; 3 | import { Container, Row, Col, Form, Button, Modal, Toast, OverlayTrigger, Tooltip } from 'react-bootstrap'; 4 | import {CopyToClipboard} from 'react-copy-to-clipboard'; 5 | 6 | import MultiStepForm from './multiStepForm'; 7 | 8 | const MIN_TEXTAREA_HEIGHT = 200; 9 | 10 | export default function TranslateCard() { 11 | const [input, setText] = useState(""); 12 | const [translation, setTranslation] = useState('...'); 13 | const [srcLanguages, setSrcLanguages] = useState([]); 14 | const [tgtLanguages, setTgtLanguages] = useState([]); 15 | const [show, setShow] = useState(false); 16 | const [src_lang, setSrc_Lang] = useState('English'); 17 | const [tgt_lang, setTgt_Lang] = useState('Swahili'); 18 | const [feedBackForm, setFeedBackForm] = useState({}); 19 | const textareaRef = useRef(null); 20 | const textareaRef2= useRef(null); 21 | const [feedbackToken, setFeedbackToken] = useState( 22 | localStorage.getItem('feedbackToken') || '' 23 | ); 24 | 25 | const [copySuccess, setCopySuccess] = useState(''); 26 | const [showToast, setShowToast] = useState(''); 27 | 28 | const handleClose = () => setShow(false); 29 | const handleShow = () => setShow(true); 30 | 31 | const copyToClipboard = () => { 32 | setCopySuccess('Translation Copied!'); 33 | setShowToast(true); 34 | }; 35 | 36 | const handleChangeSrc_Lang= (e) => { 37 | //localstorage 38 | const name = e.target.value 39 | localStorage.setItem('src_lang', name); 40 | 41 | //set state 42 | setSrc_Lang(name); 43 | //get target languages 44 | const target = srcLanguages.filter(x => x.name === name) 45 | const target_languages = 
target[0].targets 46 | setTgtLanguages(target_languages) 47 | setTgt_Lang(target_languages[0].name) 48 | }; 49 | 50 | const handleChangeTgt_Lang = (e) => { 51 | //localstorage 52 | localStorage.setItem('tgt_lang', e.target.value); 53 | 54 | //set state 55 | setTgt_Lang(e.target.value); 56 | 57 | // console.log(e.target.value) 58 | 59 | }; 60 | 61 | const handleTranslate = (e) => { 62 | console.log('translating ..') 63 | console.log(src_lang) 64 | console.log(tgt_lang) 65 | e.preventDefault() 66 | 67 | fetch( 68 | '/translate', 69 | { 70 | method: 'post', 71 | // mode: 'no-cors', 72 | body: JSON.stringify({input, src_lang, tgt_lang}), 73 | headers: { 74 | 'Content-Type': 'application/json' 75 | }, 76 | // credentials: 'same-origin', 77 | }) 78 | .then(res => res.json()) 79 | .then(data => { 80 | console.log({ data }) 81 | // do something here 82 | setTranslation(data.output) 83 | }) 84 | }; 85 | 86 | const submitFeedBack = (formData) => { 87 | // first set state of feedback Form 88 | setFeedBackForm({...formData}); 89 | // then submit feedback form to db here 90 | // here's where you write the function to push feedback to backend 91 | 92 | console.log({formData}) 93 | 94 | fetch( 95 | '/save', 96 | { 97 | method: 'post', 98 | // mode: 'no-cors', 99 | body: JSON.stringify({ 100 | src_lang: formData.src_lang, 101 | tgt_lang: formData.tgt_lang, 102 | accurate_translation: formData.accurate_translation, 103 | know_src_lang: formData.know_src_lang, 104 | know_tgt_lang: formData.know_tgt_lang, 105 | own_translation: formData.own_translation, 106 | text: formData.text, 107 | translation: formData.translation, 108 | understand_translation: formData.understand_translation, 109 | feedbackToken: formData.feedbackToken 110 | }), 111 | headers: { 112 | 'Content-Type': 'application/json' 113 | }, 114 | // credentials: 'same-origin', 115 | }) 116 | .then(res => res.json()) 117 | .then(data => { 118 | //console.log({data}) 119 | // do something here 120 | handleClear() 121 | }) 122 
| 123 | } 124 | 125 | 126 | const handleClear = () => { 127 | // clears text part 128 | setText(''); 129 | // clear translation 130 | setTranslation('...'); 131 | } 132 | 133 | useLayoutEffect(() => { 134 | // Reset height - important to shrink on delete 135 | textareaRef.current.style.height = "inherit"; 136 | // Set height 137 | textareaRef.current.style.height = `${Math.max( 138 | textareaRef.current.scrollHeight, 139 | MIN_TEXTAREA_HEIGHT 140 | )}px`; 141 | }, [input]); 142 | 143 | useLayoutEffect(() => { 144 | // Reset height - important to shrink on delete 145 | textareaRef2.current.style.height = "inherit"; 146 | // Set height 147 | textareaRef2.current.style.height = `${Math.max( 148 | textareaRef2.current.scrollHeight, 149 | MIN_TEXTAREA_HEIGHT 150 | )}px`; 151 | }, [input]); 152 | 153 | // console.log({feedbackToken}); 154 | // console.log({tgt_lang}); 155 | 156 | // console.log({feedbackToken}); 157 | 158 | let srcLang = []; 159 | let tgtLang = []; 160 | 161 | useEffect(()=> { 162 | // define fetch function 163 | let src = []; 164 | let tgt = []; 165 | const fetchLanguages = async ()=> { 166 | await fetch( 167 | '/update', 168 | { 169 | method: 'get', 170 | headers: { 171 | 'Content-Type': 'application/json' 172 | }, 173 | }) 174 | await fetch( 175 | '/translate', 176 | { 177 | method: 'get', 178 | headers: { 179 | 'Content-Type': 'application/json' 180 | }, 181 | // credentials: 'same-origin', 182 | }) 183 | .then(res => res.json()) 184 | .then(data => { 185 | console.log({ data}) 186 | // do something here 187 | setSrcLanguages(data) 188 | setTgtLanguages(data[0].targets) 189 | 190 | }) 191 | 192 | 193 | } 194 | // call fetch function 195 | fetchLanguages() 196 | 197 | }, []) 198 | // console.log(srcLanguages) 199 | // console.log(tgtLanguages) 200 | // console.log(tgt_lang) 201 | 202 | return ( 203 | 204 | 205 | 212 | 213 | 214 |

GIVE FEEDBACK

215 |

We appreciate your feedback and your contribution will help make our translation better.

216 | 217 |
218 | 219 | 229 | 230 |
231 | 232 | 233 | 234 | 235 | 236 |
237 | 238 | From: 239 | 240 | { 241 | srcLanguages && srcLanguages.map((option, index) => { 242 | return () 243 | }) 244 | } 245 | 246 | 247 |
248 | 249 | {/* 250 | 251 | { 252 | srcLanguages.length > 1 && srcLanguages 253 | .filter(x => x.value !== src_lang) 254 | .slice(0, 2) 255 | .map((option, index) => { 256 | return ( 257 | ) 258 | }) 259 | } 260 | 261 | */} 262 |
263 | 264 | 265 | 266 | 267 |
268 | 269 | To: 270 | 271 | 272 | { 273 | tgtLanguages.map((option, index) => { 274 | return () 275 | }) 276 | } 277 | 278 | 279 | 280 |
281 | 282 | {/* 283 | 284 | { 285 | tgtLanguages.length > 1 && tgtLanguages 286 | .filter(x => x.value !== tgt_lang) 287 | .slice(0, 2) 288 | .map((option, index) => { 289 | return ( 290 | ) 291 | }) 292 | } 293 | 294 | */} 295 |
296 | 297 |
298 | 299 | 300 |
301 | 302 | setText(e.target.value)} 311 | /> 312 | 313 |
314 | 315 | 316 | 317 | 318 | 319 | 320 | {' '} 321 | 322 | 323 | 324 | 325 |
326 | 327 | setText(e.target.value)} 339 | // autoFocus={showToast} 340 | /> 341 | {!translation && ( 342 | 343 | Sorry, there’s no translation for that phrase. 344 | 345 | )} 346 | 347 |
348 | 349 | 350 | 351 | {/* */} 352 | 353 | 354 | 358 | Copy Translation. 359 | 360 | } 361 | > 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 |
370 | 371 |
372 | setShowToast(false)} 374 | show={showToast} 375 | delay={3000} 376 | autohide 377 | style={{ 378 | position: 'absolute', 379 | bottom: 0, 380 | left: 0 381 | }} 382 | > 383 | {copySuccess} 384 | 385 |
386 |
387 | ) 388 | } 389 | --------------------------------------------------------------------------------