├── src
├── server
│ ├── __init__.py
│ ├── core
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── test_config.py
│ │ │ └── test_app.py
│ │ ├── extensions.py
│ │ ├── models
│ │ │ ├── translation.py
│ │ │ ├── language.py
│ │ │ ├── feedback.py
│ │ │ └── predict.py
│ │ ├── config.py
│ │ ├── utils_bucket
│ │ │ ├── upload_download.py
│ │ │ └── bucket.py
│ │ ├── __init__.py
│ │ ├── utils.py
│ │ ├── resources
│ │ │ └── translate.py
│ │ └── model_load.py
│ ├── nginx
│ │ ├── Dockerfile
│ │ └── nginx.conf
│ ├── .env.dev
│ ├── entrypoint.sh
│ ├── entrypoint.prod.sh
│ ├── Dockerfile
│ ├── requirements.txt
│ ├── Dockerfile.prod
│ ├── manage.py
│ └── README.md
├── client
│ ├── public
│ │ ├── robots.txt
│ │ ├── favico.png
│ │ ├── favicon.ico
│ │ ├── e5b14e8b30296b86b78d06886aa5a458.png
│ │ ├── manifest.json
│ │ ├── bundle.js.LICENSE.txt
│ │ ├── index.html
│ │ └── 217.bundle.js
│ ├── src
│ │ ├── images
│ │ │ ├── logo1.png
│ │ │ ├── logo2.png
│ │ │ ├── favico.png
│ │ │ ├── favicon.ico
│ │ │ ├── masakhane.png
│ │ │ ├── masakhane_bg.png
│ │ │ ├── masakhane_bg2.png
│ │ │ └── masakhane-border.png
│ │ ├── App.test.js
│ │ ├── components
│ │ │ ├── step3.test.js
│ │ │ ├── multiStepForm.test.js
│ │ │ ├── translateCard.test.js
│ │ │ ├── step1.test.js
│ │ │ ├── common
│ │ │ │ ├── radioButton.js
│ │ │ │ └── radioButton.test.js
│ │ │ ├── step2.test.js
│ │ │ ├── step3.js
│ │ │ ├── multiStepForm.js
│ │ │ ├── terms.js
│ │ │ ├── step1.js
│ │ │ ├── step2.js
│ │ │ └── translateCard.js
│ │ ├── setupTests.js
│ │ ├── setupProxy.js
│ │ ├── index.css
│ │ ├── reportWebVitals.js
│ │ ├── index.js
│ │ ├── pages
│ │ │ ├── Home.js
│ │ │ ├── Faq.js
│ │ │ └── About.js
│ │ ├── logo.svg
│ │ └── App.js
│ ├── README.md
│ ├── Dockerfile
│ ├── package.json
│ └── webpack.config.js
├── m_to_m_models
│ ├── kubernetes
│ │ ├── volume_claim.yaml
│ │ ├── volume.yaml
│ │ ├── secret.yaml
│ │ ├── triton-deployment.yaml
│ │ └── deployment.yaml
│ ├── app.py
│ ├── main.py
│ ├── model_handlers.py
│ ├── Dockerfile
│ └── requirements.txt
└── torchserve
│ ├── setup_config.json
│ ├── Download_Transformer_models.py
│ └── transformer_handler.py
├── .python-version
├── .dockerignore
├── entrypoint.sh
├── kubernetes
├── ingress-def.yml
└── sample-server.yaml
├── .github
└── ISSUE_TEMPLATE
│ └── dsfsi-standard-template.md
├── docker-compose.prod.yml
├── LICENSE
├── todo.md
├── docker-compose.yml
├── .gitignore
├── docs
├── start_app_prod_doc.md
├── debugging_setup.md
├── project_details.md
└── start_app_locally_doc.md
├── requirements-python3.10.txt
├── environment.yaml
├── README.md
└── Makefile
/src/server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.7
2 |
--------------------------------------------------------------------------------
/src/server/core/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/client/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/src/client/public/favico.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/favico.png
--------------------------------------------------------------------------------
/src/client/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/favicon.ico
--------------------------------------------------------------------------------
/src/client/src/images/logo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/logo1.png
--------------------------------------------------------------------------------
/src/client/src/images/logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/logo2.png
--------------------------------------------------------------------------------
/src/client/src/images/favico.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/favico.png
--------------------------------------------------------------------------------
/src/client/src/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/favicon.ico
--------------------------------------------------------------------------------
/src/client/src/images/masakhane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane.png
--------------------------------------------------------------------------------
/src/client/src/images/masakhane_bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane_bg.png
--------------------------------------------------------------------------------
/src/client/src/images/masakhane_bg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane_bg2.png
--------------------------------------------------------------------------------
/src/server/nginx/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nginx:1.17-alpine
2 |
3 | RUN rm /etc/nginx/conf.d/default.conf
4 | COPY nginx.conf /etc/nginx/conf.d
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | env
2 | .dockerignore
3 | Dockerfile-dev
4 | Dockerfile-prod
5 |
6 | src/server/models/joeynmt
7 | src/server/core/models/joeynmt
--------------------------------------------------------------------------------
/src/client/src/images/masakhane-border.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/src/images/masakhane-border.png
--------------------------------------------------------------------------------
/src/client/public/e5b14e8b30296b86b78d06886aa5a458.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/masakhane-web/HEAD/src/client/public/e5b14e8b30296b86b78d06886aa5a458.png
--------------------------------------------------------------------------------
/src/server/core/extensions.py:
--------------------------------------------------------------------------------
"""Shared Flask extension instances.

The extensions are created unbound here so other modules can import
``db``/``migrate`` without circular imports; they are presumably bound
to the app elsewhere via ``init_app`` — confirm in the app factory.
"""
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate

# Database ORM handle shared across the server package.
db = SQLAlchemy()
# Alembic-backed migration helper, used alongside ``db``.
migrate = Migrate()
--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Container entrypoint: block until the Postgres service is reachable,
# then start the application.

echo "Waiting for postgres ..."

# Busy-wait until the users-db host accepts TCP connections on 5432.
while ! nc -z users-db 5432; do
    sleep 0.1
done
echo "PostgreSQL started"

# exec so the Python process replaces the shell (PID 1) and receives
# SIGTERM/SIGINT directly from the container runtime on shutdown.
exec python app.py
--------------------------------------------------------------------------------
/src/client/src/App.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import App from './App';
3 |
4 | describe('App', () => {
5 | test('renders App component', () => {
6 | render( );
7 | });
8 | });
9 |
--------------------------------------------------------------------------------
/src/client/src/components/step3.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import Step3 from './step3';
3 |
4 | describe('Step3', () => {
5 | test('renders Step3 component', () => {
6 | render( );
7 | });
8 | });
--------------------------------------------------------------------------------
/src/server/.env.dev:
--------------------------------------------------------------------------------
1 | FLASK_APP=core/__init__.py
2 | FLASK_ENV=development
3 | DATABASE_URL=postgresql://masakhane:masakhane@db:5432/masakhane
4 | SQL_HOST=db
5 | SQL_PORT=5432
6 | DATABASE=postgres
7 | SECRET_KEY=secret-key
8 | MODEL=./models/joeynmt/
9 | FLASK_DEBUG=1
10 |
--------------------------------------------------------------------------------
/src/client/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 |
--------------------------------------------------------------------------------
/src/client/src/components/multiStepForm.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import MultiStepForm from './multiStepForm';
3 |
4 | describe('MultiStepForm', () => {
5 | test('renders MultiStepForm component', () => {
6 | render( );
7 | });
8 | });
--------------------------------------------------------------------------------
/src/client/src/components/translateCard.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import TranslateCard from './translateCard';
3 |
4 | describe('TranslateCard', () => {
5 | test('renders TranslateCard component', () => {
6 | render( );
7 | });
8 | });
--------------------------------------------------------------------------------
/src/m_to_m_models/kubernetes/volume_claim.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: masakhane-model-cache-volume-claim
5 | namespace: masakhane
6 | spec:
7 | storageClassName: manual
8 | accessModes:
9 | - ReadWriteOnce
10 | resources:
11 | requests:
12 | storage: 8Gi
13 |
--------------------------------------------------------------------------------
/src/client/src/setupProxy.js:
--------------------------------------------------------------------------------
1 | // const { createProxyMiddleware } = require('http-proxy-middleware');
2 |
3 | // module.exports = function(app) {
4 | // app.use(
5 | // '/translate',
6 | // createProxyMiddleware({
7 | // target: 'http://localhost:5000',
8 | // changeOrigin: true,
9 | // })
10 | // );
11 | // };
--------------------------------------------------------------------------------
/src/server/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# API container entrypoint: wait for the database, seed it, then hand
# control to the container CMD via exec.

if [ "$DATABASE" = "postgres" ]
then
    echo "Waiting for postgres..."

    # Quote the variables: unquoted, an empty SQL_HOST/SQL_PORT would be
    # word-split away and silently change nc's argument list (SC2086).
    while ! nc -z "$SQL_HOST" "$SQL_PORT"; do
        sleep 0.1
    done

    echo "PostgreSQL started"
fi

python manage.py create_db
python manage.py add_language en-sw-JW300
exec "$@"
--------------------------------------------------------------------------------
/src/torchserve/setup_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "masakhane/m2m100_418M_en_swa_rel_news",
3 | "mode": "text_generation",
4 | "do_lower_case":false,
5 | "num_labels":"0",
6 | "save_mode":"pretrained",
7 | "max_length":"150",
8 | "captum_explanation":true,
9 | "embedding_name": "bert",
10 | "FasterTransformer":false,
11 | "BetterTransformer":false,
12 | "model_parallel":false
13 | }
14 |
--------------------------------------------------------------------------------
/src/m_to_m_models/kubernetes/volume.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolume
3 | metadata:
4 | name: translation-volume-storage
5 | namespace: masakhane
6 | labels:
7 | type: local
8 | spec:
9 | storageClassName: manual
10 | accessModes:
11 | - ReadWriteOnce
12 | capacity:
13 | storage: 10Gi
14 | hostPath:
15 | path: /models_datastore # the host on the minikube vm
16 |
--------------------------------------------------------------------------------
/kubernetes/ingress-def.yml:
--------------------------------------------------------------------------------
1 | apiVersion: networking.k8s.io/v1
2 | kind: Ingress
3 | metadata:
4 | name: seldon-ingress
5 | namespace: seldon
6 | spec:
7 | rules:
8 | - host: seldon-ingress.com
9 | http:
10 | paths:
11 | - path: "/"
12 | pathType: Prefix
13 | backend:
14 | service:
15 | name: iris-model-sklearn-iris-predictor
16 | port:
17 | number: 8000
18 |
--------------------------------------------------------------------------------
/src/client/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/src/client/src/reportWebVitals.js:
--------------------------------------------------------------------------------
1 | const reportWebVitals = onPerfEntry => {
2 | if (onPerfEntry && onPerfEntry instanceof Function) {
3 | import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
4 | getCLS(onPerfEntry);
5 | getFID(onPerfEntry);
6 | getFCP(onPerfEntry);
7 | getLCP(onPerfEntry);
8 | getTTFB(onPerfEntry);
9 | });
10 | }
11 | };
12 |
13 | export default reportWebVitals;
14 |
--------------------------------------------------------------------------------
/src/m_to_m_models/kubernetes/secret.yaml:
--------------------------------------------------------------------------------
apiVersion: v1
kind: Secret
metadata:
  name: masakhane-container-secret
  namespace: masakhane
type: Opaque
# SECURITY(review): these are the MinIO *default* credentials, committed in
# plain text. Rotate them and inject via a sealed/external secret instead of
# keeping them in version control.
stringData:
  RCLONE_CONFIG_S3_TYPE: s3
  RCLONE_CONFIG_S3_PROVIDER: minio
  RCLONE_CONFIG_S3_ENV_AUTH: "false"
  RCLONE_CONFIG_S3_ACCESS_KEY_ID: minioadmin
  RCLONE_CONFIG_S3_SECRET_ACCESS_KEY: minioadmin
  # In-cluster MinIO endpoint used by rclone when pulling model artifacts.
  RCLONE_CONFIG_S3_ENDPOINT: http://minio.minio-system.svc.cluster.local:9000
14 |
--------------------------------------------------------------------------------
/src/client/src/components/step1.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import Step1 from './step1';
3 |
4 | describe('Step1', () => {
5 | test('renders Step1 component', () => {
6 | const props = {
7 | src_lang: "none",
8 | tgt_lang: "none",
9 | setForm: () => {},
10 | formData: {},
11 | navigation: {},
12 | handleSubmitFeedback: () => {}
13 | };
14 | render( );
15 | });
16 | });
17 |
--------------------------------------------------------------------------------
/src/server/core/tests/base.py:
--------------------------------------------------------------------------------
from flask_testing import TestCase
from core.extensions import db
from core import masakhane, load_model


class BaseTestCase(TestCase):
    """Shared fixture for server tests: app wiring plus a clean database
    created before and dropped after every test."""

    def create_app(self):
        """Return the shared Flask app configured with the base Config."""
        app = masakhane
        app.config.from_object('core.config.Config')
        return app

    def setUp(self):
        """Create every table on a fresh schema for the test about to run."""
        db.create_all()
        db.session.commit()

    def tearDown(self):
        """Detach the session and drop all tables so tests stay isolated."""
        db.session.remove()
        db.drop_all()
--------------------------------------------------------------------------------
/src/client/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "Masakhane Web",
3 |   "name": "Masakhane Web",
4 | "icons": [
5 | {
6 | "src": "favico.png",
7 | "type": "image/png",
8 | "sizes": "192x192"
9 | },
10 | {
11 | "src": "favico.png",
12 | "type": "image/png",
13 | "sizes": "512x512"
14 | }
15 | ],
16 | "start_url": ".",
17 | "display": "standalone",
18 | "theme_color": "#000000",
19 | "background_color": "#ffffff"
20 | }
21 |
--------------------------------------------------------------------------------
/src/client/src/components/common/radioButton.js:
--------------------------------------------------------------------------------
import { Form } from 'react-bootstrap';
import React from 'react';

// NOTE(review): the JSX element tree inside this component's return appears
// to have been stripped during text extraction — only `{label}` survives.
// Presumably this rendered a react-bootstrap <Form.Check type="radio" ... />
// using value/selected/otherProps; recover the original markup from version
// control before editing.
const RadioButton = ({ value, label, selected, ...otherProps }) => {
    return(

{label}

    );
}

export default RadioButton;
--------------------------------------------------------------------------------
/src/client/src/components/step2.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import Step2 from './step2';
3 |
4 | describe('Step2', () => {
5 | test('renders Step2 component', () => {
6 | const props = {
7 | src_lang: "none",
8 | tgt_lang: "none",
9 | text: "",
10 | translation: "",
11 | setForm: () => {},
12 | formData: {},
13 | navigation: {},
14 | handleSubmitFeedback: () => {}
15 | };
16 | render( );
17 | });
18 | });
--------------------------------------------------------------------------------
/src/server/core/models/translation.py:
--------------------------------------------------------------------------------
class Translation:
    """Value object bundling one translation exchange (request + result)."""

    # NOTE: the parameter name ``input`` shadows the builtin; it is kept
    # because it is part of the public constructor signature.
    def __init__(self, src_lang, tgt_lang, input, output) -> None:
        super().__init__()
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.input = input
        self.output = output

    @property
    def data(self):
        """Plain-dict view of this translation (e.g. for a JSON response)."""
        fields = ('src_lang', 'tgt_lang', 'input', 'output')
        return {field: getattr(self, field) for field in fields}
--------------------------------------------------------------------------------
/src/server/entrypoint.prod.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Production API entrypoint: wait for the database, optionally reset it in
# development, seed it, then hand control to the container CMD via exec.

if [ "$DATABASE" = "postgres" ]
then
    echo "Waiting for postgres..."

    # Quote the variables: unquoted, an empty SQL_HOST/SQL_PORT would be
    # word-split away and silently change nc's argument list (SC2086).
    while ! nc -z "$SQL_HOST" "$SQL_PORT"; do
        sleep 0.1
    done

    echo "PostgreSQL started"
fi

# Only wipe tables when explicitly running in development mode.
if [ "$FLASK_ENV" = "development" ]
then
    echo "Creating the database tables..."
    python manage.py clean
    echo "Tables created"
fi

python manage.py create_db

python manage.py add_language en-sw-JW300

exec "$@"
--------------------------------------------------------------------------------
/src/client/src/components/common/radioButton.test.js:
--------------------------------------------------------------------------------
1 | import {
2 | render,
3 | screen,
4 | getByRole,
5 | findByText,
6 | } from '@testing-library/react';
7 | import RadioButton from './radioButton';
8 |
9 | describe('RadioButton', () => {
10 | test('renders RadioButton component', () => {
11 | render( );
12 | });
13 |
14 | // test('should have a radio button input', () => {
15 |
16 | // })
17 |
18 | // test('should fire an onchange event', () => {
19 |
20 | // })
21 |
22 | });
23 |
--------------------------------------------------------------------------------
/src/client/src/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom';
3 | import App from './App';
4 | import reportWebVitals from './reportWebVitals';
5 | import "core-js/stable";
6 | import "regenerator-runtime/runtime";
7 |
8 | ReactDOM.render(
9 | ,
10 | document.getElementById('root')
11 | );
12 |
13 | // If you want to start measuring performance in your app, pass a function
14 | // to log results (for example: reportWebVitals(console.log))
15 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
16 | reportWebVitals();
17 |
--------------------------------------------------------------------------------
/src/m_to_m_models/kubernetes/triton-deployment.yaml:
--------------------------------------------------------------------------------
apiVersion: machinelearning.seldon.io/v1
kind: SeldonDeployment
metadata:
  name: triton-masakhane
  namespace: masakhane
spec:
  name: default
  predictors:
    - graph:
        implementation: TRITON_SERVER
        logger:
          mode: all
        # Model artifacts pulled from the in-cluster MinIO bucket; the
        # rclone credentials come from the referenced secret.
        modelUri: s3://language-models/onnx-m2m100/1
        envSecretRefName: masakhane-container-secret
        # NOTE(review): "triston" looks like a typo for "triton". Renaming
        # changes the deployed predictor's resource name, so fix it as a
        # deliberate, coordinated change rather than silently.
        name: triston-masakhane-predictor
        type: MODEL
      name: default
      replicas: 1
      protocol: kfserving
20 |
--------------------------------------------------------------------------------
/src/client/README.md:
--------------------------------------------------------------------------------
1 | # The Frontend
2 |
3 | **NOTE** This frontend documentation needs review and updating by someone familiar with the client code.
4 |
5 | The client is running on http://localhost:3000
6 |
7 | It consists of
8 | - ReactJS
9 | - Webpack
10 |
11 | # Available npm scripts:
12 |
13 | | Command | Executes |
14 | | ------- | -------- |
15 | | `npm run develop` | `webpack-dev-server --host 0.0.0.0` |
16 | | `npm run start-api` | `cd ../server && python app.py` |
17 | | `npm run build` | `react-scripts build` |
18 | | `npm run test` | `react-scripts test` |
19 | | `npm run eject` | `react-scripts eject` |
--------------------------------------------------------------------------------
/src/client/Dockerfile:
--------------------------------------------------------------------------------
# Building the application
FROM node:lts-buster as build

WORKDIR /app

# Make locally-installed binaries (webpack, babel, ...) resolvable.
ENV PATH /app/node_modules/.bin:$PATH
# Increase node max memory, the default memory limit is too low for building
ENV NODE_OPTIONS --max-old-space-size=8192

# add dependencies
COPY package.json package-lock.json ./
# install dependencies
RUN npm install --legacy-peer-deps
RUN npm i webpack webpack-cli --legacy-peer-deps
RUN npm i @babel/core @babel/preset-env @babel/preset-react babel-loader --legacy-peer-deps

# add app
COPY . ./

# RUN npm command
CMD ["npm", "run", "develop"]
22 |
--------------------------------------------------------------------------------
/src/client/src/components/step3.js:
--------------------------------------------------------------------------------
import { Button } from 'react-bootstrap';
import React from 'react';

// NOTE(review): the JSX returned by this component was lost during text
// extraction — only the text nodes ("THANK YOU!", "Done") and a
// commented-out paragraph survive. Recover the original markup (presumably
// a container with a react-bootstrap <Button onClick={handleShow}>) from
// version control before editing.
const Step3 = ({ setShow }) => {
    const handleShow = () => setShow(false);

    return (

THANK YOU!
{/*
We appreciate your feedback and your contribution which help us make translations better.
*/}

Done


    )
}

export default Step3;
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/dsfsi-standard-template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: DSFSI Standard Template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | #### Description
11 | A clear and concise description of what the issue is about.
12 |
13 | #### Screenshots
14 | 
15 |
16 | #### Files
17 | A list of relevant files for this issue. This will help people navigate the project and offer some clues of where to start.
18 |
19 | #### To Reproduce
20 | If this issue is describing a bug, include some steps to reproduce the behavior.
21 |
22 | #### Tasks
23 | Include specific tasks in the order they need to be done in. Include links to specific lines of code where the task should happen at.
24 | - [ ] Task 1
25 | - [ ] Task 2
26 | - [ ] Task 3
27 |
--------------------------------------------------------------------------------
/kubernetes/sample-server.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: machinelearning.seldon.io/v1
2 | kind: SeldonDeployment
3 | metadata:
4 | name: iris-model
5 | namespace: seldon
6 | spec:
7 | name: iris
8 | annotations:
9 | prometheus.io/scrape: "false"
10 | predictors:
11 | - componentSpecs:
12 | - spec:
13 | containers:
14 | - env:
15 | - name: SELDON_LOG_LEVEL
16 | value: DEBUG
17 | - name: SELDON_DEBUG
18 | value: 'True'
19 | - name: FLASK_DEBUG
20 | value: 'True'
21 | image: seldonio/sklearn-iris:0.3
22 | imagePullPolicy: IfNotPresent
23 | name: sklearn-iris-classifier
24 | graph:
25 | endpoint:
26 | type: REST
27 | name: sklearn-iris-classifier
28 | type: MODEL
29 | name: sklearn-iris-predictor
30 | replicas: 1
31 |
--------------------------------------------------------------------------------
/src/m_to_m_models/app.py:
--------------------------------------------------------------------------------
from flask import Flask, jsonify, request
from flask_cors import CORS
import logging


logger = logging.getLogger(__name__)

def create_app(model_handler):
    """Build a Flask app exposing ``model_handler`` over a REST endpoint.

    Parameters:
        model_handler: object with a ``predict_raw(request_data)`` method
            returning a JSON-serializable response.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__, static_url_path="")
    CORS(app)  # allow cross-origin calls from the web client

    @app.route("/predict", methods=["GET", "POST"])
    def predict():
        # NOTE(review): GET requests carry no JSON body, so request_data can
        # be None here — confirm model_handler.predict_raw tolerates that.
        request_data = request.get_json()
        # Log the parsed payload; the raw Request object's repr hides the body
        # (previously this logged ``request`` itself).
        logger.debug("REST Request: %s", request_data)
        response = model_handler.predict_raw(request_data)

        json_response = jsonify(response)
        # Propagate an HTTP status code when the handler embeds one under
        # response["status"]["code"].
        if (
            isinstance(response, dict)
            and "status" in response
            and "code" in response["status"]
        ):
            json_response.status_code = response["status"]["code"]

        logger.debug("REST Response: %s", response)
        return json_response

    return app
30 |
--------------------------------------------------------------------------------
/src/server/nginx/nginx.conf:
--------------------------------------------------------------------------------
# Backend pool: the Flask/gunicorn API container (service "api" in compose).
upstream masakhane-web {
    server api:5000;
}

# Frontend pool: the React client container (service "client" in compose).
upstream masakhane-web-client {
    server client:3000;
}

server {

    listen 80;

    root /images/;

    # Everything not matched below goes to the client app.
    location / {
        proxy_pass http://masakhane-web-client;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_redirect off;
    }

    # API route: translation requests.
    location /translate {
        proxy_pass http://masakhane-web;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_redirect off;
    }

    # API route: feedback/save requests.
    location /save {
        proxy_pass http://masakhane-web;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_redirect off;
    }
}
--------------------------------------------------------------------------------
/docker-compose.prod.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | api:
5 | build :
6 | context: ./src/server
7 | dockerfile: Dockerfile.prod
8 | command: gunicorn --bind 0.0.0.0:5000 manage:masakhane
9 | ports:
10 | - 5000:5000
11 | # expose:
12 | # - 5000
13 | env_file:
14 | - ./.env.prod
15 | depends_on:
16 | - db
17 |
18 | nginx:
19 | build: ./src/server/nginx
20 | ports:
21 | - 80:80
22 | depends_on:
23 | - api
24 |
25 | db:
26 | image: postgres:12-alpine
27 | volumes:
28 | - postgres_data:/var/lib/postgresql/data/
29 | env_file:
30 | - ./.env.prod.db
31 |
32 | client:
33 | build :
34 | context: ./src/client
35 | dockerfile: Dockerfile
36 | # command: curl --location --request GET 'http://0.0.0.0:5000/update' --data-raw ''
37 | volumes:
38 | - './src/client:/usr/src/app'
39 | ports:
40 | - 3000:3000
41 |
42 | depends_on:
43 | - api
44 |
45 | volumes:
46 | postgres_data:
--------------------------------------------------------------------------------
/src/server/core/config.py:
--------------------------------------------------------------------------------
import os

# Absolute path of the directory containing this config module.
basedir = os.path.abspath(os.path.dirname(__file__))


class Config:
    """Base configuration shared by every environment."""
    DEBUG = False
    # Falls back to a local SQLite database when DATABASE_URL is unset.
    SQLALCHEMY_DATABASE_URI = os.getenv("DATABASE_URL", "sqlite:///masakhane.db")

    SQLALCHEMY_TRACK_MODIFICATIONS = False
    # Directory holding the translation models (JoeyNMT, judging by the
    # default path — confirm against model_load.py).
    MODEL = os.getenv("MODEL", "./models/joeynmt/")
    TEMP = "./temp/"
    MODEL_ALL_FILE = "./available_models.tsv"
    JSON = "./languages.json"


class DevelopmentConfig(Config):
    """Local development settings."""
    DEBUG = True
    # NOTE(review): hard-coded secret key — acceptable only for local dev;
    # production reads SECRET_KEY from the environment (see ProductionConfig).
    SECRET_KEY = 'super-secret-key'
    # Shadows the module-level basedir with an identical value.
    basedir = os.path.abspath(os.path.dirname(__file__))
    FLASK_DEBUG=1


class StagingConfig(Config):
    """
    This is an imitation of the production environment for
    testing purpose.
    """
    DEBUG = True
    TESTING = True
    SECRET_KEY = os.getenv('SECRET_KEY', "key_testing")
    # MODEL = os.getenv('MODEL', "./")


class ProductionConfig(Config):
    """Production settings: secrets must come from the environment."""
    SECRET_KEY = os.getenv('SECRET_KEY', "key_production")
    # MODEL = os.getenv('MODEL', "./")
38 |
--------------------------------------------------------------------------------
/src/server/core/utils_bucket/upload_download.py:
--------------------------------------------------------------------------------
"""One-off script for uploading/downloading a blob to the project's GCS bucket."""
from os import name, path
from google.cloud.storage import Blob
from google.cloud import storage


client = storage.Client(project="dsfsi-232208")
bucket = client.get_bucket("maskhane-web-test")
# SECURITY(review): encryption key committed to source control — move it
# into an environment variable or secret manager and rotate it.
encryption_key = "c7f32af42e45e85b9848a6a14dd2a8f6"

# blob = Blob("secure-data", bucket, encryption_key=encryption_key)
blob = Blob("secure-data", bucket)



# Download
# blob.upload_from_string("my secret message.")
# with open("/tmp/my-secure-file", "wb") as file_obj:
#     client.download_to_file(blob, file_obj)

if __name__ == "__main__":
    path_to_file_for_upload = "../../data/external/available_models.tsv"
    # if (path.exists(path_to_file_for_upload)):
    #     # Upload
    #     with open(path_to_file_for_upload, "rb") as my_file:
    #         print("yes")
    #         blob.upload_from_file(my_file)

    # Must be a file path, not a directory: open("../../data/", "wb")
    # raises IsADirectoryError before any download happens.
    where_to_download = "../../data/available_models.tsv"
    with open(where_to_download, "wb") as file_obj:
        client.download_to_file(blob, file_obj)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Data Science for Social Impact @ University of Pretoria
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/todo.md:
--------------------------------------------------------------------------------
1 | - should not put the model in a docker container, use the file storage instead and make it available as a volume to the container
2 | - use a model registry to store models, build one with mlflow.
3 | - Run different services for each model, and use a load balancer to route the requests to the right model.
4 |
5 |
6 |
torch-model-archiver --model-name MasakhaneEnSwaRelNews \
8 | --version 1.0 \
9 | --serialized-file src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/pytorch_model.bin \
10 | --handler src/torchserve/transformer_handler.py \
11 | --extra-files "src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/config.json,
12 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/special_tokens_map.json,
13 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/tokenizer_config.json,
14 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/vocab.json,
15 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/generation_config.json,
16 | src/torchserve/transformer_models/masakhane/m2m100_418M_en_swa_rel_news/sentencepiece.bpe.model"
17 |
--------------------------------------------------------------------------------
/src/server/core/utils_bucket/bucket.py:
--------------------------------------------------------------------------------
1 | from google.cloud import storage
2 | from google.oauth2 import service_account
3 | import pathlib, io, ipdb
4 |
5 | # credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)
6 |
7 | client = storage.Client(
8 | project="dsfsi-232208",
9 | # credentials=credentials
10 | )
11 |
12 |
13 | from google.cloud import storage
14 | from zipfile import ZipFile, ZipInfo
15 |
def upload():
    """Zip the en-lua JoeyNMT model directory in memory and upload it to the
    "maskhane-web-test" GCS bucket as a single zip object.

    Raises:
        FileNotFoundError: if the model directory does not exist.
    """
    source_dir = pathlib.Path("../../models/joeynmt/en-lua/")

    archive = io.BytesIO()
    with ZipFile(archive, 'w') as zip_archive:
        for file_path in source_dir.iterdir():
            # BUG FIX: model files (checkpoints, vocabularies) are binary;
            # reading them in text mode ('r') corrupts the archive or raises
            # UnicodeDecodeError. Read bytes instead.
            with open(file_path, 'rb') as file:
                zip_entry_name = file_path.name
                zip_file = ZipInfo(zip_entry_name)
                zip_archive.writestr(zip_file, file.read())

    # (removed a leftover ipdb.set_trace() debugging breakpoint here)
    archive.seek(0)

    object_name = 'super-important-data-v1'
    bucket = client.bucket("maskhane-web-test")

    blob = storage.Blob(object_name, bucket)
    blob.upload_from_file(archive, content_type='application/zip')


if __name__ == "__main__":
    # BUG FIX: guarded so that merely importing this module no longer
    # triggers a network upload as a side effect.
    upload()
--------------------------------------------------------------------------------
/src/client/src/pages/Home.js:
--------------------------------------------------------------------------------
1 | import { Navbar, Nav, Container, Jumbotron, Image, Row, Col } from 'react-bootstrap'
2 | import React from 'react';
3 | import TranslateCard from '../components/translateCard';
4 | import image from '../images/masakhane-border.png';
5 |
// Landing page component: renders the translation card plus the project
// disclaimer and a pointer to JoeyNMT.
// NOTE(review): the JSX markup in this file appears to have been stripped
// during extraction (the return block below contains bare text with no
// tags) -- recover the original element tree from version control before
// editing this component.
function Home() {
return (




This is a community research project and as such, this service is not a production system. Therefore, it should not be used for official translations . Don't see your language and interested in training one up yourself? Go here to learn how to contribute a model!
The models are powered by JoeyNMT 🐨 ; a minimalist machine translation toolkit based on pytorch.


);
}

export default Home;
23 |
--------------------------------------------------------------------------------
/src/server/Dockerfile:
--------------------------------------------------------------------------------
# base image
FROM python:3.6.9

# set working directory
WORKDIR /usr/src/app

# set environment variables
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

# install system dependencies (netcat is used by entrypoint.sh to wait for the db)
RUN apt-get update && apt-get install -y netcat gnupg lsb-release wget

# add the gcsfuse apt repository for this Debian release and install gcsfuse
RUN lsb_release -c -s > /tmp/lsb_release
RUN GCSFUSE_REPO=$(cat /tmp/lsb_release); echo "deb http://packages.cloud.google.com/apt gcsfuse-$GCSFUSE_REPO main" | tee /etc/apt/sources.list.d/gcsfuse.list
RUN wget -O - https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -

RUN apt-get update
RUN apt-get install -y gcsfuse


# install python dependencies first so this layer is cached independently
# of application-code changes
COPY ./requirements.txt /usr/src/app/requirements.txt
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

# add entrypoint.sh and make it executable at build time.
# BUG FIX: the original used `CMD ["sh","-c","chmod 777 ..."]`, but when an
# ENTRYPOINT is present, CMD is passed as *arguments* to the entrypoint, so
# that chmod never executed. A RUN step performs it during the build instead.
COPY ./entrypoint.sh /usr/src/app/entrypoint.sh
RUN chmod +x /usr/src/app/entrypoint.sh

# add app
COPY . /usr/src/app


# run server (https://github.com/testdrivenio/testdriven-app/issues/25)
ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
40 |
--------------------------------------------------------------------------------
/src/m_to_m_models/main.py:
--------------------------------------------------------------------------------
1 | from src.seldon_core_components.app import create_app
2 | from typing import Tuple, List
3 | from pydoc import locate
4 | import argparse
5 |
def parse_args() -> Tuple[argparse.Namespace, List[str]]:
    """Parse the CLI options for the translation microservice.

    Recognised options (all required strings):
        --model_handler : dotted path to the model-handler class
        --model_path    : filesystem path to the model
        --src_lang      : source language code
        --trg_lang      : target language code

    Returns:
        Tuple[argparse.Namespace, List[str]]: the parsed namespace plus any
        unrecognised arguments, passed through untouched.
    """
    parser = argparse.ArgumentParser()
    # all four options share the same shape, so declare them in one loop
    for option in ("--model_handler", "--model_path", "--src_lang", "--trg_lang"):
        parser.add_argument(option, type=str, required=True)
    return parser.parse_known_args()
22 |
23 |
def main():
    """Entry point: resolve the handler class from CLI args and serve it.

    Raises:
        ImportError: if --model_handler does not name an importable class.
    """
    args, _ = parse_args()
    ModelHandleClass = locate(args.model_handler)
    # BUG FIX: pydoc.locate() returns None (it does not raise) when the
    # dotted path cannot be resolved; without this check the next line dies
    # with an opaque "'NoneType' object is not callable".
    if ModelHandleClass is None:
        raise ImportError(
            f"Could not locate model handler class '{args.model_handler}'"
        )
    model_handler = ModelHandleClass(args.model_path, args.src_lang, args.trg_lang)
    app = create_app(model_handler)
    app.run()


if __name__ == "__main__":
    main()
34 |
--------------------------------------------------------------------------------
/src/server/core/models/language.py:
--------------------------------------------------------------------------------
1 | from enum import unique
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 |
5 | from core.extensions import db
6 |
class Language(db.Model):
    """A supported translation direction, keyed by a "source-target-domain"
    string (e.g. "en-sw-news")."""
    __tablename__ = 'language'
    # id = db.Column(db.Integer, primary_key=True)
    # "source-target-domain" string used directly as the primary key
    src_tgt_dmn = db.Column(db.String(50), primary_key=True)
    # human-readable variant of the key -- presumably a display label;
    # verify against callers
    source_target_domain = db.Column(db.String(50), nullable=True)

    created_at = db.Column(db.DateTime(), nullable=False,
                           server_default=db.func.now())
    update_at = db.Column(db.DateTime(), nullable=False,
                          server_default=db.func.now(), onupdate=db.func.now())

    def __init__(self, src_tgt_dmn, source_target_domain=""):
        super().__init__()
        self.src_tgt_dmn = src_tgt_dmn
        self.source_target_domain = source_target_domain

    def save(self):
        """Add this row to the session and commit."""
        db.session.add(self)
        db.session.commit()

    def to_json(self):
        """Serialise to the API response shape.

        BUG FIX: split with maxsplit=2 so a domain that itself contains a
        hyphen (e.g. "en-sw-rel-news") no longer raises ValueError on
        unpacking; everything after the second hyphen becomes the domain.
        """
        source, target, domain = self.src_tgt_dmn.split('-', 2)
        return {
            'source': source,
            'target': target,
            'src-tgt_domn' : self.source_target_domain,
            'domain': domain
        }
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 |
3 | services:
4 |
5 | server:
6 | # container_name: flask-api
7 | build :
8 | context: ./src/server
9 | dockerfile: Dockerfile
10 | command: python manage.py run -h 0.0.0.0
11 | volumes:
12 | - './src/server:/usr/src/app'
13 | - './models/joeynmt:/usr/src/app/models/joeynmt'
14 | ports:
15 | - 5000:5000
16 |
17 | env_file:
18 | - ./src/server/.env.dev
19 |
20 | depends_on:
21 | - db
22 |
23 | db:
24 | image: postgres:12-alpine
25 | volumes:
26 | - postgres_data:/var/lib/postgresql/data/
27 | environment:
28 | - POSTGRES_USER=masakhane
29 | - POSTGRES_PASSWORD=masakhane
30 | - POSTGRES_DB=masakhane
31 |
32 | client:
33 | build :
34 | context: ./src/client
35 | dockerfile: Dockerfile
36 | # command: curl --location --request GET 'http://0.0.0.0:5000/update' --data-raw ''
37 | volumes:
38 | - './src/client:/usr/src/app'
39 | ports:
40 | - 3000:3000
41 |
42 | depends_on:
43 | - server
44 |
45 | # To persist the data beyond the life of the container
46 | # we configured a volume. This config will bind
47 | # postgres_data to the "/var/lib/postgresql/data/" directory in the container.
48 | volumes:
49 | postgres_data:
50 |
--------------------------------------------------------------------------------
/src/client/public/bundle.js.LICENSE.txt:
--------------------------------------------------------------------------------
1 | /*
2 | object-assign
3 | (c) Sindre Sorhus
4 | @license MIT
5 | */
6 |
7 | /*!
8 | Copyright (c) 2017 Jed Watson.
9 | Licensed under the MIT License (MIT), see
10 | http://jedwatson.github.io/classnames
11 | */
12 |
13 | /** @license React v0.20.1
14 | * scheduler.production.min.js
15 | *
16 | * Copyright (c) Facebook, Inc. and its affiliates.
17 | *
18 | * This source code is licensed under the MIT license found in the
19 | * LICENSE file in the root directory of this source tree.
20 | */
21 |
22 | /** @license React v16.13.1
23 | * react-is.production.min.js
24 | *
25 | * Copyright (c) Facebook, Inc. and its affiliates.
26 | *
27 | * This source code is licensed under the MIT license found in the
28 | * LICENSE file in the root directory of this source tree.
29 | */
30 |
31 | /** @license React v17.0.1
32 | * react-dom.production.min.js
33 | *
34 | * Copyright (c) Facebook, Inc. and its affiliates.
35 | *
36 | * This source code is licensed under the MIT license found in the
37 | * LICENSE file in the root directory of this source tree.
38 | */
39 |
40 | /** @license React v17.0.1
41 | * react.production.min.js
42 | *
43 | * Copyright (c) Facebook, Inc. and its affiliates.
44 | *
45 | * This source code is licensed under the MIT license found in the
46 | * LICENSE file in the root directory of this source tree.
47 | */
48 |
--------------------------------------------------------------------------------
/src/m_to_m_models/model_handlers.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer
2 | from optimum.onnxruntime import ORTModelForSeq2SeqLM
3 | from optimum.pipelines import pipeline
4 | from pathlib import Path
5 |
6 |
class OptimizedM100Model:
    """Seldon-style wrapper around an ONNX-optimised M2M100 translation model.

    Loads the exported model plus its tokenizer from ``model_path`` and
    builds a translation pipeline for a fixed (src_lang, tgt_lang) pair.
    """

    def __init__(self, model_path, src_lang, tgt_lang):
        model_path = Path(model_path)
        assert model_path.exists(), "Model path does not exist"
        print("start loading the model........")
        self._model = ORTModelForSeq2SeqLM.from_pretrained(model_path)
        print("Model loaded successfully!")
        self._tokenizer = AutoTokenizer.from_pretrained(model_path)
        print("Tokenizer loaded successfully")
        self.pipeline = pipeline(f"translation_{src_lang}_to_{tgt_lang}", model=self._model, tokenizer=self._tokenizer)
        print("Pipeline created successfully")

    def predict_raw(self, X):
        """Translate the text under ``X["data"]``.

        :param X: request payload dict; must carry the text under "data".
        :return: the pipeline output (list of {"translation_text": ...} dicts).
        :raises ValueError: if the payload has no "data" key.
        """
        data_to_translate = X.get("data")
        # BUG FIX: previously a missing "data" key passed None into the
        # pipeline, producing an opaque downstream error; fail fast instead.
        if data_to_translate is None:
            raise ValueError('payload must contain a "data" key with the text to translate')
        output = self.pipeline(data_to_translate)
        return output

    def health_status(self):
        """Run a fixed English sentence through the pipeline as a liveness probe."""
        text_to_translate = {"data": "Hello, my name is Espoir Murhabazi, I am a Software Engineer from Congo DRC but living in UK"}
        translation = self.predict_raw(text_to_translate)
        assert len(translation) == 1, "health check returning bad translation"
        assert translation[0].get("translation_text") is not None, "health check returning bad translation"
        return translation[0].get("translation_text")
30 |
--------------------------------------------------------------------------------
/src/server/core/models/feedback.py:
--------------------------------------------------------------------------------
1 | from enum import unique
2 |
3 | from flask_sqlalchemy import SQLAlchemy
4 |
5 | from core.extensions import db
6 |
class Feedback(db.Model):
    """User feedback on a single translation, as submitted by the client's
    multi-step feedback form."""
    __tablename__ = 'feedback'
    id = db.Column(db.Integer, primary_key=True)

    # the language pair the translation was produced for
    src_lang = db.Column(db.String(20), nullable=False)
    tgt_lang = db.Column(db.String(20), nullable=False)
    # user's rating of how accurate the translation is (e.g. "nonsense")
    accurate_translation = db.Column(db.String(800), nullable=False)
    # how well the user knows each language (e.g. "little")
    know_src_lang = db.Column(db.String(50), nullable=False)
    know_tgt_lang = db.Column(db.String(50), nullable=False)
    # optional corrected translation supplied by the user
    own_translation = db.Column(db.String(800), nullable=True)
    # the model's translation and the original input text it was produced from
    translation = db.Column(db.String(800), nullable=False)
    text = db.Column(db.String(800), nullable=False)
    understand_translation = db.Column(db.String(50), nullable=False)
    # client-generated token tying feedback entries together -- presumably a
    # per-session UUID; verify against the client form code
    feedbackToken = db.Column(db.String(100), nullable=False)


    created_at = db.Column(db.DateTime(), nullable=False,\
        server_default=db.func.now())
    update_at = db.Column(db.DateTime(), nullable=False,\
        server_default=db.func.now(), onupdate=db.func.now())

    # TODO We need to decide how we deal with duplicate on the review saving
    # __table_args__ = (
    #     # this can be db.PrimaryKeyConstraint if you want it to be a primary key
    #     db.UniqueConstraint('input', 'review', 'stars'),)


    def save(self):
        """Insert this feedback row and commit the session."""
        db.session.add(self)
        db.session.commit()
37 |
--------------------------------------------------------------------------------
/src/server/core/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from flask import current_app
5 | from flask_testing import TestCase
6 |
7 | from core import masakhane
8 |
9 |
class TestDevelopmentConfig(TestCase):
    """Verifies the app picks up DevelopmentConfig values."""

    def create_app(self):
        masakhane.config.from_object('core.config.DevelopmentConfig')
        return masakhane

    def test_app_is_development(self):
        # IDIOM FIX: assertEqual/assertIsNotNone report the mismatched values
        # on failure, unlike assertTrue(a == b) which just says "False".
        self.assertEqual(masakhane.config['SECRET_KEY'], "super-secret-key")
        self.assertIsNotNone(current_app)
        self.assertEqual(
            masakhane.config['SQLALCHEMY_DATABASE_URI'],
            os.getenv('DATABASE_TEST_URL', "sqlite:///masakhane.db")
        )
22 |
class TestTestingConfig(TestCase):
    """Verifies the app picks up StagingConfig (used as the testing config)."""

    def create_app(self):
        masakhane.config.from_object('core.config.StagingConfig')
        return masakhane

    def test_app_is_testing(self):
        # IDIOM FIX: assertEqual reports both values on failure, unlike
        # assertTrue(a == b).
        self.assertEqual(masakhane.config['SECRET_KEY'], "key_testing")
        self.assertTrue(masakhane.config['TESTING'])
        self.assertEqual(
            masakhane.config['SQLALCHEMY_DATABASE_URI'],
            os.getenv('DATABASE_TEST_URL', "sqlite:///masakhane.db")
        )
35 |
class TestProductionConfig(TestCase):
    """Verifies the app picks up ProductionConfig values."""

    def create_app(self):
        masakhane.config.from_object('core.config.ProductionConfig')
        return masakhane

    def test_app_is_production(self):
        # IDIOM FIX: assertEqual reports both values on failure, unlike
        # assertTrue(a == b).
        self.assertEqual(masakhane.config['SECRET_KEY'], "key_production")
        self.assertFalse(masakhane.config['TESTING'])

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/src/m_to_m_models/Dockerfile:
--------------------------------------------------------------------------------
# Multi-stage build: "base" pins the interpreter and env vars, "python-deps"
# builds an isolated virtualenv, "runtime" copies only that venv plus the
# handler code into a non-root image for seldon-core.
FROM python:3.10 as base
LABEL maintainer="Espoir Murhabazi"


# Never prompt the user for choices on installation/configuration of packages
ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONUNBUFFERED=1 \
PORT=9000 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100


# build stage: compile/install python dependencies into /opt/venv
FROM base AS python-deps
RUN apt-get update \
&& apt-get install --no-install-recommends -y \
curl \
build-essential\
software-properties-common

RUN python -m venv /opt/venv
# Make sure we use the virtualenv:
ENV PATH="/opt/venv/bin:$PATH"

# Install pip
COPY requirements.txt ./
RUN pip install --upgrade pip
RUN pip install -r requirements.txt



FROM base AS runtime
# copy the prebuilt virtualenv from the python-deps build stage
COPY --from=python-deps /opt/venv /opt/venv


RUN useradd --create-home masakhane
RUN usermod -aG sudo masakhane
RUN mkdir /home/masakhane/translation_app/
ENV WORKING_DIR=/home/masakhane/translation_app/
ENV PATH="${WORKING_DIR}:$PATH"
ENV PATH="/opt/venv/bin:$PATH"
# NOTE(review): putting /opt/venv/bin on PYTHONPATH is unusual -- PYTHONPATH
# normally points at package directories, not bin/ -- confirm it is needed.
ENV PYTHONPATH="/opt/venv/bin:$PYTHONPATH"
ENV PYTHONPATH="${PYTHONPATH}:${WORKING_DIR}"

# seldon-core-microservice loads this dotted path as the model handler class
ENV MODEL_NAME model_handlers.OptimizedM100Model

ENV SERVICE_TYPE MODEL

COPY model_handlers.py ${WORKING_DIR}
WORKDIR ${WORKING_DIR}
RUN chown -R masakhane:masakhane ${WORKING_DIR}
# NOTE(review): chmod 777 grants world-write on the app dir; the chown above
# should already be sufficient -- consider tightening.
RUN chmod -R 777 ${WORKING_DIR}
USER masakhane
EXPOSE 9000 5000

CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE
59 |
--------------------------------------------------------------------------------
/src/client/src/components/multiStepForm.js:
--------------------------------------------------------------------------------
1 | import { useForm, useStep } from "react-hooks-helper";
2 | import React from 'react';
3 |
4 | import Terms from "./terms";
5 | import Step1 from "./step1";
6 | import Step2 from "./step2";
7 | import Step3 from "./step3";
8 |
// Ordered step ids for the feedback wizard (terms screen, then 3 form steps).
const steps = [
{ id: "terms" },
{ id: "step1" },
{ id: "step2" },
{ id: "step3" },
];

// Initial answers for the feedback form fields.
const defaultData = {
know_src_lang: "little",
know_tgt_lang: "little",
understand_translation: "none",
accurate_translation: "nonsense",
own_translation: ""
};

// Multi-step feedback wizard: tracks shared form state across the steps and
// submits the collected feedback via submitFeedBack.
// NOTE(review): the JSX return values in the switch below appear to have
// been stripped during extraction (bare `return ;`) -- recover the original
// step components (<Terms/>, <Step1/>, ...) from version control before
// editing this file.
const MultiStepForm = ({ src_lang, tgt_lang, text, translation, setShow, submitFeedBack, setFeedbackToken, feedbackToken}) => {
const [formData, setForm] = useForm({...defaultData, src_lang, tgt_lang, text, translation, feedbackToken});
const { step, navigation } = useStep({ initialStep: 0, steps });
const { id } = step;

const handleSubmitFeedback = () => {
console.log({formData});
// set formData to be feedback form
submitFeedBack(formData);
}

const props = { src_lang, tgt_lang, text, translation, setShow, formData, setForm, navigation, handleSubmitFeedback, setFeedbackToken, feedbackToken};

switch (id) {
case "terms":
return ;
case "step1":
return ;
case "step2":
return ;
case "step3":
return ;

default:
return null;
}
}

export default MultiStepForm;
53 |
--------------------------------------------------------------------------------
/src/server/core/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | # external imports
3 | from flask import Flask
4 | from flask_migrate import Migrate
5 | from flask_restful import Api
6 | from flask_cors import CORS
7 | # internal imports
8 | from core.resources.translate import TranslateResource, AddResource, SaveResource, HomeResource
9 | from core.extensions import db
10 | from core.config import Config, DevelopmentConfig, ProductionConfig, StagingConfig
11 |
12 |
#application factory
def create_app(saved_models):
    """Flask application factory to config and init app.

    :param saved_models: dict of loaded translation models, shared with the
        translate/update resources.
    :return: the configured :class:`~flask.Flask` application.
    """
    # pick the config class from ENV; anything unrecognised (or unset)
    # falls back to Development, matching the original if/elif chain
    config_by_env = {
        'Production': ProductionConfig,
        'Staging': StagingConfig,
        'Development': DevelopmentConfig,
    }
    env = os.environ.get('ENV', 'Development')
    config_str = config_by_env.get(env, DevelopmentConfig)()

    app = Flask(__name__)
    CORS(app)
    app.config.from_object(config_str)
    # database init
    register_extensions(app)
    # api init
    register_resources(app, saved_models)

    return app
33 |
34 |
def register_extensions(app):
    """Bind the database and the migration engine to *app*.

    IDIOM FIX: the Migrate instance was previously bound to an unused local;
    constructing it is enough to register the extension on the app.
    """
    db.init_app(app)
    Migrate(app, db)
38 |
39 |
def register_resources(app, saved_models):
    """Attach the REST endpoints to *app*; translate/update share the
    *saved_models* dict so models added at runtime are served immediately."""
    api = Api(app)
    api.add_resource(HomeResource, '/')
    api.add_resource(TranslateResource, '/translate', resource_class_kwargs={'saved_models': saved_models})
    # TODO need to find a better way to update the current app information without exposing it to the public
    api.add_resource(AddResource, '/update', resource_class_kwargs={'saved_models': saved_models})
    api.add_resource(SaveResource, '/save')
47 |
48 |
# Build the app at import time with an initially empty model registry.
# The resources receive this same dict (see register_resources), and it is
# also exposed as `masakhane.models` for code that manipulates it directly.
models = {}
masakhane = create_app(models)
masakhane.models = models
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | # DotEnv configuration
60 | .env
61 |
62 | # Database
63 | *.db
64 | *.rdb
65 |
66 | # Pycharm
67 | .idea
68 |
69 | # VS Code
70 | .vscode/
71 |
72 | # Spyder
73 | .spyproject/
74 |
75 | # Jupyter NB Checkpoints
76 | .ipynb_checkpoints/
77 |
78 | # exclude data from source control by default
79 | /data/
80 |
81 | # Mac OS-specific storage files
82 | .DS_Store
83 |
84 | # vim
85 | *.swp
86 | *.swo
87 |
88 | # Mypy cache
89 | .mypy_cache/
90 |
91 | src/back-end/joeynmt/models/
92 | models/joeynmt/
93 |
94 | # node modules
95 | node_modules/
96 |
97 | #cache
98 | .eslintcache
99 |
100 | src/server/models/joeynmt
101 | src/server/core/models/joeynmt
102 |
103 | .env.prod
104 |
105 | *.sqlite
106 |
107 | ### ignore model export
108 |
109 | onnx/
110 | *.onnx
111 | model_store/
112 | logs/
113 |
--------------------------------------------------------------------------------
/src/client/src/pages/Faq.js:
--------------------------------------------------------------------------------
1 | import { Container, Card } from 'react-bootstrap'
2 | import React from 'react';
3 |
// FAQ page: a single question/answer card explaining the research nature of
// the service and how user feedback is used.
// NOTE(review): the JSX markup appears to have been stripped during
// extraction (the return block contains bare text with no tags) -- recover
// the original element tree from version control before editing.
export default function FAQPage() {
return(




FAQ
{/* Enter subtitle here */}


1. I was not happy with the translation I got from the service.




Thank you for trying this service. The Masakhane NLP Translation project built the models used to do the translation.
This website provides a way for us to be able to test how well these models work. This service is still a work in progress and we expect the models to be improved every few months as we get more feedback from users such as yourself.
Please do provide feedback by writing where there is a mistake in the translation so we can provide this information to the researchers.
As such, this service is not a production system (should not be used for official translations).






)
}
--------------------------------------------------------------------------------
/src/server/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.11.0
2 | alembic==1.5.4
3 | aniso8601==8.1.1
4 | astroid==2.4.2
5 | backcall==0.2.0
6 | cachetools==4.2.1
7 | chardet==4.0.0
8 | click==7.1.2
9 | cycler==0.10.0
10 | decorator==4.4.2
11 | Flask==1.1.2
12 | Flask-Cors==3.0.10
13 | Flask-Migrate==2.6.0
14 | Flask-RESTful==0.3.8
15 | Flask-SQLAlchemy==2.4.4
16 | future==0.18.2
17 | google-auth==1.26.1
18 | google-auth-oauthlib==0.4.2
19 | grpcio==1.35.0
20 | gdown==4.6.0
21 | idna==2.10
22 | importlib-metadata==3.4.0
23 | ipdb==0.13.4
24 | ipython==7.16.1
25 | ipython-genutils==0.2.0
26 | isort==5.7.0
27 | itsdangerous==1.1.0
28 | jedi==0.18.0
29 | Jinja2==2.11.3
30 | joeynmt==1.2
31 | kiwisolver==1.3.1
32 | lazy-object-proxy==1.4.3
33 | Mako==1.1.4
34 | Markdown==3.3.3
35 | MarkupSafe==1.1.1
36 | matplotlib==3.3.4
37 | mccabe==0.6.1
38 | Morfessor==2.0.6
39 | numpy==1.18.5
40 | oauthlib==3.1.0
41 | pandas==1.1.5
42 | parso==0.8.1
43 | pexpect==4.8.0
44 | pickleshare==0.7.5
45 | Pillow==8.1.0
46 | polyglot==16.7.4
47 | portalocker==2.2.1
48 | prompt-toolkit==3.0.16
49 | protobuf==3.14.0
50 | psycopg2-binary==2.8.6
51 | ptyprocess==0.7.0
52 | pyasn1==0.4.8
53 | pyasn1-modules==0.2.8
54 | pycld2==0.41
55 | pyglot==0.1.1
56 | Pygments==2.7.4
57 | PyICU==2.6
58 | pylint==2.6.0
59 | pyparsing==2.4.7
60 | python-dateutil==2.8.1
61 | python-editor==1.0.4
62 | pytz==2021.1
63 | PyYAML==5.4.1
64 | requests==2.25.1
65 | requests-oauthlib==1.3.0
66 | rsa==4.7
67 | sacrebleu==1.5.0
68 | scipy==1.5.4
69 | seaborn==0.11.1
70 | simplejson==3.17.2
71 | six==1.12.0
72 | SQLAlchemy==1.3.23
73 | subword-nmt==0.3.7
74 | tensorboard==2.4.1
75 | tensorboard-plugin-wit==1.8.0
76 | toml==0.10.2
77 | torch==1.7.1
78 | tqdm==4.56.2
79 | traitlets==4.3.3
80 | typed-ast==1.4.2
81 | typing-extensions==3.7.4.3
82 | urllib3==1.26.3
83 | wcwidth==0.2.5
84 | Werkzeug==0.16.1
85 | wrapt==1.11.1
86 | zipp==3.4.0
87 | sacremoses==0.0.43
88 | # https://gunicorn.org/#deployment
89 | gunicorn==20.0.4
90 | Flask-Testing==0.6.2
--------------------------------------------------------------------------------
/src/m_to_m_models/kubernetes/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: machinelearning.seldon.io/v1
2 | kind: SeldonDeployment
3 | metadata:
4 | name: translation-deployment
5 | namespace: masakhane
6 | spec:
7 | name: translation-worker
8 | predictors:
9 | - componentSpecs:
10 | - spec:
11 | containers:
12 | - image: masakhane/translation:alpha
13 | name: translation-container
14 | imagePullPolicy: IfNotPresent
15 | env:
16 | - name: TRANSFORMERS_CACHE
17 | value: "/models_datastore/.cache"
18 | - name: GUNICORN_WORKERS
19 | value: '1'
20 | - name: GRPC_WORKERS
21 | value: '0'
22 | - name: SELDON_LOG_LEVEL
23 | value: DEBUG
24 | - name: SELDON_DEBUG
25 | value: 'True'
26 | - name: FLASK_DEBUG
27 | value: 'True'
28 | volumeMounts:
29 | - mountPath: "/models_datastore/" # mount the cache volume here
30 | name: translation-volume-storage
31 | resources:
32 | requests:
33 | memory: 8Gi
34 | cpu: 3
35 | limits:
36 | memory: 9Gi
37 | cpu: 4
38 | terminationGracePeriodSeconds: 1
39 | volumes:
40 | - name: translation-volume-storage
41 | persistentVolumeClaim:
42 | claimName: masakhane-model-cache-volume-claim
43 | graph:
44 | envSecretRefName: masakhane-container-secret
45 | children: []
46 | endpoint:
47 | type: REST
48 | name: translation-container
49 | type: MODEL
50 | parameters:
51 | - name: model_path
52 | type: STRING
53 | value: "/models_datastore/" # this should come form volume.
54 | - name: src_lang
55 | type: STRING
56 | value: "en"
57 | - name: tgt_lang
58 | type: STRING
59 | value: "sw"
60 | labels:
61 | version: v1
62 | name: translation-predictor
63 | replicas: 1
64 |
--------------------------------------------------------------------------------
/src/server/core/utils.py:
--------------------------------------------------------------------------------
1 | from torchtext import data
2 | from torchtext.datasets import TranslationDataset
3 |
4 |
5 | from joeynmt.constants import UNK_TOKEN, EOS_TOKEN, BOS_TOKEN, PAD_TOKEN
6 |
7 |
class MonoLineDataset(TranslationDataset):
    # Dataset wrapping a single source-side line (no target), used to feed
    # one sentence through translation.
    def __init__(self, line, field, **kwargs):
        """Build a one-example dataset from *line*, with *field* as the src field."""
        examples = []
        line = line.strip()
        fields = [('src', field)]
        examples.append(data.Example.fromlist([line], fields))
        # Deliberately skips TranslationDataset.__init__ (which expects
        # parallel corpus files on disk) and calls its parent Dataset
        # constructor directly with the in-memory example.
        super(TranslationDataset, self).__init__(examples, fields, **kwargs)
15 |
16 |
def load_line_as_data(line, level, lowercase, src_vocab, trg_vocab):
    """
    Create a data set from one line.
    Workaround for the usual torchtext data handling.

    :param line: The input line to process.
    :param level: "char", "bpe" or "word". Determines segmentation of the input.
    :param lowercase: If True, lowercases inputs and outputs.
    :param src_vocab: Path to source vocabulary.
    :param trg_vocab: Path to target vocabulary.
    :return: (single-line dataset, src_vocab, trg_vocab)
    """
    # char level splits into single characters; bpe/word input is assumed
    # pre-tokenized, so whitespace splitting suffices
    if level == "char":
        def tok_fun(sentence):
            return list(sentence)
    else:
        def tok_fun(sentence):
            return sentence.split()

    src_field = data.Field(
        init_token=None,
        eos_token=EOS_TOKEN,
        pad_token=PAD_TOKEN,
        tokenize=tok_fun,
        batch_first=True,
        lower=lowercase,
        unk_token=UNK_TOKEN,
        include_lengths=True,
    )
    trg_field = data.Field(
        init_token=BOS_TOKEN,
        eos_token=EOS_TOKEN,
        pad_token=PAD_TOKEN,
        tokenize=tok_fun,
        unk_token=UNK_TOKEN,
        batch_first=True,
        lower=lowercase,
        include_lengths=True,
    )

    test_data = MonoLineDataset(line=line, field=src_field)
    src_field.vocab = src_vocab
    trg_field.vocab = trg_vocab
    return test_data, src_vocab, trg_vocab
--------------------------------------------------------------------------------
/src/client/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "masakhane",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "@babel/core": "^7.20.5",
7 | "@babel/preset-env": "^7.20.2",
8 | "@babel/preset-react": "^7.18.6",
9 | "@babel/runtime": "^7.13.17",
10 | "@testing-library/jest-dom": "^5.11.9",
11 | "@testing-library/react": "^11.1.0",
12 | "@testing-library/user-event": "^12.1.10",
13 | "babel-loader": "^8.3.0",
14 | "bootstrap": "^4.6.0",
15 | "core-js": "^3.11.0",
16 | "file-loader": "^6.2.0",
17 | "http-proxy-middleware": "^1.3.1",
18 | "react": "^17.0.1",
19 | "react-bootstrap": "^1.4.3",
20 | "react-copy-to-clipboard": "^5.0.3",
21 | "react-dom": "^17.0.1",
22 | "react-gtm-module": "^2.0.11",
23 | "react-hooks-helper": "^1.6.0",
24 | "react-router-dom": "^5.2.0",
25 | "react-scripts": "4.0.1",
26 | "regenerator-runtime": "^0.13.7",
27 | "url-loader": "^4.1.1",
28 | "uuid": "^8.3.2",
29 | "web-vitals": "^0.2.4",
30 | "webpack": "^5.75.0",
31 | "webpack-cli": "^3.3.12",
32 | "webpack-config-utils": "^2.3.1"
33 | },
34 | "scripts": {
35 | "develop": "webpack-dev-server --host 0.0.0.0",
36 | "start-api": "cd ../server && python app.py",
37 | "build": "react-scripts build",
38 | "test": "react-scripts test",
39 | "eject": "react-scripts eject"
40 | },
41 | "eslintConfig": {
42 | "plugins": [
43 | "testing-library"
44 | ],
45 | "rules": {
46 | "testing-library/await-async-query": "error",
47 | "testing-library/no-await-sync-query": "error",
48 | "testing-library/no-debug": "warn"
49 | }
50 | },
51 | "browserslist": {
52 | "production": [
53 | ">0.2%",
54 | "not dead",
55 | "not op_mini all"
56 | ],
57 | "development": [
58 | "last 1 chrome version",
59 | "last 1 firefox version",
60 | "last 1 safari version"
61 | ]
62 | },
63 | "devDependencies": {
64 | "eslint": "^7.18.0",
65 | "eslint-plugin-testing-library": "^3.10.1",
66 | "webpack-dev-server": "^3.11.2"
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/client/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require('path')
2 |
3 | module.exports = {
4 | entry: path.resolve(__dirname, 'src', 'index.js'),
5 | output: {
6 | path: path.resolve(__dirname, 'public'),
7 | filename: 'bundle.js',
8 | publicPath: '/'
9 | },
10 | devServer: {
11 | contentBase: path.resolve(__dirname, 'public'),
12 | open: true,
13 | clientLogLevel: 'silent',
14 | host: '0.0.0.0',
15 | port: 3000,
16 | historyApiFallback: true,
17 | compress: true,
18 | public: 'translate.masakhane.io:80',
19 | // proxy: {
20 | // '/': {
21 | // target: 'http://localhost:5000',
22 | // pathRewrite: { '^/api': '' },
23 | // },
24 | // "changeOrigin":true
25 | // }
26 | proxy: {
27 | '/': {
28 | // target: 'http://[::1]:5000',
29 | // todo: make the ip a configuration environment variable
30 | target: 'http://45.147.99.147:5000',
31 | // target: 'http://127.0.0.1:5000',
32 | bypass: function (req, res, proxyOptions) {
33 | if (req.headers.accept.indexOf('html') !== -1) {
34 | console.log('Skipping proxy for browser request.');
35 | return '/index.html';
36 | }
37 | },
38 | },
39 | },
40 | },
41 | module: {
42 | rules: [
43 | {
44 | test: /\.(jsx|js)$/,
45 | include: path.resolve(__dirname, 'src'),
46 | exclude: /node_modules/,
47 | use: [{
48 | loader: 'babel-loader',
49 | options: {
50 | presets: [
51 | ['@babel/preset-env', {
52 | "targets": "defaults"
53 | }],
54 | '@babel/preset-react'
55 | ]
56 | }
57 | }]
58 | },
59 | {
60 | test: /\.(jpg|png|svg)$/,
61 | include: path.resolve(__dirname, 'src'),
62 | exclude: /node_modules/,
63 | loader: 'url-loader',
64 | options: {
65 | limit: 25000,
66 | performance: {
67 | hints: false,
68 | maxEntrypointSize: 512000,
69 | maxAssetSize: 512000
70 | }
71 | },
72 |
73 | }
74 |
75 | ]
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/server/Dockerfile.prod:
--------------------------------------------------------------------------------
1 | ###########
2 | # BUILDER #
3 | ###########
4 |
5 | # pull official base image
6 | FROM python:3.6.9 as builder
7 |
8 |
9 | # set working directory
10 | WORKDIR /usr/src/app
11 |
12 |
13 | # set environment variables
14 | ENV PYTHONDONTWRITEBYTECODE 1
15 | ENV PYTHONUNBUFFERED 1
16 |
17 | # install system dependencies
18 | RUN apt-get update && apt-get install -y netcat && \
19 | apt-get install -y --no-install-recommends gcc
20 |
21 |
22 | RUN apt-get update
23 | RUN apt-get install -y gnupg lsb-release wget
24 |
25 | RUN lsb_release -c -s > /tmp/lsb_release
26 | RUN GCSFUSE_REPO=$(cat /tmp/lsb_release); echo "deb http://packages.cloud.google.com/apt gcsfuse-$GCSFUSE_REPO main" | tee /etc/apt/sources.list.d/gcsfuse.list
27 | RUN wget -O - https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
28 |
29 | RUN apt-get update
30 | RUN apt-get install -y gcsfuse
31 |
32 |
33 | # lint
34 | RUN pip install --upgrade pip
35 | RUN pip install flake8
36 | RUN flake8 --ignore=E501,F401 .
37 |
38 | # add and install requirements
39 | COPY ./requirements.txt /usr/src/app/requirements.txt
40 | # RUN pip install -r requirements.txt
41 | RUN pip wheel --no-cache-dir --no-deps --wheel-dir /usr/src/app/wheels -r requirements.txt
42 |
43 |
44 | #########
45 | # FINAL #
46 | #########
47 |
48 | FROM python:3.6.9
49 |
50 | # create directory for the app user
51 | RUN mkdir -p /home/app
52 |
53 | # create the app user
54 | RUN addgroup --system app && adduser --system --group app
55 |
56 | # create the appropriate directories
57 | ENV HOME=/home/app
58 | ENV APP_HOME=/home/app/web
59 | RUN mkdir $APP_HOME
60 | WORKDIR $APP_HOME
61 |
62 | # install dependencies
63 | RUN apt-get update && apt-get install -y --no-install-recommends netcat
64 | COPY --from=builder /usr/src/app/wheels /wheels
65 | COPY --from=builder /usr/src/app/requirements.txt .
66 | RUN pip install --upgrade pip
67 | RUN pip install --no-cache /wheels/*
68 |
69 | # copy entrypoint-prod.sh
70 | COPY ./entrypoint.prod.sh $APP_HOME
71 |
72 |
73 | # copy project
74 | COPY . $APP_HOME
75 |
76 | # chown all the files to the app user
77 | RUN chown -R app:app $APP_HOME
78 |
79 | # change to the app user
80 | USER app
81 |
82 | # run entrypoint.prod.sh
83 | ENTRYPOINT ["/home/app/web/entrypoint.prod.sh"]
--------------------------------------------------------------------------------
/docs/start_app_prod_doc.md:
--------------------------------------------------------------------------------
1 | # **Running the App In Production**
2 | To run the app locally, see [here](start_app_locally_doc.md)
3 |
4 | ## **Table of Contents**
5 | - [**Docker Setup**](#docker-setup)
6 | - [**Running the app**](#running-the-app)
7 | - [**Building the App**](#building-the-app)
8 | - [**Shut down the app**](#shut-down-the-app)
9 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages)
10 | - [**Running tests**](#running-tests)
11 |
12 |
13 | ## **Docker Setup**
14 |
15 | Ensure you have `docker` & `docker-compose` installed on your computer, you can check with the following commands:
16 | ```bash
17 | docker --version
18 | docker-compose --version
19 | ```
20 |
21 | If the above commands return an error, please install [Docker](https://docs.docker.com/engine/install/) and [Docker-compose](https://docs.docker.com/compose/install/).
22 |
23 | ## **Running the app**
24 | ### **Building the App**
25 | To build the app, from the root project directory, run the following command:
26 | ```bash
27 | docker-compose -f docker-compose.prod.yml up -d --build
28 | ```
29 |
30 | ### **Shut down the app**
31 | To shut down the app, run the following command to remove the docker container:
32 | ```bash
33 | docker-compose -f docker-compose.prod.yml down
34 | ```
35 |
36 | ### **Add, Update, & Delete Languages**
37 | **Add a Language**
38 | ```bash
39 | docker-compose -f docker-compose.prod.yml exec api python manage.py add_language en-sw-JW300
40 | ```
41 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform}
42 | So `en-sw-JW300` represents English-Swahili using JW300 shortform
43 | **Note** - A code parameter example without shortform is `en-tiv-`
44 |
45 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv)
46 |
47 | **Update Languages**
48 | ```bash
49 | curl --request GET 'http://127.0.0.1:5000/update'
50 | ```
51 |
52 | **Check available languages**
53 | ```bash
54 | docker-compose -f docker-compose.prod.yml exec api python manage.py all_languages
55 | ```
56 |
57 | **Remove a language**
58 | ```bash
59 | docker-compose -f docker-compose.prod.yml exec api python manage.py remove_language en-sw-JW300
60 | ```
61 |
62 | ### **Running tests**
63 | ```bash
64 | docker-compose -f docker-compose.prod.yml exec api python manage.py tests
65 | ```
--------------------------------------------------------------------------------
/src/server/core/tests/test_app.py:
--------------------------------------------------------------------------------
1 | # test_hello.py
2 | # from app import create_app
3 | from flask import json, jsonify
4 |
5 | import os
6 | import unittest
7 |
8 | from flask import current_app
9 | from flask_testing import TestCase
10 | from core import masakhane, load_model, create_app
11 |
12 | # from core import masakhane
13 | from core.tests.base import BaseTestCase
14 |
class TestAppService(BaseTestCase):
    """Integration tests for the core Flask application endpoints."""

    def test_home_page(self):
        """Test that GET / returns HTTP 200 and the welcome message."""
        app = masakhane
        response = app.test_client().get('/')

        data = response.get_json()

        assert response.status_code == 200

        assert data['message'] == "welcome Masakhane Web"

# TODO We will need to have a dump database to check this
#     def test_translation(self):
#         app = masakhane
#         response = app.test_client().post(
#             '/translate',
#             data = json.dumps({
#                 "src_lang":"English",
#                 "tgt_lang":"swahili",
#                 "input":"My name is Salomon"
#             }),
#             content_type='application/json',
#         )

#         data = response.get_json()

#         # assert response.status_code == 201 # created

#         # Givent that we can't know exactly the output of the translation
#         # we can test that some result are return
#         print(data)
#         assert data['output'] != ""

# def test_save():
#     """
#     Test the save endpoint by checking the status code
#     and the responce message.
#     """
#     app = create_app()
#     response = app.test_client().post(
#             '/save',
#             data = json.dumps({
#                 "src_lang":"en",
#                 "tgt_lang":"sw",
#                 "input":"How are you doing today ?",
#                 "review":"Test Saving",
#                 "stars":"5",
#                 "token":"ww2wki&idjj11yyy"}),
#             content_type='application/json',
#         )


#     assert response.status_code == 201

#     assert b"Review saved" in response.data
72 |
# Allow running this test module directly with `python test_app.py`.
if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/src/client/src/logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements-python3.10.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.4
2 | aiosignal==1.3.1
3 | appnope==0.1.3
4 | asttokens==2.2.1
5 | async-timeout==4.0.2
6 | attrs==22.2.0
7 | backcall==0.2.0
8 | captum==0.6.0
9 | certifi==2022.12.7
10 | cffi==1.15.1
11 | charset-normalizer==3.0.1
12 | click==8.0.4
13 | coloredlogs==15.0.1
14 | comm==0.1.2
15 | contourpy==1.0.7
16 | cryptography==3.4.8
17 | cycler==0.11.0
18 | datasets==2.10.0
19 | debugpy==1.6.6
20 | decorator==5.1.1
21 | dill==0.3.6
22 | enum-compat==0.0.3
23 | executing==1.2.0
24 | filelock==3.9.0
25 | Flask==2.2.3
26 | Flask-Cors==3.0.10
27 | Flask-OpenTracing==1.1.0
28 | flatbuffers==1.12
29 | fonttools==4.38.0
30 | frozenlist==1.3.3
31 | fsspec==2023.1.0
32 | grpcio==1.51.3
33 | grpcio-opentracing==1.1.4
34 | grpcio-reflection==1.34.1
35 | gunicorn==20.1.0
36 | huggingface-hub==0.12.1
37 | humanfriendly==10.0
38 | idna==3.4
39 | ipykernel==6.21.2
40 | ipython==8.10.0
41 | itsdangerous==2.1.2
42 | jaeger-client==4.4.0
43 | jedi==0.18.2
44 | Jinja2==3.1.2
45 | jsonschema==3.2.0
46 | jupyter_client==8.0.3
47 | jupyter_core==5.2.0
48 | kiwisolver==1.4.4
49 | MarkupSafe==2.1.2
50 | matplotlib==3.7.0
51 | matplotlib-inline==0.1.6
52 | mpmath==1.2.1
53 | multidict==6.0.4
54 | multiprocess==0.70.14
55 | nest-asyncio==1.5.6
56 | numpy==1.23.5
57 | onnx==1.13.1
58 | onnxruntime==1.13.1
59 | onnxruntime-tools==1.7.0
60 | opentracing==2.4.0
61 | optimum==1.6.4
62 | ort-nightly==1.11.0.dev20220320001
63 | packaging==23.0
64 | pandas==1.5.3
65 | parso==0.8.3
66 | pexpect==4.8.0
67 | pickleshare==0.7.5
68 | Pillow==9.4.0
69 | platformdirs==3.0.0
70 | prometheus-client==0.8.0
71 | prompt-toolkit==3.0.37
72 | protobuf==3.20.3
73 | psutil==5.9.4
74 | ptyprocess==0.7.0
75 | pure-eval==0.2.2
76 | py-cpuinfo==9.0.0
77 | py3nvml==0.2.7
78 | pyarrow==11.0.0
79 | pycodestyle==2.10.0
80 | pycparser==2.21
81 | Pygments==2.14.0
82 | pyparsing==3.0.9
83 | pyrsistent==0.19.3
84 | python-dateutil==2.8.2
85 | pytz==2022.7.1
86 | PyYAML==5.4.1
87 | pyzmq==25.0.0
88 | regex==2022.10.31
89 | requests==2.28.2
90 | responses==0.18.0
91 | seldon-core==1.15.0
92 | sentencepiece==0.1.97
93 | six==1.16.0
94 | stack-data==0.6.2
95 | sympy==1.11.1
96 | threadloop==1.0.2
97 | thrift==0.16.0
98 | tokenizers==0.13.2
99 | torch==1.13.1
100 | torch-model-archiver==0.7.1
101 | torch-workflow-archiver==0.2.7
102 | torchserve==0.7.1
103 | tornado==6.2
104 | tqdm==4.64.1
105 | traitlets==5.9.0
106 | transformers==4.26.1
107 | typing_extensions==4.5.0
108 | urllib3==1.26.14
109 | wcwidth==0.2.6
110 | Werkzeug==2.2.3
111 | xmltodict==0.13.0
112 | xxhash==3.2.0
113 | yarl==1.8.2
114 |
--------------------------------------------------------------------------------
/src/m_to_m_models/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.8.4
2 | aiosignal==1.3.1
3 | appnope==0.1.3
4 | asttokens==2.2.1
5 | async-timeout==4.0.2
6 | attrs==22.2.0
7 | backcall==0.2.0
8 | captum==0.6.0
9 | certifi==2022.12.7
10 | cffi==1.15.1
11 | charset-normalizer==3.0.1
12 | click==8.0.4
13 | coloredlogs==15.0.1
14 | comm==0.1.2
15 | contourpy==1.0.7
16 | cryptography==3.4.8
17 | cycler==0.11.0
18 | datasets==2.10.0
19 | debugpy==1.6.6
20 | decorator==5.1.1
21 | dill==0.3.6
22 | enum-compat==0.0.3
23 | executing==1.2.0
24 | filelock==3.9.0
25 | Flask==2.2.3
26 | Flask-Cors==3.0.10
27 | Flask-OpenTracing==1.1.0
28 | flatbuffers==1.12
29 | fonttools==4.38.0
30 | frozenlist==1.3.3
31 | fsspec==2023.1.0
32 | grpcio==1.51.3
33 | grpcio-opentracing==1.1.4
34 | grpcio-reflection==1.34.1
35 | gunicorn==20.1.0
36 | huggingface-hub==0.12.1
37 | humanfriendly==10.0
38 | idna==3.4
39 | ipykernel==6.21.2
40 | ipython==8.10.0
41 | itsdangerous==2.1.2
42 | jaeger-client==4.4.0
43 | jedi==0.18.2
44 | Jinja2==3.1.2
45 | jsonschema==3.2.0
46 | jupyter_client==8.0.3
47 | jupyter_core==5.2.0
48 | kiwisolver==1.4.4
49 | MarkupSafe==2.1.2
50 | matplotlib==3.7.0
51 | matplotlib-inline==0.1.6
52 | mpmath==1.2.1
53 | multidict==6.0.4
54 | multiprocess==0.70.14
55 | nest-asyncio==1.5.6
56 | numpy==1.23.5
57 | onnx==1.13.1
58 | onnxruntime==1.13.1
59 | onnxruntime-tools==1.7.0
60 | opentracing==2.4.0
61 | optimum==1.6.4
62 | ort-nightly==1.11.0.dev20220320001
63 | packaging==23.0
64 | pandas==1.5.3
65 | parso==0.8.3
66 | pexpect==4.8.0
67 | pickleshare==0.7.5
68 | Pillow==9.4.0
69 | platformdirs==3.0.0
70 | prometheus-client==0.8.0
71 | prompt-toolkit==3.0.37
72 | protobuf
73 | psutil==5.9.4
74 | ptyprocess==0.7.0
75 | pure-eval==0.2.2
76 | py-cpuinfo==9.0.0
77 | py3nvml==0.2.7
78 | pyarrow==11.0.0
79 | pycodestyle==2.10.0
80 | pycparser==2.21
81 | Pygments==2.14.0
82 | pyparsing==3.0.9
83 | pyrsistent==0.19.3
84 | python-dateutil==2.8.2
85 | pytz==2022.7.1
86 | PyYAML==5.4.1
87 | pyzmq==25.0.0
88 | regex==2022.10.31
89 | requests==2.28.2
90 | responses==0.18.0
91 | seldon-core==1.15.0
92 | sentencepiece==0.1.97
93 | six==1.16.0
94 | stack-data==0.6.2
95 | sympy==1.11.1
96 | threadloop==1.0.2
97 | thrift==0.16.0
98 | tokenizers==0.13.2
99 | torch==1.13.1
100 | torch-model-archiver==0.7.1
101 | torch-workflow-archiver==0.2.7
102 | torchserve==0.7.1
103 | tornado==6.2
104 | tqdm==4.64.1
105 | traitlets==5.9.0
106 | transformers==4.26.1
107 | typing_extensions==4.5.0
108 | urllib3==1.26.14
109 | wcwidth==0.2.6
110 | Werkzeug==2.2.3
111 | xmltodict==0.13.0
112 | xxhash==3.2.0
113 | yarl==1.8.2
114 |
--------------------------------------------------------------------------------
/src/client/src/App.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import {
3 | BrowserRouter as Router,
4 | Switch,
5 | Route
6 | } from "react-router-dom";
7 | import { Navbar, Nav, Container, Jumbotron, Image, Row, Col } from 'react-bootstrap'
8 |
9 | import Home from './pages/Home';
10 | import About from './pages/About';
11 | import FAQPage from './pages/Faq';
12 | import image from './images/masakhane-border.png';
13 |
14 |
15 | function App() {
16 | return (
17 |
18 |
19 |
20 | Masakhane
21 |
22 |
23 |
24 | Home
25 | About
26 | FAQ
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | Masakhane
35 | Machine translation service for African languages
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | {/*
52 |
53 |
54 |
55 |
56 | This is a community research project. Read more about it here
57 | */}
58 |
59 |
60 | );
61 | }
62 |
63 | export default App;
64 |
--------------------------------------------------------------------------------
/src/client/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
13 |
14 |
15 |
16 |
17 |
18 |
22 |
23 |
24 |
30 |
31 |
32 |
36 |
37 |
46 | Masakhane Web
47 |
48 |
49 | You need to enable JavaScript to run this app.
50 |
51 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/src/client/public/217.bundle.js:
--------------------------------------------------------------------------------
1 | (self.webpackChunkmasakhane=self.webpackChunkmasakhane||[]).push([[217],{217:function(t,n,e){"use strict";e.r(n),e.d(n,{getCLS:function(){return m},getFCP:function(){return g},getFID:function(){return h},getLCP:function(){return y},getTTFB:function(){return F}});var i,a,r=function(){return"".concat(Date.now(),"-").concat(Math.floor(8999999999999*Math.random())+1e12)},o=function(t){var n=arguments.length>1&&void 0!==arguments[1]?arguments[1]:-1;return{name:t,value:n,delta:0,entries:[],id:r(),isFinal:!1}},u=function(t,n){try{if(PerformanceObserver.supportedEntryTypes.includes(t)){var e=new PerformanceObserver((function(t){return t.getEntries().map(n)}));return e.observe({type:t,buffered:!0}),e}}catch(t){}},s=!1,c=!1,f=function(t){s=!t.persisted},l=function(){addEventListener("pagehide",f),addEventListener("beforeunload",(function(){}))},p=function(t){var n=arguments.length>1&&void 0!==arguments[1]&&arguments[1];c||(l(),c=!0),addEventListener("visibilitychange",(function(n){var e=n.timeStamp;"hidden"===document.visibilityState&&t({timeStamp:e,isUnloading:s})}),{capture:!0,once:n})},d=function(t,n,e,i){var a;return function(){e&&n.isFinal&&e.disconnect(),n.value>=0&&(i||n.isFinal||"hidden"===document.visibilityState)&&(n.delta=n.value-(a||0),(n.delta||n.isFinal||void 0===a)&&(t(n),a=n.value))}},m=function(t){var n,e=arguments.length>1&&void 0!==arguments[1]&&arguments[1],i=o("CLS",0),a=function(t){t.hadRecentInput||(i.value+=t.value,i.entries.push(t),n())},r=u("layout-shift",a);r&&(n=d(t,i,r,e),p((function(t){var e=t.isUnloading;r.takeRecords().map(a),e&&(i.isFinal=!0),n()})))},v=function(){return void 0===i&&(i="hidden"===document.visibilityState?0:1/0,p((function(t){var n=t.timeStamp;return i=n}),!0)),{get timeStamp(){return i}}},g=function(t){var n,e=o("FCP"),i=v(),a=u("paint",(function(t){"first-contentful-paint"===t.name&&t.startTime1&&void 0!==arguments[1]&&arguments[1],i=o("LCP"),a=v(),r=function(t){var e=t.startTime;e
25 |
26 | Endpoint Description Returns (on success)
27 |
28 |
29 |
30 | `/`
31 |
32 | The base endpoint
33 |
34 |
35 | ```json
36 | {
37 | "message": "welcome Masakhane Web"
38 | }
39 | ```
40 |
41 |
42 |
43 |
44 |
45 | `/translate`
46 |
47 | Lists the saved models
48 |
49 |
50 | ```json
51 | [
52 | {
53 | "type": "source",
54 | "name": "English",
55 | "value": "en",
56 | "targets": [
57 | {
58 | "name": "Swahili",
59 | "value": "sw"
60 | }
61 | ]
62 | }
63 | ]
64 | ```
65 |
66 |
67 |
68 |
69 |
70 |
71 | `/update`
72 |
73 | Updates the local database with the newly loaded models
74 |
75 |
76 | ```json
77 | {
78 | "message": "models updated"
79 | }
80 | ```
81 |
82 |
83 |
84 |
85 |
86 | ### **POST**
87 |
88 |
89 |
90 | Endpoint Description Example Body Returns (on success)
91 |
92 |
93 |
94 |
95 | `/translate`
96 |
97 | Returns the translated text
98 |
99 |
100 | ```json
101 | {
102 | "src_lang": "english",
103 | "tgt_lang": "swahili",
104 | "input": "how are you?"
105 | }
106 | ```
107 |
108 |
109 |
110 |
111 | ```json
112 | {
113 | "src_lang": "english",
114 | "tgt_lang": "swahili",
115 | "input": "Hello, how are you?",
116 | "output": "kwa ukunjufu"
117 | }
118 | ```
119 |
120 |
121 |
122 |
123 |
124 |
125 | `/save`
126 |
127 | Saves the translation feedback
128 |
129 |
130 | ```json
131 | {
132 | "srcX_lang": "english",
133 | "tgt_lang": "swahili",
134 | "input": "Hello, how are you?",
135 | "review": "translation correction",
136 | "stars": "translation confidence",
137 | "token": "user auth (bool)"
138 | }
139 | ```
140 |
141 |
142 |
143 |
144 | ```json
145 | {
146 | "message": "Review saved",
147 | }
148 | ```
149 |
150 |
151 |
152 |
153 |
154 |
155 | # Manage CLI
156 | There is a cli program for managing the server - it is in [src/server/manage.py]()
157 |
158 | The command format is:
159 | ```bash
160 | python manage.py command optional_parameter
161 | ```
162 |
163 | | Command | Parameter | Description |
164 | | ------- | --------- | ----------- |
165 | | `create_db` | none | Creates database tables for the db models Language & Feedback
166 | | `all_languages` | none | Lists the model info stored in the Language table
167 | | `add_language` | `name_tag` | Adds a language with a given name_tag, ie - `en-sw-JW300 OR en-tiv-`|
168 | | `remove_language` | `name_tag`| Removes a language with a given name_tag |
169 | | `clean` | none | Deletes and recreates an empty database |
170 | | `tests` | none | Runs the backend tests |
171 |
172 | # Tests
173 |
174 | **TODO**
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Masakhane WEB - A Machine Translation Web Platform for African Languages
2 |
3 |
4 |
5 |
6 |
7 |
8 | [**Masakhane**](https://www.masakhane.io/) meaning ‘we build together’, is a research effort for machine translation for African languages which is open source and online. So far, the community has built translation models based on [Joey NMT](https://github.com/joeynmt/joeynmt) for over 38 African languages. As such, **Masakhane Web** is a platform that aims to host the already trained models from the community and allow contributions from users to create new data for retraining. The objective of this web application is to provide access to an open-source platform that makes available relatively accurate translations for languages across Africa. If you can't find your language and/or would like to train your own machine translation model in your language, see https://github.com/masakhane-io/masakhane-mt on how you can contribute.
9 |
10 |
11 | **Disclaimer:** This system is for research purposes only and should be taken as work in progress. None of the trained models are suitable for production usage.
12 |
13 | ## Table of contents
14 | - [Running The App](#running-the-app)
15 | - [Contributing](#contributing)
16 | - [Options](#options)
17 | - [Submitting Changes\[Pull Request\]](#submitting-changespull-request)
18 | - [Contributors](#contributors)
19 | - [Contact Us](#contact-us)
20 | - [License](#license)
21 | - [Citing the project](#citing-the-project)
22 | - [Acknowledgements](#acknowledgements)
23 |
24 |
25 | # Running The App
26 | To run the app locally, see [here](/docs/start_app_locally_doc.md#running-the-app-locally)
27 | To run the app in a production, see [here](/docs/start_app_prod_doc.md#running-the-app-in-production)
28 |
29 | # Contributing
30 |
31 |
32 | ## Options
33 |
34 | - *Can't see your language as one of the supported languages: Visit [Masakhane:Building your first machine translation model](https://github.com/masakhane-io/masakhane-mt#building-your-first-machine-translation-model) to learn more about how you can train a model for your language.*
35 |
36 | - *I have an idea or a new feature: Create a new issue first, assign it to yourself and then fork the repo*
37 |
38 | - *I want to help in improving the accuracy of the models: Check out below on how you can reach out to us*
39 |
40 |
41 |
42 | ## Submitting Changes[Pull Request]
43 |
44 | - See [https://opensource.com/article/19/7/create-pull-request-github](https://opensource.com/article/19/7/create-pull-request-github)
45 |
46 |
47 |
48 | # Contributors
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | Made with [contributors-img](https://contrib.rocks).
59 |
60 |
61 |
62 |
63 | # Contact Us
64 |
65 | - Vukosi Marivate - vukosi.marivate@cs.up.ac.za
66 |
67 | - Abiodun Modupe - abiodun.modupe@cs.up.ac.za
68 |
69 | - Salomon Kabongo - skabenamualu@aimsammi.org
70 |
71 | - Catherine Gitau - cgitau@aimsammi.org
72 |
73 |
74 |
75 | # License
76 |
77 | [MIT](https://mit-license.org/)
78 |
79 |
80 |
81 | ## Citing the project
82 |
83 | **On a visualisation/notebook/webapp:**
84 |
85 | > Data Science for Social Impact Research Group @ University of Pretoria, Masakhane NLP, *Masakhane WEB - A Machine Translation Web Platform for African Languages* Available on: [https://github.com/dsfsi/masakhane-web](https://github.com/dsfsi/masakhane-web).
86 |
87 | **In a publication**
88 | Software
89 |
90 | > @software { marivate_vukosi_2021_4745501,
91 | > author = {Marivate, Vukosi and Gitau, Catherine and Kabenamualu, Salomon and Modupe, Abiodun and Masakhane NLP},
92 | > title = {{Masakhane WEB - A Machine Translation Web Platform for African Languages}},
93 | > month = may, year = 2021,
94 | > publisher = {Zenodo},
95 | > version = {0.9},
96 | > doi = {10.5281/zenodo.4745501},
97 | >    url          = {[https://doi.org/10.5281/zenodo.4745501](https://doi.org/10.5281/zenodo.4745501)}
98 | > }
99 |
100 |
101 |
102 | # Acknowledgements
103 |
104 |
105 |
106 | We want to acknowledge support from the following organisations
107 |
108 | - [Mozilla](https://www.mozilla.org/en-US/moss/)
109 |
110 | - [Google Cloud Platform](https://cloud.google.com/)
--------------------------------------------------------------------------------
/src/client/src/components/terms.js:
--------------------------------------------------------------------------------
1 | import { Row, Card, Button } from 'react-bootstrap';
2 | import { v4 as uuidv4 } from 'uuid';
3 | import React from 'react';
4 |
5 | const Terms = ({ setShow, navigation, setFeedbackToken, feedbackToken}) => {
6 | const { next } = navigation;
7 |
8 | const accept = () => {
9 | if(feedbackToken !== '') {
10 | next();
11 | } else {
12 | // generate token
13 | const token = uuidv4();
14 | // set token
15 | localStorage.setItem('feedbackToken', token);
16 | setFeedbackToken(token);
17 | // proceed
18 | next();
19 | }
20 | }
21 |
22 | const handleDecline = () => {
23 | // close modal
24 | setShow(false);
25 | }
26 |
27 | return (
28 |
29 |
30 |
31 | Terms & Conditions
32 | Dear Sir/Madam,
33 |
34 |
35 | I am Dr Vukosi Marivate , principal investigator of the Data Science for Social Impact research group at the Department of Computer Science at the University of Pretoria.
36 | The research project is titled Masakhane Web Feedback Analysis for African Language Task Models .
37 | The study aims to understand the challenges in automated translation models for African languages.
38 | The models themselves are sourced from the Masakhane project (our collaborators) and are all a work in progress. By better providing feedback to model designers, we can work to improve the models and conduct research on African Language Natural Language Processing.
39 | The purpose of this questionnaire/feedback form is to collect information on the quality of the translations that are on the Masakhane Web system currently.
40 | The user participation is voluntary, and you can withdraw at any time without penalty.
41 |
42 |
43 |
44 |
45 |
46 | Throughout the feedback from the participants, their privacy remains confidential.
47 | Hence, we only collect the following information:
48 |
49 | 1. The user has the option to accept or reject to participate in the feedback survey,
50 |
51 |
52 | 2. The participants are required to indicate their level of proficiencies of the languages translated by the model,
53 |
54 |
55 | 3. and your submitted feedback to the translations is stored on our server. No personal information is collected.
56 |
57 |
58 |
59 |
60 |
61 |
62 | If you agree to participate, please complete the survey that follows this cover letter.
63 | It should take about 5 minutes of your time at the most for feedback on each translation.
64 | By completing the survey, you indicate your willingness to participate in this research.
65 |
66 | If you have any concerns, please contact me with the details provided below.
67 |
68 | Dr. Vukosi Marivate
69 |
70 | vukosi.marivate@cs.up.ac.za
71 |
72 |
73 |
74 |
75 |
76 |
77 | NOT NOW
78 |
79 |
80 | ACCEPT TERMS
81 |
82 |
83 |
84 | )
85 | }
86 |
87 | export default Terms;
88 |
--------------------------------------------------------------------------------
/src/client/src/pages/About.js:
--------------------------------------------------------------------------------
1 | import { Container, Card } from 'react-bootstrap'
2 | import React from 'react';
3 |
4 | export default function About() {
5 | return(
6 |
7 |
8 |
9 |
10 | About
11 | Masakhane Web
12 |
13 |
14 | Masakhane Web is an open source online machine translation service for solely African languages.
15 | This project is in line with the works of the Masakhane community . Masakhane meaning ‘we build together’,
16 | is a research effort whose mission is to strengthen and spur NLP research for African languages which is open source and online.
17 | So far, the community has trained translation models for over 38 African languages. As such, this platform aims at hosting the already trained machine translation models from the Masakhane community and allows contributions
18 | from users to create new data for retraining and improving the models.
19 |
20 |
21 |
27 |
28 |
33 |
34 |
35 |
36 |
37 | If you would like to contribute to this project, train a model in your language or want to collaborate and work with Masakhane, find out how in https://github.com/dsfsi/masakhane-web or reach out to any of the Masakhane Web contributors in the following ways:
38 |
39 |
40 |
41 |
42 |
43 |
44 | Dr. Vukosi Marivate
45 |
46 | vukosi.marivate@cs.up.ac.za
47 |
48 | @vukosi
49 |
50 |
51 | Abiodun Modupe
52 |
53 | abiodun.modupe@cs.up.ac.za
54 |
55 |
56 | Salomon Kabongo
57 |
58 | skabenamualu@aimsammi.org
59 |
60 | @SalomonKabongo
61 |
62 |
63 | Catherine Gitau
64 |
65 | cgitau@aimsammi.org
66 |
67 | @categitau_
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 | )
79 | }
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3

#################################################################################
# GLOBALS                                                                       #
#################################################################################

# Absolute path of the directory containing this Makefile.
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
# AWS CLI profile for the S3 sync targets; "default" omits --profile entirely.
PROFILE = default
PROJECT_NAME = mit-808-starter
PYTHON_INTERPRETER = python3

# Detect whether conda is on PATH; drives the create_environment target below.
ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif

#################################################################################
# COMMANDS                                                                      #
#################################################################################

## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt

## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed

## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete

## Lint using flake8
lint:
	flake8 src

## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif

## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif

## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif

## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py

#################################################################################
# PROJECT RULES                                                                 #
#################################################################################



#################################################################################
# Self Documenting Commands                                                     #
#################################################################################

# Running a bare `make` prints the auto-generated help listing below.
.DEFAULT_GOAL := help

# Inspired by <https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# 	* save line in hold space
# 	* purge line
# 	* Loop:
# 		* append newline + line to hold space
# 		* go to next line
# 		* if line starts with doc comment, strip comment character off and loop
# 	* remove target prerequisites
# 	* append hold space (+ newline) to line
# 	* replace newline plus comments by `---`
# 	* print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see the GNU sed manual (commands and labels).
.PHONY: help
help:
	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
	@echo
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| LC_ALL='C' sort --ignore-case \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
--------------------------------------------------------------------------------
/src/client/src/components/step1.js:
--------------------------------------------------------------------------------
1 | import { Row, Col, Form, Button } from 'react-bootstrap';
2 | import RadioButton from './common/radioButton';
3 | import React from 'react';
4 |
5 | const Step1 = ({ src_lang, tgt_lang, setForm, formData, navigation, handleSubmitFeedback }) => {
6 |
7 | const { know_src_lang, know_tgt_lang } = formData;
8 |
9 | const { next, go } = navigation;
10 |
11 | const handleNext= () => {
12 | if(know_src_lang === "none" && know_tgt_lang === "none") {
13 | // submit feedback
14 | handleSubmitFeedback();
15 | // then skip next step
16 | go("step3");
17 | } else {
18 | // go to next page
19 | next();
20 | }
21 | }
22 |
23 | return (
24 |
25 |
26 |
Part 1/2
27 |
28 |
29 |
30 |
How well do you know {src_lang}?
31 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
How well do you know {tgt_lang}?
79 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 | NEXT
127 |
128 |
129 | )
130 | }
131 |
132 | export default Step1;
133 |
--------------------------------------------------------------------------------
/src/torchserve/Download_Transformer_models.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | from pathlib import Path
5 | import torch
6 | import transformers
7 | from transformers import (
8 | AutoConfig,
9 | AutoModelForCausalLM,
10 | AutoModelForQuestionAnswering,
11 | AutoModelForSequenceClassification,
12 | AutoModelForTokenClassification,
13 | AutoTokenizer,
14 | AutoModelForSeq2SeqLM,
15 | set_seed,
16 | )
17 |
# Log the installed library version so a download can be reproduced/debugged.
print("Transformers version", transformers.__version__)
# Fix the RNG seed so any stochastic behaviour during model init is reproducible.
set_seed(1)
# Prefer the GPU when available; used for the torchscript trace further below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21 |
22 |
def transformers_model_downloader(
    mode, pretrained_model_name, num_labels, do_lower_case, max_length, torchscript,
    save_mode=None,
):
    """Download a pretrained transformer model + tokenizer and save them locally.

    The checkpoint, config file, tokenizer config and vocab files are written to
    ``transformer_models/<pretrained_model_name>`` next to this script, either via
    ``save_pretrained`` or as a traced torchscript module.

    Args:
        mode: Task head to load — one of "sequence_classification",
            "question_answering", "token_classification", "text_generation" or
            "translation" (the Masakhane translation models).
        pretrained_model_name: HuggingFace hub id of the model to download.
        num_labels: Label count for classification heads (unused otherwise).
        do_lower_case: Tokenizer lower-casing flag (not used for "translation").
        max_length: Maximum sequence length for the torchscript trace input.
        torchscript: Whether the model config should enable torchscript tracing.
        save_mode: "pretrained" or "torchscript". When None (the default) it is
            derived from ``torchscript``, matching the original script behaviour.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Bug fix: the original read a module-level `save_mode` global that only
    # existed when run as a script, so importing and calling this function
    # from another module raised NameError. Derive it from `torchscript`
    # (in __main__, torchscript is exactly save_mode == "torchscript").
    if save_mode is None:
        save_mode = "torchscript" if torchscript else "pretrained"

    print("Download model and tokenizer", pretrained_model_name)
    # loading pre-trained model and tokenizer
    if mode == "sequence_classification":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "question_answering":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, torchscript=torchscript
        )
        model = AutoModelForQuestionAnswering.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "token_classification":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "text_generation":
        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_labels, torchscript=torchscript
        )
        model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name, config=config
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
    elif mode == "translation":
        # new mode created to handle the masakhane translation models
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
    else:
        # Previously an unknown mode fell through and crashed later with a
        # NameError on `model`/`tokenizer`; fail fast with a clear message.
        raise ValueError(f"Unsupported mode: {mode!r}")

    # NOTE : for demonstration purposes, we do not go through the fine-tune processing here.
    # A Fine_tunining process based on your needs can be added.
    # An example of Fine_tuned model has been provided in the README.

    NEW_DIR = Path(__file__).parent.joinpath("transformer_models", pretrained_model_name)
    NEW_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Successfully created directory {NEW_DIR.__str__()} ")

    print(
        "Save model and tokenizer/ Torchscript model based on the setting from setup_config",
        pretrained_model_name,
        "in directory",
        NEW_DIR,
    )
    if save_mode == "pretrained":
        model.save_pretrained(NEW_DIR)
        tokenizer.save_pretrained(NEW_DIR)
    elif save_mode == "torchscript":
        # Trace with a dummy input so the model can be served (e.g. by
        # TorchServe) without the original Python model class.
        dummy_input = "This is a dummy input for torch jit trace"
        inputs = tokenizer.encode_plus(
            dummy_input,
            max_length=int(max_length),
            pad_to_max_length=True,  # deprecated in newer transformers; kept for compatibility
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        model.to(device).eval()
        traced_model = torch.jit.trace(model, (input_ids, attention_mask))
        torch.jit.save(traced_model, os.path.join(NEW_DIR, "traced_model.pt"))
    return
108 |
109 |
if __name__ == "__main__":
    # Read the setup config (optionally overridden by the first CLI argument)
    # and kick off the download/save for the configured model.
    dirname = os.path.dirname(__file__)
    if len(sys.argv) > 1:
        filename = os.path.join(dirname, sys.argv[1])
    else:
        filename = os.path.join(dirname, "setup_config.json")
    # Fix: the original opened the file without ever closing it; use a
    # context manager so the handle is released deterministically.
    with open(filename) as f:
        settings = json.load(f)
    mode = settings["mode"]
    model_name = settings["model_name"]
    num_labels = int(settings["num_labels"])
    do_lower_case = settings["do_lower_case"]
    max_length = settings["max_length"]
    save_mode = settings["save_mode"]
    # torchscript flag mirrors the chosen save mode.
    torchscript = save_mode == "torchscript"

    transformers_model_downloader(
        mode, model_name, num_labels, do_lower_case, max_length, torchscript
    )
132 |
--------------------------------------------------------------------------------
/docs/debugging_setup.md:
--------------------------------------------------------------------------------
1 | # Common SetUp errors and Debugging
2 |
3 | ## Table of Contents
4 | - [**Errors during setup**](#errors-during-setup)
5 | - [**Errors with Docker**](#errors-with-docker)
6 | - [**gcsfuse** - Noted on Mac M1 (Dec 2022)](#gcsfuse---noted-on-mac-m1-dec-2022)
7 | - [**failed to solve** - Noted on Mac M1 (Dec 2022)](#failed-to-solve---noted-on-mac-m1-dec-2022)
8 | - [**Errors with stand alone setup**](#errors-with-stand-alone-setup)
9 | - [**PyICU/Polyglot** - Noted on Linux/Ubuntu (Jun 2022)](#pyicupolyglot---noted-on-linuxubuntu-jun-2022)
10 | - [**Checking the client, server/api \& database**](#checking-the-client-serverapi--database)
11 | - [**Check the client**](#check-the-client)
12 | - [**Check the api**](#check-the-api)
13 | - [**Notable API endpoints to test using GET:**](#notable-api-endpoints-to-test-using-get)
14 | - [**Notable API endpoints to test using POST:**](#notable-api-endpoints-to-test-using-post)
15 | - [**Check the database**](#check-the-database)
16 | - [**With Docker**](#with-docker)
17 | - [**With Stand alone backend**](#with-stand-alone-backend)
18 |
19 |
20 | # **Errors during setup**
21 |
22 | ## **Errors with Docker**
23 | ### **gcsfuse** - Noted on Mac M1 (Dec 2022)
24 | Seems to be an architecture issue, resolved by running the command:
25 | ```bash
26 | export DOCKER_DEFAULT_PLATFORM=linux/amd64
27 | ```
28 | [solution reference](https://github.com/GoogleCloudPlatform/gcsfuse/issues/586)
29 |
30 | ### **failed to solve** - Noted on Mac M1 (Dec 2022)
31 | Full err message:
32 | ```
33 | failed to solve: rpc error: code = Unknown desc = failed to solve with frontend dockerfile.v0: failed to create LLB definition: failed to authorize: rpc error: code = Unknown desc = failed to fetch anonymous token: Get "https://auth.docker.io/token?scope=repository%3Alibrary%2Fnode%3Apull&service=registry.docker.io": dial tcp: lookup auth.docker.io on 192.168.0.1:53: no such host
34 | ```
35 |
36 | This is an ad-hoc error; possible solutions:
37 | - Sign in to Docker Hub and the docker CLI: ```docker login```
38 | - Within `Docker hub>Settings>Docker Engine`,set `buildkit` to `false`
39 | - Instead of `docker-compose`, try `docker compose`
40 | - Lost all hope? Go make a cup of coffee, sometimes it works if you just give it a minute...
41 |
42 | [solution signin reference](https://stackoverflow.com/questions/65361083/docker-build-failed-to-fetch-oauth-token-for-openjdk) | [solution buildkit reference](https://stackoverflow.com/questions/64221861/an-error-failed-to-solve-with-frontend-dockerfile-v0)
43 |
44 | **Note** Running these commands is not advisable:
45 | ```bash
46 | export DOCKER_BUILDKIT=0
47 | export COMPOSE_DOCKER_CLI_BUILD=0
48 | ```
49 | This will invalidate the GCSFuse fix for Mac M1.
50 |
51 | ## **Errors with stand alone setup**
52 |
53 | ### **PyICU/Polyglot** - Noted on Linux/Ubuntu (Jun 2022)
54 |
55 | Resolved by running the commands:
56 | ```bash
57 | apt-get update
58 | ```
59 |
60 | Then either - from apt directly : https://packages.debian.org/source/stable/pyicu:
61 | ```bash
62 | apt-get install python3-icu
63 | ```
64 | OR - from source:
65 | ```bash
66 | apt-get install pkg-config libicu-dev
67 | pip install --no-binary=:pyicu: pyicu
68 | ```
69 |
70 | # **Checking the client, server/api & database**
71 | ## **Check the client**
72 | The client should be running on http://localhost:3000.
73 |
74 | Check the terminal (standalone), inspect the webpage or view the docker logs for error output.
75 | ## **Check the api**
76 | The API should be running on http://localhost:5000 and return the following output:
77 | ```json
78 | {
79 | "message": "welcome Masakhane Web"
80 | }
81 | ```
82 | Check the terminal (standalone) or view the docker logs for error output.
83 |
84 | ### **Notable API endpoints to test using GET:**
85 | Make get requests by going to the web endpoint in your browser
86 | | Endpoint | Description |
87 | | -------- | ----------- |
88 | | http://localhost:5000/update | Updates the local database with the newly loaded models |
89 | | http://localhost:5000/translate | Lists the saved models |
90 |
91 |
92 |
93 | ### **Notable API endpoints to test using POST:**
94 | Use a developer tool such as [Postman](https://www.postman.com/) to make POST requests
95 | | Endpoint | Description | Example Body |
96 | | ------ | --------- | --------- |
97 | | http://localhost:5000/translate | Returns the translated text | { "src_lang": "english", "tgt_lang": "swahili", "input": "Hello, how are you?" } |
98 |
99 | ## **Check the database**
100 | Docker makes use of a postgreSQL database
101 | The stand-alone app uses SQLite, so there is a different method for access.
102 |
103 | ### **With Docker**
104 | The 'db-1' image in docker contains the database using PostgreSQL, you can access the DB system running on the image with the command:
105 | ```
106 | docker-compose -f docker-compose.yml exec db psql --username=masakhane --dbname=masakhane
107 | ```
108 |
109 | List all databases:
110 | ```
111 | \l
112 | ```
113 |
114 | Connect to the masakhane database:
115 | ```
116 | \c masakhane
117 | ```
118 |
119 | List relations
120 | ```
121 | \dt
122 | ```
123 |
124 | See saved information in a relation:
125 | ```
126 | select * from language;
127 | ```
128 |
129 | Quit the database:
130 | ```
131 | \q
132 | ```
133 |
134 | ### **With Stand alone backend**
135 |
136 | Within the `src/server/core/` directory, run this command to start the python interpreter:
137 | ```
138 | python
139 | ```
140 |
141 | Use the code below to check what is saved in the database
142 |
143 | ```python
144 | import sqlite3, os
145 |
146 | conn = sqlite3.connect("masakhane.db")
147 | c = conn.cursor()
148 |
149 | for row in c.execute('SELECT * FROM feedback'):
150 | print(row)
151 |
152 | for row in c.execute('SELECT * FROM language'):
153 | print(row)
154 | ```
155 |
--------------------------------------------------------------------------------
/docs/project_details.md:
--------------------------------------------------------------------------------
1 | # **Project Details**
2 | The purpose of Masakhane Web is to facilitate translations for African languages using different machine translation models. There is also a feature to provide feedback and corrections to inaccurate translations.
3 |
4 | ## **Table of Contents**
5 | - [**Tech Stack**](#tech-stack)
6 | - [**Frontend**](#frontend)
7 | - [**React**](#react)
8 | - [**Webpack**](#webpack)
9 | - [**Backend**](#backend)
10 | - [**Python**](#python)
11 | - [**Database**](#database)
12 | - [**Flask**](#flask)
13 | - [**File Structure**](#file-structure)
14 |
15 |
16 |
17 | # **Tech Stack**
18 |
19 | ## **Frontend**
20 | Review the [client readme](../../src/client/README.md) for more information.
21 |
22 | ### **React**
23 | The frontend is written using [React](https://reactjs.org/).
24 |
25 | ### **Webpack**
26 | The frontend also makes use of [Webpack](https://webpack.js.org/), a static module bundler for modern JavaScript applications.
27 |
28 | - **Webpack DevServer & Proxy**
29 | The [devServer](https://webpack.js.org/configuration/dev-server/) runs on http://translate.masakhane.io:80.
30 | The [proxy](https://webpack.js.org/configuration/dev-server/#devserverproxy) allows you to send requests to http://translate.masakhane.io/translate and have it hit the backend at http://localhost:5000/translate.
31 |
32 |
33 | ## **Backend**
34 | Review the [server readme](../../src/server/README.md) for more information
35 |
36 | ### **Python**
37 | The backend is written using [Python](https://www.python.org/)
38 |
39 | ### **Database**
40 | The backend database is predominantly PostgreSQL on Docker, but there is an option to use SQLite when running a stand-alone backend.
41 |
42 | ### **Flask**
43 | The backend also makes use of [Flask](https://flask.palletsprojects.com/en/2.2.x/), which is for web development in Python.
44 |
45 | - **App**
46 | Masakhane Web makes use of the Flask [application factory](https://flask.palletsprojects.com/en/2.2.x/patterns/appfactories/) pattern in `src/core/__init__.py`
47 |
48 | - **API**
49 | The API uses [flask_restful](https://flask-restful.readthedocs.io/en/latest/quickstart.html#resourceful-routing) and is defined in `src/core/resources/translate.py`.
50 | It is initialised along with the app in `src/core/__init__.py`.
51 |
52 | - **Database**
53 | The application interacts with the database using [flask_sqlalchemy](https://flask-sqlalchemy.palletsprojects.com/en/3.0.x/) and is defined in `src/core/extensions.py`.
54 | It is initialised along with the app in `src/core/__init__.py`. (Note the `.env.dev` for database config)
55 |
56 | # **File Structure**
57 |
58 | ```
59 | .masakhane-web
60 | |-- docker-compose.yml # Docker compose for local instance
61 | |-- docker-compose.prod.yml # Docker compose for production instance
62 | |-- entrypoint.sh
63 | |-- environment.yaml
64 | `-- src
65 | |-- client # IDK much about the frontend, update required
66 | | |-- Dockerfile
67 | | |-- package-lock.json
68 | | |-- package.json
69 | | |-- public
70 | | |-- src
71 | | | |-- App.js
72 | | | |-- App.test.js
73 | | | |-- components
74 | | | | |-- translateCard.js
75 | | | | `-- *others*
76 | | | |-- images
77 | | | |-- index.css
78 | | | |-- index.js
79 | | | |-- logo.svg
80 | | | |-- pages
81 | | | | |-- About.js
82 | | | | |-- Faq.js
83 | | | | `-- Home.js
84 | | | |-- reportWebVitals.js
85 | | | |-- setupProxy.js
86 | | | `-- setupTests.js
87 | | `-- webpack.config.js
88 | `-- server
89 | |-- __init__.py
90 | |-- available_models.tsv # TSV file containing available models
91 | |-- languages.json # JSON file containing language information (names, etc)
92 | |-- Dockerfile
93 | |-- entrypoint.sh # Docker entrypoint for Dockerfile
94 | |-- Dockerfile.prod
95 | |-- entrypoint.prod.sh # Docker entrypoint for Dockerfile.prod
96 | |-- requirements.txt # Python dependencies
97 | |-- manage.py # Manage CLI
98 | |-- core
99 | | |-- __init__.py # Flask app factory & init
100 | | |-- resources
101 | | | `-- translate.py # Flask API
102 | | |-- extensions.py # Flask_SQLAlchemy init
103 | | |-- models
104 | | | |-- feedback.py # Feedback DB Model
105 | | | |-- language.py # Language DB Model
106 | | | |-- predict.py # I think this is in the wrong place, does the translation
107 | | | `-- translation.py # Translation object
108 | | |-- model_load.py # Class to manage the download and loading of different translation models
109 |         |   |-- config.py            # Different config states for dev environments
110 | | |-- languages.json # Duplicate of ../languages.json
111 | | |-- tests
112 | | | |-- __init__.py
113 | | | |-- base.py # Test create app
114 | | | |-- test_app.py # Test API
115 | | | `-- test_config.py # Dev tests
116 | | |-- utils.py
117 | | `-- utils_bucket
118 | | |-- bucket.py
119 | | `-- upload_download.py
120 | |-- models # Translation models are stored here
121 | | `-- joeynmt
122 | | |-- en-sw-JW300 # File struct of a complete model for English to Swahili
123 | | | |-- config.yaml
124 | | | |-- config_orig.yaml
125 | | | |-- model.ckpt
126 | | | |-- src.bpe.model
127 | | | |-- src_vocab.txt
128 | | | |-- trg.bpe.model
129 | | | `-- trg_vocab.txt
130 | `-- nginx
131 | |-- Dockerfile
132 | `-- nginx.conf
133 | ```
--------------------------------------------------------------------------------
/docs/start_app_locally_doc.md:
--------------------------------------------------------------------------------
1 | # **Running the App Locally**
2 |
3 | The app can be run as a standalone or using Docker; unless you are working on a machine running Linux/Ubuntu, it is advisable to use Docker.
4 |
5 | To run the app in production, see [here](start_app_prod_doc.md).
6 |
7 | For any errors during setup, please see the [debugging doc](debugging_setup.md).
8 |
9 | Review the [project details doc](project_details.md) for more information on the technology stack.
10 | Take note of the [Client](../../src/client/README.md) and [Server](../../src/server/README.md) README's.
11 |
12 | ## **Table of Contents**
13 | - [**Using Docker ( Preferred )**](#using-docker--preferred-)
14 | - [**Docker Setup**](#docker-setup)
15 | - [**Running the app**](#running-the-app)
16 | - [**Building the App**](#building-the-app)
17 | - [**Shut down the app**](#shut-down-the-app)
18 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages)
19 | - [**Running tests**](#running-tests)
20 | - [**The Database**](#the-database)
21 | - [**As a stand-alone app**](#as-a-stand-alone-app)
22 | - [**Backend Setup**](#backend-setup)
23 | - [**Run the server:**](#run-the-server)
24 | - [**The Database**](#the-database-1)
25 | - [**Add, Update, \& Delete Languages**](#add-update--delete-languages-1)
26 | - [**Running tests**](#running-tests-1)
27 | - [**Frontend Setup**](#frontend-setup)
28 | - [**Run the client:**](#run-the-client)
29 | - [**Errors during setup**](#errors-during-setup)
30 |
31 |
32 | # **Using Docker ( Preferred )**
33 |
34 | The better/easier way to run the app is to use Docker, which will build both the frontend and the backend with the correct environment setup.
35 |
36 | ## **Docker Setup**
37 |
38 | Ensure you have `docker` & `docker-compose` installed on your computer, you can check with the following commands:
39 | ```bash
40 | docker --version
41 | docker-compose --version
42 | ```
43 |
44 | If the above commands return an error, please install [Docker](https://docs.docker.com/engine/install/) and [Docker-compose](https://docs.docker.com/compose/install/).
45 |
46 | ## **Running the app**
47 | ### **Building the App**
48 | To build the app, from the root project directory, run the following command:
49 | ```bash
50 | docker-compose -f docker-compose.yml up -d --build
51 | ```
52 |
53 | Docker should create a container named 'masakhane-web' with the images 'db-1', 'server-1', and 'client-1'.
54 | The server should be active on http://localhost:5000 and the client on http://localhost:3000
55 | Look [here](debugging_setup.md#checking-the-client-serverapi--database) for checking these services manually.
56 |
57 | ### **Shut down the app**
58 | To shut down the app, run the following command to remove the docker container:
59 | ```bash
60 | docker-compose -f docker-compose.yml down
61 | ```
62 |
63 | ### **Add, Update, & Delete Languages**
64 | **Add a Language**
65 | ```bash
66 | docker-compose -f docker-compose.yml exec server python manage.py add_language en-sw-JW300
67 | ```
68 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform}
69 | So `en-sw-JW300` represents English-Swahili using JW300 shortform
70 | **Note** - A code parameter example without shortform is `en-tiv-`
71 |
72 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv)
73 |
74 | **Update Languages**
75 | ```bash
76 | curl --request GET 'http://127.0.0.1:5000/update'
77 | ```
78 |
79 | **Check available languages**
80 | ```bash
81 | docker-compose -f docker-compose.yml exec server python manage.py all_languages
82 | ```
83 |
84 | **Remove a language**
85 | ```bash
86 | docker-compose -f docker-compose.yml exec server python manage.py remove_language en-sw-JW300
87 | ```
88 |
89 | ### **Running tests**
90 | ```bash
91 | docker-compose -f docker-compose.yml exec server python manage.py tests
92 | ```
93 |
94 | ### **The Database**
95 | Look [here](debugging_setup.md#with-docker) for more information about accessing the database
96 |
97 | # **As a stand-alone app**
98 | In order to run the app, we need to set up the backend and frontend separately.
99 | **Note** It is advisable to be working on a Linux/Ubuntu machine.
100 |
101 | ## **Backend Setup**
102 |
103 | First, ensure you are running [Python 3.6.9](https://www.python.org/downloads/release/python-369/)
104 |
105 | Within the `src/server` directory of the project
106 |
107 | **Install required packages:**
108 | ```bash
109 | pip install -r requirements.txt
110 | ```
111 |
112 | **Run the following commands:**
113 | ```bash
114 | export FLASK_APP=core/__init__.py
115 | export FLASK_ENV=development
116 | ```
117 |
118 | ## **Run the server:**
119 | To start the API and database services, run the command:
120 | ```bash
121 | python manage.py run
122 | ```
123 |
124 | ### **The Database**
125 | Look [here](debugging_setup.md#with-stand-alone-backend) for more information about accessing the database
126 |
127 | ### **Add, Update, & Delete Languages**
128 | **Add a Language**
129 | ```bash
130 | python manage.py add_language en-sw-JW300
131 | ```
132 | The language code parameter `en-sw-JW300` represents {src-lang}-{tgt-lang}-{shortform}
133 | So `en-sw-JW300` represents English-Swahili using JW300 shortform
134 | **Note** - A code parameter example without shortform is `en-tiv-`
135 |
136 | Download available languages csv [here](https://zenodo.org/record/7417644/files/masakhane-mt-current-models.csv)
137 |
138 | **Update Languages**
139 | ```bash
140 | curl --request GET 'http://127.0.0.1:5000/update'
141 | ```
142 | **Check available languages**
143 | ```bash
144 | python manage.py all_languages
145 | ```
146 |
147 | **Remove a language**
148 | ```bash
149 | python manage.py remove_language en-sw-JW300
150 | ```
151 |
152 | ### **Running tests**
153 | ```bash
154 | python manage.py tests
155 | ```
156 |
157 | The API is available at `http://localhost:5000`, see notable API endpoints [here](debugging_setup.md#check-the-api)
158 |
159 | ## **Frontend Setup**
160 |
161 | Ensure you have [node.js](https://nodejs.org/en/) and [yarn](https://classic.yarnpkg.com/en/docs/install) installed
162 |
163 | Within the `src/client/` directory of the project:
164 | **Install required packages:**
165 | ```bash
166 | npm install --legacy-peer-deps
167 | ```
168 |
169 | **Run the following commands:**
170 | ```bash
171 | npm i webpack webpack-cli --legacy-peer-deps
172 | npm i @babel/core @babel/preset-env @babel/preset-react babel-loader --legacy-peer-deps
173 | ```
174 |
175 | ## **Run the client:**
176 | To start the client, run the command:
177 | ```bash
178 | npm run develop
179 | ```
180 |
181 | The client is available at `http://localhost:3000`
182 |
183 | # **Errors during setup**
184 | If there was a problem during setup, review [this doc](debugging_setup.md) for possible errors and solutions.
185 |
186 |
--------------------------------------------------------------------------------
/src/client/src/components/step2.js:
--------------------------------------------------------------------------------
1 | import { Row, Col, Form, Button } from 'react-bootstrap';
2 | import RadioButton from './common/radioButton';
3 | import React from 'react';
4 |
5 | const Step2 = ({ src_lang, tgt_lang, text, translation, setForm, formData, navigation, handleSubmitFeedback }) => {
6 |
7 | const { understand_translation, accurate_translation, own_translation } = formData;
8 | const { next } = navigation;
9 |
10 | const handleSubmit = () => {
11 | // submit form
12 | handleSubmitFeedback();
13 | // then navigate to next page
14 | next();
15 | }
16 | return (
17 |
18 |
19 |
Part 2/2
20 |
21 |
22 |
23 |
24 |
25 | {!!src_lang && src_lang.toUpperCase()}
26 | {text}
27 |
28 |
29 |
30 | {!!tgt_lang && tgt_lang.toUpperCase()}
31 | {!!translation && translation}
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
Did you understand the translation? / Did it make sense?
42 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
How accurate was the translation?
90 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
How would you have translated this? (Optional)
138 |
139 |
141 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 | SUBMIT
161 |
162 |
163 | )
164 | }
165 |
166 | export default Step2;
167 |
--------------------------------------------------------------------------------
/src/server/core/resources/translate.py:
--------------------------------------------------------------------------------
1 | #External modules
2 | from flask_restful import Resource
3 | from flask import request
4 | from http import HTTPStatus
5 | from collections import defaultdict
6 | import os, json
7 | #Internal modules
8 | from core.model_load import MasakhaneModelLoader
9 | from core.models.predict import Predicter
10 | from core.models.feedback import Feedback
11 | from core.models.language import Language
12 | from core.models.translation import Translation
13 |
14 | from pathlib import Path
15 |
16 |
17 | class TranslateResource(Resource):
18 | """ TranslateResource
19 | -----------------
20 | #### User-Defined Flask API Resource accepting GET & POST\n
21 | GET - List's available models\\
22 |     POST - Performs translation from src lang to tgt lang, review the server ReadMe for more info.
23 | """
24 | def __init__(self, saved_models):
25 | self.models = saved_models
26 |
27 | # load languages.json into distros_dict
28 | json_file = os.environ.get('JSON','./languages.json')
29 | with open(json_file, 'r') as f:
30 | distros_dict = json.load(f)
31 | # init empty dicts to store full_name to short_name bindings
32 | self.languages_short_to_full = {}
33 | self.languages_full_to_short = {}
34 |
35 | for distro in distros_dict:
36 | self.languages_short_to_full[distro['language_short'].lower(
37 | )] = distro['language_en'].lower()
38 | self.languages_full_to_short[distro['language_en'].lower(
39 | )] = distro['language_short'].lower()
40 | # Example: languages_short_to_full['sw'] = 'swahili'
41 | # Example: languages_full_to_short['Swahili'] = 'sw'
42 |
43 | def post(self):
44 | """POST method to translate a given input
45 | ---
46 |
47 | ### Request Body
48 | ```json
49 | {
50 | "src_lang" : "src_lang_full",
51 | "tgt_lang" : "tgt_lang_full",
52 | "input": "input_text",
53 | }
54 | ```
55 | ### Returns a Translation Object defined in `src/server/core/models/translation.py`
56 | ```json
57 | {
58 | "src_lang" : "src_lang_full",
59 | "tgt_lang" : "tgt_lang_full",
60 | "input": "input_text",
61 | "output": "translation_result"
62 | }
63 | ```
64 | """
65 | # Get req body
66 | data = request.get_json()
67 | source_language = data['src_lang'].lower()
68 | target_language = data['tgt_lang'].lower()
69 |
70 | #Get short_name from self.language_dicts
71 | source_language_short = self.languages_full_to_short[source_language]
72 | target_language_short = self.languages_full_to_short[target_language]
73 |
74 | #model key to provide translation
75 | input_model = source_language_short+'-'+target_language_short
76 |
77 | if input_model not in self.models.keys():
78 | return {'message': 'model not found'}, HTTPStatus.NOT_FOUND
79 | else:
80 | translation_result = Predicter().translate(
81 | data['input'], model=self.models[input_model]['model'],
82 | src_vocab=self.models[input_model]['src_vocab'],
83 | trg_vocab=self.models[input_model]['trg_vocab'],
84 | preprocess=self.models[input_model]['preprocess'],
85 | postprocess=self.models[input_model]['postprocess'],
86 | logger=self.models[input_model]['logger'],
87 | beam_size=self.models[input_model]['beam_size'],
88 | beam_alpha=self.models[input_model]['beam_alpha'],
89 | level=self.models[input_model]['level'],
90 | lowercase=self.models[input_model]['lowercase'],
91 | max_output_length=self.models[input_model]['max_output_length'],
92 | use_cuda=self.models[input_model]['use_cuda'],
93 | )
94 |
95 | trans = Translation(src_lang=data['src_lang'],
96 | tgt_lang=data['tgt_lang'],
97 | input=data['input'],
98 | output=translation_result)
99 |
100 | return trans.data, HTTPStatus.CREATED
101 |
102 | def get(self):
103 | """GET Method to list available models in memory
104 | ---
105 |
106 | Returns a json list, ie
107 | ```json
108 | [
109 | {
110 | "type": "source",
111 | "name": "src_lang_full",
112 | "value": "src_lang_short",
113 | "targets": [
114 | {
115 | "name": "tgt_lang_full",
116 | "value": "tgt_lang_short"
117 | }
118 | ]
119 | }
120 | ]
121 | ```
122 | """
123 |
124 | dict_output = defaultdict(lambda: [])
125 | #for each src-tgt key in model dict
126 | for couple in list(self.models.keys()):
127 | src, tgt = couple.split("-")
128 | dict_output[src].append(
129 | {
130 | 'name': self.languages_short_to_full[tgt].capitalize(),
131 | 'value': tgt
132 | }
133 | )
134 |
135 | output = []
136 | for source in dict_output:
137 | output.append(
138 | {
139 | "type": "source",
140 | "name": self.languages_short_to_full[source].capitalize(),
141 | "value": source,
142 | 'targets': dict_output[source]
143 | }
144 | )
145 |
146 | return output, HTTPStatus.OK
147 |
148 |
class AddResource(Resource):
    """ AddResource
    -----------------
    #### User-Defined Flask API Resource accepting GET\n
    GET - Updates the models based on the model info stored in the Language table
    """
    def __init__(self, saved_models):
        self.models = saved_models
        # Path to available_models.tsv which has all the github & google drive
        # links that store the model files.
        self.selected_models_file = os.environ.get('MODEL_ALL_FILE',
                                                   "./available_models.tsv")

    def get(self):
        """GET Method to update the available models
        ---
        Returns a json Object, ie
        ```json
        {
            "message": "Models updated"
        }
        ```
        """
        # consistency: reuse the path resolved once in __init__
        model_loader = MasakhaneModelLoader(
            available_models_file=self.selected_models_file)
        db_pairs = []
        model_directory = Path.cwd().joinpath('models', 'joeynmt')
        # BUG FIX: iterdir() yields Path objects, so the original
        # `name_tag not in downloaded_models` string comparison never matched
        # and every model was re-downloaded; compare directory *names*.
        # Also tolerate the directory not existing yet on first run.
        downloaded_models = ([entry.name for entry in model_directory.iterdir()]
                             if model_directory.exists() else [])
        # loads model info from the Language table
        for lan in Language.query.all():
            language_pair = lan.to_json()
            src_language = language_pair['source']
            tgt_language = language_pair['target']
            domain = language_pair['domain']
            db_pair = f"{src_language}-{tgt_language}"
            # check if the model is not already loaded in memory
            if db_pair not in self.models:
                name_tag = f"{src_language}-{tgt_language}-{domain}"
                # check if the model is not already downloaded
                if name_tag not in downloaded_models:
                    print("Downloading model for " + name_tag)
                    model_loader.download_model(src_language, tgt_language, domain)
                # Attempts to load model and store in self.models
                self.models[db_pair] = model_loader.load_model(
                    src_language, tgt_language, domain)
                print(f"db_pair : {db_pair} \n now : {list(self.models.keys())}")

            # keep all the pairs in the db
            db_pairs.append(db_pair)

        # Remove models from memory that are not listed in the DB Language table
        for pair in list(self.models.keys()):
            if pair not in db_pairs:
                del self.models[pair]

        return {'message': "Models updated"}, HTTPStatus.OK
203 |
204 |
class SaveResource(Resource):
    """ SaveResource
    ------------
    #### User-Defined Flask API Resource accepting POST\n
    POST - saves feedback/correction information into the Feedback database
    """
    def __init__(self):
        super().__init__()

    def post(self):
        """POST Method to save feedback into the DB Feedback table
        ---
        ### Request Body
        (doc fix: the previous docstring listed fields — review/stars/token —
        that this handler never read; these are the keys actually consumed)
        ```json
        {
            "src_lang": "src_lang_full",
            "tgt_lang": "tgt_lang_full",
            "text": "input_text",
            "translation": "model_translation",
            "own_translation": "user_correction",
            "accurate_translation": "bool",
            "understand_translation": "bool",
            "know_src_lang": "bool",
            "know_tgt_lang": "bool",
            "feedbackToken": "user_auth_token"
        }
        ```
        ### Returns
        ```json
        {
            "message": "Review saved"
        }
        ```
        """

        data = request.get_json()

        # persist the feedback row; Feedback is defined in core/models/feedback.py
        feedback = Feedback(
            src_lang=data['src_lang'],
            tgt_lang=data['tgt_lang'],
            accurate_translation=data['accurate_translation'],
            know_src_lang=data['know_src_lang'],
            know_tgt_lang=data['know_tgt_lang'],
            own_translation=data['own_translation'],
            text=data['text'],
            translation=data['translation'],
            understand_translation=data['understand_translation'],
            feedbackToken=data['feedbackToken'])

        feedback.save()

        return {'message': "Review saved"}, HTTPStatus.CREATED
252 |
253 |
class HomeResource(Resource):
    """ HomeResource
    ------------
    User-Defined Flask API Resource accepting GET\n
    GET - returns {'message': "welcome Masakhane Web"}
    """
    def __init__(self):
        super().__init__()

    def get(self):
        # Simple welcome/liveness endpoint.
        payload = {'message': "welcome Masakhane Web"}
        return payload, HTTPStatus.OK
265 |
--------------------------------------------------------------------------------
/src/torchserve/transformer_handler.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import json
3 | import logging
4 | import os
5 | from abc import ABC
6 |
7 | import torch
8 | import transformers
9 | from captum.attr import LayerIntegratedGradients
10 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
11 |
12 | from ts.torch_handler.base_handler import BaseHandler
13 |
14 | logger = logging.getLogger(__name__)
15 | logger.info("Transformers version %s", transformers.__version__)
16 |
17 |
class M2MTranslatorHandler(BaseHandler, ABC):
    """
    Transformer handler for the machine translation task using the m2m_100 model.

    TorchServe lifecycle: initialize() runs once per worker, then each request
    batch flows through preprocess() -> inference() -> postprocess().
    """

    # Tokenizer pad/truncate length for incoming requests.
    # BUG FIX: preprocess() previously referenced an undefined `max_length`
    # name, raising a NameError on every request.
    MAX_INPUT_LENGTH = 128

    def __init__(self):
        super(M2MTranslatorHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """Load the tokenizer and seq2seq model from the model archive.

        Args:
            ctx (context): It is a JSON Object containing information
            pertaining to the model artefacts parameters.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        # kept for parity with other handlers; not used directly below
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
        # BUG FIX: inputs are moved to self.device in preprocess() but the
        # model was left on CPU, crashing generate() on GPU workers.
        self.model.to(self.device)
        self.model.eval()

        logger.info("Transformer model from path %s loaded successfully", model_dir)

        self.initialized = True

    def preprocess(self, requests):
        """Tokenize the incoming request batch.

        Args:
            requests (list): request dicts carrying the text under "data"
                or "body" (str or utf-8 bytes).
        Returns:
            tuple: (input_ids_batch, attention_mask_batch) tensors on
            self.device, or (None, None) for an empty batch.
        """
        input_ids_batch = None
        attention_mask_batch = None
        for data in requests:
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode("utf-8")
            logger.info("Received text: '%s'", input_text)

            inputs = self.tokenizer.encode_plus(
                input_text,
                max_length=self.MAX_INPUT_LENGTH,
                # `pad_to_max_length` is deprecated in transformers;
                # padding + truncation is the supported equivalent.
                padding="max_length",
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            # making a batch out of the received requests
            # attention masks are passed for cases where input tokens are padded.
            if input_ids.shape is not None:
                if input_ids_batch is None:
                    input_ids_batch = input_ids
                    attention_mask_batch = attention_mask
                else:
                    input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
                    attention_mask_batch = torch.cat(
                        (attention_mask_batch, attention_mask), 0
                    )
        return (input_ids_batch, attention_mask_batch)

    def inference(self, input_batch):
        """Generate translations for the tokenized batch.

        Args:
            input_batch (tuple): (input_ids_batch, attention_mask_batch)
                produced by preprocess().
        Returns:
            list: one decoded string per input in the batch.
        """
        input_ids_batch, _attention_mask_batch = input_batch
        # TODO: switch to a generation configuration instead of the
        # hard-coded sampling parameters below.
        outputs = self.model.generate(
            input_ids_batch, max_length=50, do_sample=True, top_p=0.95, top_k=60
        )
        inferences = [
            self.tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]
        # BUG FIX: a second call `logger.info("Generated text", inferences)`
        # passed a lazy %-arg without a placeholder, producing a logging
        # error on every request; log once, with a placeholder.
        logger.info("Generated text: '%s'", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Pass the decoded translations through unchanged.

        Args:
            inference_output (list): decoded strings from inference().
        Returns:
            (list): the same list, already TorchServe-serializable.
        """
        return inference_output

    def get_insights(self, input_batch, text, target):
        """Initialize and call Layer Integrated Gradients to get word
        importance of the input text if captum explanation has been selected
        through setup_config.

        NOTE(review): self.setup_config is never assigned in initialize();
        presumably BaseHandler provides it — confirm before relying on this
        code path.

        Args:
            input_batch (int): Batches of token IDs of text
            text (str): The Text specified in the input request
            target (int): The Target can be set to any acceptable label under
                the user's discretion.
        Returns:
            (list): Returns a list with one dict holding the word tokens.
        """

        if self.setup_config["captum_explanation"]:
            embedding_layer = getattr(self.model, self.setup_config["embedding_name"])
            embeddings = embedding_layer.embeddings
            self.lig = LayerIntegratedGradients(captum_sequence_forward, embeddings)
        else:
            logger.warning("Captum Explanation is not chosen and will not be available")

        if isinstance(text, (bytes, bytearray)):
            text = text.decode("utf-8")
        text_target = ast.literal_eval(text)

        if not self.setup_config["mode"] == "question_answering":
            text = text_target["text"]
        self.target = text_target["target"]

        input_ids, ref_input_ids, attention_mask = construct_input_ref(
            text, self.tokenizer, self.device, self.setup_config["mode"]
        )
        all_tokens = get_word_token(input_ids, self.tokenizer)
        response = {}
        response["words"] = all_tokens
        return [response]
158 |
159 |
def construct_input_ref(text, tokenizer, device, mode):
    """Build token ids, baseline (reference) ids and an attention mask for a
    single text, using plain encode for speed with Captum insights.

    Args:
        text (str): The text specified in the input request
        tokenizer (AutoTokenizer Class Object): To word tokenize the input text
        device (cpu or gpu): Type of the Environment the server runs on.
    Returns:
        input_id(Tensor): tensor of the tokenized input wrapped in CLS/SEP
        ref_input_ids(Tensor): all-PAD baseline used for attributions
        attention mask(Tensor): binary tensor marking non-padded positions
            so the model does not attend to padding.
    """

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    logger.info("text_ids %s", token_ids)
    logger.info("[tokenizer.cls_token_id] %s", [tokenizer.cls_token_id])

    # wrap the raw ids in the special CLS ... SEP frame
    wrapped_ids = [tokenizer.cls_token_id, *token_ids, tokenizer.sep_token_id]
    logger.info("input_ids %s", wrapped_ids)
    input_ids = torch.tensor([wrapped_ids], device=device)

    # baseline: same frame, but every content token replaced by PAD
    baseline_ids = [tokenizer.cls_token_id] \
        + [tokenizer.pad_token_id] * len(token_ids) \
        + [tokenizer.sep_token_id]
    ref_input_ids = torch.tensor([baseline_ids], device=device)

    attention_mask = torch.ones_like(input_ids)
    return input_ids, ref_input_ids, attention_mask
192 |
193 |
def captum_sequence_forward(inputs, attention_mask=None, position=0, model=None):
    """Forward function for Captum: run *model* and select one output.

    Task-agnostic — works for any BERT-style task where the wanted
    prediction sits at index *position* of the model output.

    Args:
        inputs (list): Input for Predictions
        attention_mask (list, optional): binary mask marking padded positions
            so the model does not attend to them; defaults to None.
        position (int, optional): which model output to return (task-dependent).
        model ([type], optional): the model to run; defaults to None.
    Returns:
        list: Prediction Outcome
    """
    model.eval()
    model.zero_grad()
    outputs = model(inputs, attention_mask=attention_mask)
    return outputs[position]
211 |
212 |
def summarize_attributions(attributions):
    """Summarise the attributions across multiple runs.

    Args:
        attributions ([list): attributions from the Layer Integrated Gradients
    Returns:
        list : the attributions collapsed over the last dim and L2-normalized.
    """
    collapsed = attributions.sum(dim=-1).squeeze(0)
    return collapsed / torch.norm(collapsed)
223 |
224 |
def get_word_token(input_ids, tokenizer):
    """Convert the first row of token ids back into word tokens using the
    model's tokenizer.

    Args:
        input_ids (list): Input IDs from construct_input_ref method
        tokenizer (class): The Auto Tokenizer Pre-Trained model object
    Returns:
        (list): Returns the word tokens
    """
    id_list = input_ids[0].detach().tolist()
    raw_tokens = tokenizer.convert_ids_to_tokens(id_list)
    # Strip the BPE tokenizer's unicode space marker from each token
    return [tok.replace("Ġ", "") for tok in raw_tokens]
239 |
--------------------------------------------------------------------------------
/src/server/core/model_load.py:
--------------------------------------------------------------------------------
1 | import os, yaml, logging, re
2 | # external imports
3 | import torch
4 | from joeynmt.helpers import load_config
5 | from subword_nmt import apply_bpe
6 | from subword_nmt import apply_bpe
7 | from sacremoses import MosesTokenizer, MosesDetokenizer
8 | from joeynmt.helpers import load_config, get_latest_checkpoint, \
9 | load_checkpoint
10 | from joeynmt.vocabulary import build_vocab
11 | from joeynmt.model import build_model
12 | from joeynmt.prediction import validate_on_data
13 | from urllib.request import urlopen
14 | from io import BytesIO
15 | from zipfile import ZipFile
16 |
17 | # internal imports
18 | from core.utils import load_line_as_data
19 |
class MasakhaneModelLoader():
    """User-defined class to manage the download and loading of Masakhane
    machine translation models (Joey NMT checkpoints)."""

    def __init__(self, available_models_file):
        # model directory to store the models
        self._model_dir_prefix = os.environ.get('MODEL',
                                                "./models/joeynmt/")
        self._src_language = ''
        # load available models into memory
        self.models = self.load_available_models(available_models_file)

    def load_available_models(self, available_models_file):
        """Load a dictionary with available models to download.

        The first TSV row is the header; each later row describes one model.
        Rows whose 'complete' column is not 'yes' are skipped. Keys have the
        form '<src_language>-<tgt_language>-<domain>'.
        """
        models = {}
        with open(available_models_file, 'r') as ofile:
            for i, line in enumerate(ofile):
                entries = line.strip().split("\t")
                # first row supplies the dict keys for all later rows
                if i == 0:
                    header_keys = list(entries)
                    continue

                model = dict(zip(header_keys, entries))
                # don't add incomplete models
                if model['complete'] != 'yes':
                    continue

                models[f"{model['src_language']}-{model['tgt_language']}-{model['domain']}"] = model

        print('Found {} Masakhane models.'.format(len(models)))

        return models

    def download_model(self, src_language, tgt_language, domain):
        """Download and unpack the model archive for the given pair/domain
        from Zenodo, then write an adjusted config.yaml next to it."""
        model_dir = f"{self._model_dir_prefix}{src_language}-{tgt_language}-{domain}"
        os.makedirs(model_dir, exist_ok=True)

        # raises KeyError when the pair is not a known downloadable model
        model_files = self.models[f"{src_language}-{tgt_language}-{domain}"]

        # Files expected after a successful download.
        ckpt_path = os.path.join(model_dir, 'model.ckpt')
        src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')
        trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
        config_path = os.path.join(model_dir, 'config_orig.yaml')
        src_bpe_path = os.path.join(model_dir, 'src.bpe.model')
        trg_bpe_path = os.path.join(model_dir, 'trg.bpe.model')
        required_files = [ckpt_path, src_vocab_path, trg_vocab_path,
                          config_path, src_bpe_path, trg_bpe_path]

        # BUG FIX: the original condition was `not os.path.exists in [...]`
        # — a membership test on the function object, always True — so the
        # archive was re-downloaded on every call. Download only when at
        # least one required file is actually missing.
        if not all(os.path.exists(path) for path in required_files):
            URL = "https://zenodo.org/record/7636723/files/" + \
                src_language + "-" + tgt_language
            if domain == "":
                URL += "-baseline.zip?download=1"
            else:
                URL += "-" + domain + "-baseline.zip?download=1"

            http_response = urlopen(URL)
            zipfile = ZipFile(BytesIO(http_response.read()))
            zipfile.extractall(path=model_dir)

            # Rename config file to config_orig.yaml.
            os.rename(os.path.join(model_dir, 'config.yaml'), config_path)

        # Adjust config paths to point at the local model directory.
        config = load_config(config_path)
        new_config_file = os.path.join(model_dir, 'config.yaml')
        config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                     model_dir, ckpt_path)
        with open(new_config_file, 'w') as cfile:
            yaml.dump(config, cfile)

        print('Downloaded model for {}-{}.'.format(src_language, tgt_language))

    def load_model(self, src_language, tgt_language, domain, bpe_src_code=None, tokenize=None):
        """Load the Joey NMT model for the given pair/domain into memory.

        :param bpe_src_code: optional BPE merge file for source segmentation.
        :param tokenize: when not None, apply Moses (de)tokenization.
        :return: dict with the model, vocabularies, pre/post-processing
            pipelines and decoding parameters.
        """
        model_dir = f"{self._model_dir_prefix}{src_language}-{tgt_language}-{domain}"

        ckpt_path = os.path.join(model_dir, 'model.ckpt')
        src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')
        trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
        config_path = os.path.join(model_dir, 'config_orig.yaml')

        # Adjust config.
        config = load_config(config_path)
        new_config_file = os.path.join(model_dir, 'config.yaml')
        config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                     model_dir, ckpt_path)
        with open(new_config_file, 'w') as cfile:
            yaml.dump(config, cfile)

        print('Loaded model for {}-{}.'.format(src_language, tgt_language))

        conf = {}
        conf["logger"] = logging.getLogger(__name__)

        # load the Joey configuration
        cfg = load_config(new_config_file)

        # load the checkpoint
        if "load_model" in cfg['training'].keys():
            ckpt = cfg['training']["load_model"]
        else:
            ckpt = get_latest_checkpoint(model_dir)
            if ckpt is None:
                raise FileNotFoundError("No checkpoint found in directory {}."
                                        .format(model_dir))

        # prediction parameters from config; never request CUDA on a CPU host
        conf["use_cuda"] = cfg["training"].get(
            "use_cuda", False) if torch.cuda.is_available() else False

        conf["level"] = cfg["data"]["level"]
        conf["max_output_length"] = cfg["training"].get(
            "max_output_length", None)
        conf["lowercase"] = cfg["data"].get("lowercase", False)

        # load the vocabularies
        src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
        trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"

        conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                        dataset=None, max_size=-1, min_freq=0)
        conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                        dataset=None, max_size=-1, min_freq=0)

        # whether to use beam search for decoding, 0: greedy decoding
        if "testing" in cfg.keys():
            conf["beam_size"] = cfg["testing"].get("beam_size", 0)
            conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
        else:
            conf["beam_size"] = 1
            conf["beam_alpha"] = -1

        # pre-processing
        if tokenize is not None:
            src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
            trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
            # tokenize input
            def tokenizer(x): return src_tokenizer.tokenize(x, return_str=True)
            def detokenizer(x): return trg_tokenizer.detokenize(
                x.split(), return_str=True)
        else:
            def tokenizer(x): return x
            def detokenizer(x): return x

        # BUG FIX: the original tested the undefined name `level` here, which
        # raised a NameError whenever bpe_src_code was supplied; the
        # segmentation level lives in conf["level"]. Also close the merge
        # file instead of leaking the handle.
        if bpe_src_code is not None and conf["level"] == "bpe":
            # load bpe merge file
            with open(bpe_src_code, "r") as merge_file:
                bpe = apply_bpe.BPE(codes=merge_file)
            def segmenter(x): return bpe.process_line(x.strip())
        elif conf["level"] == "char":
            # split to chars
            def segmenter(x): return list(x.strip())
        else:
            def segmenter(x): return x.strip()

        conf["preprocess"] = [tokenizer, segmenter]
        conf["postprocess"] = [detokenizer]
        # build model and load parameters into it
        model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
        model = build_model(
            cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
        model.load_state_dict(model_checkpoint["model_state"])
        if conf["use_cuda"]:
            model.cuda()
        conf["model"] = model
        print("Joey NMT model loaded successfully.")

        return conf

    def _update_config(self, config, new_src_vocab_path, new_trg_vocab_path,
                       new_model_dir, new_ckpt_path):
        """Overwrite the path settings in the given config and return it."""
        config['data']['src_vocab'] = new_src_vocab_path
        if config['model'].get('tied_embeddings', False):
            # tied embeddings share a single vocabulary for both sides
            config['data']['trg_vocab'] = new_src_vocab_path
        else:
            config['data']['trg_vocab'] = new_trg_vocab_path
        config['training']['model_dir'] = new_model_dir
        config['training']['load_model'] = new_ckpt_path
        return config

    def _is_lowercase(self, src_vocab_path):
        """Infer whether the model is built on lowercased data by scanning
        the source vocabulary for any uppercase entry."""
        with open(src_vocab_path, 'r') as ofile:
            for line in ofile:
                if line != line.lower():
                    return False
        return True

    # Doesn't look like the functions below are ever called...

    def _download_gdrive_file(self, file_id, destination):
        """Download a file from Google Drive and store in local file."""
        download_link = 'https://drive.google.com/uc?id={}'.format(file_id)
        os.system(f'gdown -q -O {destination} {download_link}')

    def _download_github_file(self, github_raw_path, destination):
        """Download a file from GitHub."""
        os.system(f'wget -q -O {destination} {github_raw_path}')

    def _download(self, url, destination):
        """Download file from Github or Googledrive."""
        try:
            if 'drive.google.com' in url:
                if url.startswith('https://drive.google.com/file'):
                    file_id = url.split("/")[-1]
                elif url.startswith('https://drive.google.com/open?'):
                    file_id = url.split('id=')[-1]
                self._download_gdrive_file(file_id, destination)
            else:
                self._download_github_file(url, destination)
        except Exception:
            # best-effort: report and continue (was a bare `except:`)
            print("Download failed, didn't recognize url {}.".format(url))
240 |
241 |
--------------------------------------------------------------------------------
/src/server/core/models/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import ipdb
3 | import logging
4 | import re
5 |
6 | import pandas as pd
7 | from subword_nmt import apply_bpe
8 | from polyglot.text import Text
9 | from flask import current_app
10 | from subword_nmt import apply_bpe
11 | from sacremoses import MosesTokenizer, MosesDetokenizer
12 | from core.utils import load_line_as_data
13 | from joeynmt.helpers import load_config, get_latest_checkpoint, \
14 | load_checkpoint
15 | from joeynmt.vocabulary import build_vocab
16 | from joeynmt.model import build_model
17 | from joeynmt.prediction import validate_on_data
18 |
19 |
20 |
21 |
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return: dict with the model, vocabularies, pre/post-processing
        pipelines and decoding parameters.
    """
    conf = {}
    cfg_file = model_dir + "/config.yaml"

    conf["logger"] = logging.getLogger(__name__)
    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"

    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    # BUG FIX: the original compared the undefined name `level` here, which
    # raised a NameError whenever bpe_src_code was supplied; the segmentation
    # level lives in conf["level"]. Also close the merge file after use.
    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        with open(bpe_src_code, "r") as merge_file:
            bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])

    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
106 |
107 |
108 | class Predicter():
109 | # def __init__(self):
110 | # pass
111 |
112 | def translate(self, message_text, model, src_vocab, trg_vocab, preprocess, postprocess,
113 | logger, beam_size, beam_alpha, level, lowercase,
114 | max_output_length, use_cuda):
115 | """
116 | Describes how to translate a text message.
117 |
118 | :param message_text: Slack command, could be text.
119 | :param model: The Joey NMT model.
120 | :param src_vocab: Source vocabulary.
121 | :param trg_vocab: Target vocabulary.
122 | :param preprocess: Preprocessing pipeline (a list).
123 | :param postprocess: Postprocessing pipeline (a list).
124 | :param beam_size: Beam size for decoding.
125 | :param beam_alpha: Beam alpha for decoding.
126 | :param level: Segmentation level.
127 | :param lowercase: Lowercasing.
128 | :param max_output_length: Maximum output length.
129 | :param use_cuda: Using CUDA or not.
130 | :return:
131 | """
132 | # ipdb.set_trace()
133 | sentence = message_text.strip()
134 | # remove emojis
135 | emoji_pattern = re.compile("\:[a-zA-Z]+\:")
136 | sentence = re.sub(emoji_pattern, "", sentence)
137 | sentence = sentence.strip()
138 | if lowercase:
139 | sentence = sentence.lower()
140 | for p in preprocess:
141 | sentence = p(sentence)
142 |
143 | # load the data which consists only of this sentence
144 | test_data, src_vocab, trg_vocab = load_line_as_data(lowercase=lowercase,
145 | line=sentence, src_vocab=src_vocab, trg_vocab=trg_vocab, level=level)
146 |
147 | # generate outputs
148 | score, loss, ppl, sources, sources_raw, references, hypotheses, \
149 | hypotheses_raw, attention_scores = validate_on_data(
150 | model, data=test_data, batch_size=1, level=level,
151 | max_output_length=max_output_length, eval_metric=None,
152 | use_cuda=use_cuda, beam_size=beam_size,
153 | beam_alpha=beam_alpha, n_gpu=0)
154 |
155 | # validate_on_data(model: Model, data: Dataset,
156 | # batch_size: int,
157 | # use_cuda: bool, max_output_length: int,
158 | # level: str, eval_metric: Optional[str],
159 | # n_gpu: int,
160 | # batch_class: Batch = Batch,
161 | # compute_loss: bool = False,
162 | # beam_size: int = 1, beam_alpha: int = -1,
163 | # batch_type: str = "sentence",
164 | # postprocess: bool = True,
165 | # bpe_type: str = "subword-nmt",
166 | # sacrebleu: dict = None) \
167 |
168 | # post-process
169 | if level == "char":
170 | response = "".join(hypotheses)
171 | else:
172 | response = " ".join(hypotheses)
173 |
174 | for p in postprocess:
175 | response = p(response)
176 |
177 | return response
178 |
179 |
def predict_translation(self, source, model_dir, lc):
    """Translate ``source`` with the Joey NMT model stored in ``model_dir``.

    Joey NMT's CLI reads from files, so the input is staged in the app's
    TEMP directory, the model is run as a subprocess, and the detokenized
    first output line is returned.

    :param source: User-supplied text to translate (untrusted input).
    :param model_dir: Directory containing the model's ``config.yaml``.
    :param lc: Lowercasing flag, forwarded to ``post_process`` so casing
        of the first character can be restored.
    :return: The first translated line, or ``""`` if no output was produced.
    """
    # Local imports keep this change self-contained; the module's top-level
    # import block is outside this view.
    import subprocess
    import sys

    new_config_path = os.path.join(model_dir, 'config.yaml')

    # NOTE(review): TEMP is used below as a raw string prefix, so it is
    # expected to end with a path separator — confirm in the app config.
    path_to_temp = current_app.config['TEMP']
    if not os.path.exists(path_to_temp):
        os.mkdir(path_to_temp)

    src_input_file = 'src_input.bpe.txt'

    # Write the input ourselves instead of `echo {source} > file`:
    # interpolating user text into a shell command is a shell-injection
    # vector and breaks on quotes, backticks and newlines.
    with open(path_to_temp + 'input.tsv', 'w') as tsv_file:
        tsv_file.write(source + '\n')

    # Strip BPE markers ("@@ ") in Python instead of shelling out to sed.
    # Like the original sed pipeline, a missing BPE file yields an empty
    # src_input.txt rather than an error.
    bpe_path = path_to_temp + src_input_file
    bpe_text = ''
    if os.path.exists(bpe_path):
        with open(bpe_path) as bpe_file:
            bpe_text = bpe_file.read().replace('@@ ', '')
    with open(path_to_temp + 'src_input.txt', 'w') as src_file:
        src_file.write(bpe_text)

    # Run Joey NMT without a shell; redirections become real file handles.
    with open(path_to_temp + 'src_input.txt') as fin, \
            open(path_to_temp + 'trg_output_file', 'w') as fout:
        subprocess.run(
            [sys.executable, '-m', 'joeynmt', 'translate', new_config_path],
            stdin=fin, stdout=fout)

    targets = post_process(path_to_temp + 'trg_output_file', lc)
    return targets[0] if targets else ""
226 |
class SourceData():
    """Loads a TSV of source sentences and prepares them for Joey NMT.

    On construction the sentences are tokenized with Polyglot, optionally
    lowercased, split into BPE sub-words, and the BPE'd lines are written
    to ``out_file``.
    """

    def __init__(self, data_link, lc, bpe_path, out_file):
        # Single 'source' column, tab-separated, no header row.
        self._src_df = pd.read_csv(data_link, sep='\t', header=None,
                                   names=['source'])
        print("Loaded {} lines.".format(len(self._src_df)))
        self._bpe_model = self.load_bpe(bpe_path)
        self._src_df, self._sources = self.preprocess(out_file, lc)
        self.lc = lc

    def get_df(self):
        """Return the dataframe enriched with preprocessing columns."""
        return self._src_df

    def get_sources(self):
        """Return the raw (untokenized) source sentences."""
        return self._sources

    def preprocess(self, out_file, lc):
        """Tokenize, (lowercase,) sub-word split.

        Using Polyglot since it was used for JW300.
        Preprocess the source column of a dataframe object and write to file.

        Pipeline:
            - tokenize
            - split into sub-words

        Append pre-processed sources to dataframe.
        """
        tokenized_column = []
        bped_column = []
        raw_sources = []
        with open(out_file, 'w') as ofile:
            for _, row in self._src_df.iterrows():
                # Only the first sentence of each row is kept.
                first_sentence = Text(row[0]).sentences[0]
                raw_sources.append(str(first_sentence))
                tokens = " ".join(first_sentence.words)
                if lc:
                    tokens = tokens.lower()
                subworded = self._bpe_model.process_line(tokens)
                ofile.write("{}\n".format(subworded))
                tokenized_column.append(tokens)
                bped_column.append(subworded)
        enriched = self._src_df.assign(
            tokenized_sentences=tokenized_column)
        enriched = enriched.assign(
            bped_sentences=bped_column)
        return enriched, raw_sources

    def load_bpe(self, bpe_path):
        """Build a subword-nmt BPE model from a codes file."""
        with open(bpe_path, 'r') as codes_file:
            return apply_bpe.BPE(codes=codes_file)
281 |
# Post-processing
def post_process(output_file, lc):
    """Load and detokenize translations.

    There is no given Polyglot detokenizer, so we do it by heuristics.

    :param output_file: Path to a file with one translation per line.
    :param lc: True when the pipeline lowercased; restores the casing of
        the first character of each line.
    :return: List of detokenized translation strings, one per input line.
    """
    targets = []
    with open(output_file, 'r') as ofile:
        for line in ofile:
            sent = line.strip()
            # Remove whitespace before punctuation. The original class
            # `[?.!"-,:’]` contained an accidental `"-,` character range
            # (also matching # $ % & ' ( ) * +); only the intended
            # punctuation characters are listed now.
            # NOTE(review): the original also had a no-op
            # `sent.replace('', '')` — a character was likely lost in a
            # copy; restore from version control if a token (e.g. a BPE
            # marker) was meant to be stripped here.
            sent = re.sub(r'\s+([?.!",:’])', r'\1', sent)
            sent = sent.replace('( ', '(').replace(' - ', '-').replace(' / ', '/').replace(' /', '/')
            if lc and sent:
                # Cheap casing restoration... only first character but better than nothing.
                # The `and sent` guard prevents IndexError on empty lines.
                sent = sent[0].upper() + sent[1:]
            targets.append(sent)
    return targets
--------------------------------------------------------------------------------
/src/client/src/components/translateCard.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import { useState, useLayoutEffect,useRef, useEffect} from 'react';
3 | import { Container, Row, Col, Form, Button, Modal, Toast, OverlayTrigger, Tooltip } from 'react-bootstrap';
4 | import {CopyToClipboard} from 'react-copy-to-clipboard';
5 |
6 | import MultiStepForm from './multiStepForm';
7 |
8 | const MIN_TEXTAREA_HEIGHT = 200;
9 |
// Translation card: language pickers, auto-resizing input/output textareas,
// copy-to-clipboard toast, and a multi-step feedback modal.
export default function TranslateCard() {
  // Source text typed by the user, and the translation shown back.
  const [input, setText] = useState("");
  const [translation, setTranslation] = useState('...');
  // Language lists fetched from the backend on mount (see useEffect below);
  // each source-language entry carries its own `targets` array.
  const [srcLanguages, setSrcLanguages] = useState([]);
  const [tgtLanguages, setTgtLanguages] = useState([]);
  // Feedback-modal visibility.
  const [show, setShow] = useState(false);
  const [src_lang, setSrc_Lang] = useState('English');
  const [tgt_lang, setTgt_Lang] = useState('Swahili');
  const [feedBackForm, setFeedBackForm] = useState({});
  // Refs to the two auto-resizing textareas.
  const textareaRef = useRef(null);
  const textareaRef2= useRef(null);
  // Token persisted in localStorage to correlate feedback submissions.
  const [feedbackToken, setFeedbackToken] = useState(
    localStorage.getItem('feedbackToken') || ''
  );

  const [copySuccess, setCopySuccess] = useState('');
  // NOTE(review): initialized to '' but toggled with booleans below —
  // works via truthiness, but useState(false) would be clearer.
  const [showToast, setShowToast] = useState('');

  const handleClose = () => setShow(false);
  const handleShow = () => setShow(true);

  // Shows the "copied" toast; the actual copying is performed by the
  // <CopyToClipboard> wrapper, not here.
  const copyToClipboard = () => {
    setCopySuccess('Translation Copied!');
    setShowToast(true);
  };

  const handleChangeSrc_Lang= (e) => {
    // Persist the chosen source language across sessions.
    const name = e.target.value
    localStorage.setItem('src_lang', name);

    //set state
    setSrc_Lang(name);
    // Swap in the target list paired with this source language and
    // default the target to its first entry.
    const target = srcLanguages.filter(x => x.name === name)
    const target_languages = target[0].targets
    setTgtLanguages(target_languages)
    setTgt_Lang(target_languages[0].name)
  };

  const handleChangeTgt_Lang = (e) => {
    // Persist the chosen target language across sessions.
    localStorage.setItem('tgt_lang', e.target.value);

    //set state
    setTgt_Lang(e.target.value);

    // console.log(e.target.value)

  };

  // POST the current input + language pair to the backend and display
  // the returned `output` as the translation.
  const handleTranslate = (e) => {
    console.log('translating ..')
    console.log(src_lang)
    console.log(tgt_lang)
    e.preventDefault()

    fetch(
      '/translate',
      {
        method: 'post',
        // mode: 'no-cors',
        body: JSON.stringify({input, src_lang, tgt_lang}),
        headers: {
          'Content-Type': 'application/json'
        },
        // credentials: 'same-origin',
      })
      .then(res => res.json())
      .then(data => {
        console.log({ data })
        // do something here
        setTranslation(data.output)
      })
  };

  // POST the completed feedback form to the backend, then reset the card.
  const submitFeedBack = (formData) => {
    // first set state of feedback Form
    setFeedBackForm({...formData});
    // then submit feedback form to db here
    // here's where you write the function to push feedback to backend

    console.log({formData})

    fetch(
      '/save',
      {
        method: 'post',
        // mode: 'no-cors',
        body: JSON.stringify({
          src_lang: formData.src_lang,
          tgt_lang: formData.tgt_lang,
          accurate_translation: formData.accurate_translation,
          know_src_lang: formData.know_src_lang,
          know_tgt_lang: formData.know_tgt_lang,
          own_translation: formData.own_translation,
          text: formData.text,
          translation: formData.translation,
          understand_translation: formData.understand_translation,
          feedbackToken: formData.feedbackToken
        }),
        headers: {
          'Content-Type': 'application/json'
        },
        // credentials: 'same-origin',
      })
      .then(res => res.json())
      .then(data => {
        //console.log({data})
        // do something here
        handleClear()
      })

  }


  // Reset both panes to their initial state.
  const handleClear = () => {
    // clears text part
    setText('');
    // clear translation
    setTranslation('...');
  }

  // Auto-size the input textarea to its content, never below 200px.
  useLayoutEffect(() => {
    // Reset height - important to shrink on delete
    textareaRef.current.style.height = "inherit";
    // Set height
    textareaRef.current.style.height = `${Math.max(
      textareaRef.current.scrollHeight,
      MIN_TEXTAREA_HEIGHT
    )}px`;
  }, [input]);

  // Auto-size the second (translation) textarea the same way.
  // NOTE(review): this also depends on [input], not on `translation` —
  // confirm that is intended; the output pane won't resize when only
  // the translation changes.
  useLayoutEffect(() => {
    // Reset height - important to shrink on delete
    textareaRef2.current.style.height = "inherit";
    // Set height
    textareaRef2.current.style.height = `${Math.max(
      textareaRef2.current.scrollHeight,
      MIN_TEXTAREA_HEIGHT
    )}px`;
  }, [input]);

  // console.log({feedbackToken});
  // console.log({tgt_lang});

  // console.log({feedbackToken});

  // NOTE(review): declared but never used.
  let srcLang = [];
  let tgtLang = [];

  // On mount: ask the backend to refresh its language registry (/update),
  // then fetch the source-language list and seed both dropdowns.
  useEffect(()=> {
    // define fetch function
    let src = [];
    let tgt = [];
    const fetchLanguages = async ()=> {
      await fetch(
        '/update',
        {
          method: 'get',
          headers: {
            'Content-Type': 'application/json'
          },
        })
      await fetch(
        '/translate',
        {
          method: 'get',
          headers: {
            'Content-Type': 'application/json'
          },
          // credentials: 'same-origin',
        })
        .then(res => res.json())
        .then(data => {
          console.log({ data})
          // do something here
          setSrcLanguages(data)
          setTgtLanguages(data[0].targets)

        })


    }
    // call fetch function
    fetchLanguages()

  }, [])
  // console.log(srcLanguages)
  // console.log(tgtLanguages)
  // console.log(tgt_lang)

  // NOTE(review): the JSX below is corrupted in this copy of the file —
  // element tags are stripped and several line ranges are missing entirely.
  // Restore this render block from version control before editing it; it is
  // reproduced here exactly as found.
  return (



 
 
  GIVE FEEDBACK
  We appreciate your feedback and your contribution will help make our translation better.
 
 
 
 
 
 

 
 
 
 
 
  From:
 
  {
  srcLanguages && srcLanguages.map((option, index) => {
  return ({option.name} )
  })
  }
 
 
 
 
  {/*
 
  {
  srcLanguages.length > 1 && srcLanguages
  .filter(x => x.value !== src_lang)
  .slice(0, 2)
  .map((option, index) => {
  return (
  setSrc_Lang(option.name)}>{option.name} )
  })
  }
 
  */}
 
 
 
 
 
 
  To:
 
 
  {
  tgtLanguages.map((option, index) => {
  return ({option.name} )
  })
  }
 
 
 
 
 
  {/*
 
  {
  tgtLanguages.length > 1 && tgtLanguages
  .filter(x => x.value !== tgt_lang)
  .slice(0, 2)
  .map((option, index) => {
  return (
  setTgt_Lang(option.name)}>{option.name} )
  })
  }
 
  */}
 
 
 
 
 
  setText(e.target.value)}
  />
 
 
 
 
 
  Translate
 
 
  {' '}
 
 
 
 
 
  setText(e.target.value)}
  // autoFocus={showToast}
  />
  {!translation && (
 
  Sorry, there’s no translation for that phrase.
 
  )}
 
 
 
 
 
  {/* Give Feedback on Translation */}
 
 
 
  Copy Translation .
 
  }
  >
 
 
 
 
 
 
 
 
 
 
  setShowToast(false)}
  show={showToast}
  delay={3000}
  autohide
  style={{
  position: 'absolute',
  bottom: 0,
  left: 0
  }}
  >
  {copySuccess}
 
 
 
  )
}
389 |
--------------------------------------------------------------------------------