├── .github ├── FUNDING.yml └── workflows │ ├── ci.yml │ └── pypi-publish.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── doccano_mini ├── __init__.py ├── cli.py ├── components.py ├── docs │ └── usage.md ├── examples │ ├── named_entity_recognition.json │ ├── paraphrase.json │ ├── question_answering.json │ ├── summarization.json │ ├── task_free.json │ └── text_classification.json ├── home.py ├── layout.py ├── models │ ├── entity.py │ └── stepper.py ├── pages │ ├── 01_Text_Classification.py │ ├── 02_Question_Answering.py │ ├── 03_Summarization.py │ ├── 04_Paraphrase.py │ ├── 05_Named_Entity_Recognition.py │ ├── 06_(Beta)_Evaluation.py │ └── 09_Task_Free.py ├── prompts.py ├── storages │ ├── entity.py │ ├── session_storage.py │ └── stepper.py └── utils.py ├── docs ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md └── images │ ├── annotation.gif │ ├── copy_and_paste.gif │ ├── download_config.jpg │ └── test_new_example.jpg ├── poetry.lock ├── pyproject.toml └── tests └── test_prompts.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: Hironsan 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: doccano-mini CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | backend: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: Set up Python 3.10 11 | uses: actions/setup-python@v2 12 | with: 13 | python-version: '3.10' 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install poetry 18 | poetry install 19 | - name: Lint with flake8 20 | run: | 21 | poetry run task flake8 22 | - name: Lint with isort 23 | run: | 24 | poetry run task isort 25 | - name: Black 26 | run: | 27 | poetry run task black 28 | - name: mypy 29 | run: | 30 | poetry run task mypy 31 | - name: pytest 32 | run: | 33 
| poetry run task test 34 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Setup Python 3.10 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.10' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install poetry poetry-dynamic-versioning 21 | poetry install 22 | - name: Copy README 23 | run: | 24 | cp README.md doccano_mini/docs/ 25 | - name: Build a binary wheel and a source tarball 26 | run: | 27 | poetry build 28 | - name: Publish a Python distribution to PyPI 29 | uses: pypa/gh-action-pypi-publish@master 30 | with: 31 | user: ${{ secrets.PYPI_USERNAME }} 32 | password: ${{ secrets.PYPI_PASSWORD }} 33 | packages_dir: ./dist/ 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### macOS template 3 | # General 4 | .DS_Store 5 | .AppleDouble 6 | .LSOverride 7 | .idea 8 | 9 | # Icon must end with two \r 10 | Icon 11 | 12 | # Thumbnails 13 | ._* 14 | 15 | # Files that might appear in the root of a volume 16 | .DocumentRevisions-V100 17 | .fseventsd 18 | .Spotlight-V100 19 | .TemporaryItems 20 | .Trashes 21 | .VolumeIcon.icns 22 | .com.apple.timemachine.donotpresent 23 | 24 | # Directories potentially created on remote AFP share 25 | .AppleDB 26 | .AppleDesktop 27 | Network Trash Folder 28 | Temporary Items 29 | .apdisk 30 | ### Python template 31 | # Byte-compiled / optimized / DLL files 32 | __pycache__/ 33 | *.py[cod] 34 | *$py.class 35 | 36 | # C extensions 37 | *.so 38 | 39 | # 
Distribution / packaging 40 | .Python 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | MANIFEST 57 | 58 | # PyInstaller 59 | # Usually these files are written by a python script from a template 60 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 61 | *.manifest 62 | *.spec 63 | 64 | # Installer logs 65 | pip-log.txt 66 | pip-delete-this-directory.txt 67 | 68 | # Unit test / coverage reports 69 | htmlcov/ 70 | .tox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | .hypothesis/ 78 | junitxml/ 79 | 80 | # Translations 81 | *.mo 82 | *.pot 83 | 84 | # Django stuff: 85 | *.log 86 | local_settings.py 87 | *.sqlite3 88 | staticfiles/ 89 | 90 | # Flask stuff: 91 | instance/ 92 | .webassets-cache 93 | 94 | # Scrapy stuff: 95 | .scrapy 96 | 97 | # Sphinx documentation 98 | docs/_build/ 99 | 100 | # PyBuilder 101 | target/ 102 | 103 | # Jupyter Notebook 104 | .ipynb_checkpoints 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # celery beat schedule file 110 | celerybeat-schedule 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | ### JetBrains template 137 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 138 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 139 | 140 | # User-specific stuff: 141 | .idea/**/workspace.xml 142 | .idea/**/tasks.xml 143 | .idea/dictionaries 144 | 145 | # Sensitive or high-churn files: 146 
| .idea/**/dataSources/ 147 | .idea/**/dataSources.ids 148 | .idea/**/dataSources.xml 149 | .idea/**/dataSources.local.xml 150 | .idea/**/sqlDataSources.xml 151 | .idea/**/dynamic.xml 152 | .idea/**/uiDesigner.xml 153 | 154 | # Gradle: 155 | .idea/**/gradle.xml 156 | .idea/**/libraries 157 | 158 | # CMake 159 | cmake-build-debug/ 160 | 161 | # Mongo Explorer plugin: 162 | .idea/**/mongoSettings.xml 163 | 164 | ## File-based project format: 165 | *.iws 166 | 167 | ## Plugin-specific files: 168 | 169 | # IntelliJ 170 | out/ 171 | 172 | # mpeltonen/sbt-idea plugin 173 | .idea_modules/ 174 | 175 | # JIRA plugin 176 | atlassian-ide-plugin.xml 177 | 178 | # Cursive Clojure plugin 179 | .idea/replstate.xml 180 | 181 | # Crashlytics plugin (for Android Studio and IntelliJ) 182 | com_crashlytics_export_strings.xml 183 | crashlytics.properties 184 | crashlytics-build.properties 185 | fabric.properties 186 | ### VirtualEnv template 187 | # Virtualenv 188 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 189 | [Bb]in 190 | [Ii]nclude 191 | [Ll]ib 192 | [Ll]ib64 193 | [Ll]ocal 194 | [Ss]cripts 195 | pyvenv.cfg 196 | pip-selfcheck.json 197 | 198 | # ignore webpack state 199 | node_modules/ 200 | bundle/ 201 | webpack-stats.json 202 | 203 | .vscode 204 | config.yaml 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Hiroki Nakayama. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | poetry run task flake8 3 | poetry run task black 4 | poetry run task isort 5 | poetry run task mypy 6 | 7 | test: 8 | poetry run task test 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # doccano-mini 2 | 3 | doccano-mini is a few-shot annotation tool to assist the development of applications with Large language models (LLMs). Once you annotate a few text, you can solve your task (e.g. text classification) with LLMs via [LangChain](https://github.com/hwchase17/langchain). 
4 | 5 | At this time, the following tasks are supported: 6 | 7 | - Text classification 8 | - Question answering 9 | - Summarization 10 | - Paraphrasing 11 | - Named Entity Recognition 12 | - Task Free 13 | 14 | Note: This is an experimental project. 15 | 16 | ## Installation 17 | 18 | ```bash 19 | pip install doccano-mini 20 | ``` 21 | 22 | ## Usage 23 | 24 | For this example, we will be using OpenAI’s APIs, so we need to set the environment variable in the terminal. 25 | 26 | ```bash 27 | export OPENAI_API_KEY="..." 28 | ``` 29 | 30 | Then, we can run the server. 31 | 32 | ```bash 33 | doccano-mini 34 | ``` 35 | 36 | Now, we can open the browser and go to `http://localhost:8501/` to see the interface. 37 | 38 | ### Step1: Annotate a few text 39 | 40 | In this step, we will annotate a few text. We can add a new text by clicking the `+` button. Try it out by double-clicking on any cell. You'll notice you can edit all cell values. 41 | 42 | ![Step1](https://raw.githubusercontent.com/doccano/doccano-mini/master/docs/images/annotation.gif) 43 | 44 | The editor also supports pasting in tabular data from Google Sheets, Excel, and many other similar tools. 45 | 46 | ![Copy and Paste](https://raw.githubusercontent.com/doccano/doccano-mini/master/docs/images/copy_and_paste.gif) 47 | 48 | ### Step2: Test your task 49 | 50 | In this step, we will test your task. We can enter a new test to the text box and click the `Predict` button. Then, we can see the result of the test. 51 | 52 | “Step2” 53 | 54 | ### Step3: Download the config 55 | 56 | In this step, we will download the [LangChain](https://github.com/hwchase17/langchain)'s config. We can click the `Download` button to download it. After loading the config file, we can predict a label for the new text. 
def main():
    """Entry point of the ``doccano-mini`` console script.

    Launches the Streamlit server on the packaged ``home.py`` page, i.e. the
    equivalent of running ``streamlit run doccano_mini/home.py`` by hand.
    """
    # Resolve home.py relative to the installed package, not the CWD.
    filepath = str(Path(__file__).parent.resolve() / "home.py")
    # stcli.main() reads sys.argv like the real Streamlit CLI, so we fake the
    # command line here; developmentMode is turned off for the packaged run.
    sys.argv = ["streamlit", "run", filepath, "--global.developmentMode=false"]
    sys.exit(stcli.main())


if __name__ == "__main__":
    main()
def openai_model_form() -> Optional[BaseLanguageModel]:
    """Render the OpenAI model-settings widgets and build the LLM.

    Returns a configured ``OpenAI`` instance, or ``None`` when no API key is
    available (neither typed into the form nor taken from ``OPENAI_API_KEY``).
    """
    # https://platform.openai.com/docs/models/gpt-3-5
    AVAILABLE_MODELS = (
        "gpt-3.5-turbo",
        "gpt-3.5-turbo-0301",
        "text-davinci-003",
        "text-davinci-002",
        "code-davinci-002",
    )
    # Pre-filled from the environment but rendered masked (type="password").
    api_key = st.text_input("API key", value=os.environ.get("OPENAI_API_KEY", ""), type="password")
    model_name = st.selectbox("Model", AVAILABLE_MODELS, index=2)  # default: text-davinci-003
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
    top_p = st.slider("Top-p", min_value=0.0, max_value=1.0, value=1.0, step=0.01)
    if not api_key:
        return None
    # NOTE(review): the gpt-3.5-turbo* entries are chat models but are passed to
    # the completion-style ``OpenAI`` wrapper — confirm they work via this class.
    return OpenAI(model_name=model_name, temperature=temperature, top_p=top_p, openai_api_key=api_key)  # type:ignore
"Amrozi accused his brother, whom he called \"the witness\", of deliberately distorting his evidence.", 4 | "paraphrase": "Referring to him as only \"the witness\", Amrozi accused his brother of deliberately distorting his evidence.'" 5 | }, 6 | { 7 | "text": "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.", 8 | "paraphrase": "Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998." 9 | } 10 | ] -------------------------------------------------------------------------------- /doccano_mini/examples/question_answering.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "context": "Google was founded by computer scientists Larry Page and Sergey Brin.", 4 | "question": "Who founded Google?", 5 | "answer": "Larry Page and Sergey Brin" 6 | } 7 | ] -------------------------------------------------------------------------------- /doccano_mini/examples/summarization.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "passage": "WASHINGTON (CNN) -- Vice President Dick Cheney will serve as acting president briefly Saturday while President Bush is anesthetized for a routine colonoscopy, White House spokesman Tony Snow said Friday. Bush is scheduled to have the medical procedure, expected to take about 2 1/2 hours, at the presidential retreat at Camp David, Maryland, Snow said. Bush's last colonoscopy was in June 2002, and no abnormalities were found, Snow said. The president's doctor had recommended a repeat procedure in about five years. The procedure will be supervised by Dr. Richard Tubb and conducted by a multidisciplinary team from the National Naval Medical Center in Bethesda, Maryland, Snow said. A colonoscopy is the most sensitive test for colon cancer, rectal cancer and polyps, small clumps of cells that can become cancerous, according to the Mayo Clinic. 
Small polyps may be removed during the procedure. Snow said that was the case when Bush had colonoscopies before becoming president. Snow himself is undergoing chemotherapy for cancer that began in his colon and spread to his liver. Snow told reporters he had a chemo session scheduled later Friday. Watch Snow talk about Bush's procedure and his own colon cancer » . \"The president wants to encourage everybody to use surveillance,\" Snow said. The American Cancer Society recommends that people without high-risk factors or symptoms begin getting screened for signs of colorectal cancer at age 50. E-mail to a friend .", 4 | "summary": "President Bush will have a routine colonoscopy Saturday. While he's anesthetized, his powers will be transferred to the vice president. Bush had last colonoscopy in 2002, which found no problems." 5 | }, 6 | { 7 | "passage": "LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in \"Harry Potter and the Order of the Phoenix\" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. \"I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar,\" he told an Australian interviewer earlier this month. \"I don't think I'll be particularly extravagant. \"The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs.\" At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film \"Hostel: Part II,\" currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. 
\"I'll definitely have some sort of party,\" he said in an interview. \"Hopefully none of you will be reading about it.\" Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. \"People are always looking to say 'kid star goes off the rails,'\" he told reporters last month. \"But I try very hard not to go that way because it would be too easy for them.\" His latest outing as the boy wizard in \"Harry Potter and the Order of the Phoenix\" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called \"My Boy Jack,\" about author Rudyard Kipling and his son, due for release later this year. He will also appear in \"December Boys,\" an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's \"Equus.\" Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: \"I just think I'm going to be more sort of fair game,\" he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.", 8 | "summary": "Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday . Young actor says he has no plans to fritter his cash away . Radcliffe's earnings from first five Potter films have been held in trust fund ." 
def main():
    """Render the doccano-mini landing page from the bundled README."""
    st.set_page_config(page_title="doccano-mini", page_icon=":memo:")

    # The packaged build ships the README under docs/; a development checkout
    # keeps it at the repository root instead.
    readme = Path(__file__).parent.resolve() / "docs" / "README.md"
    if not readme.exists():
        readme = Path(__file__).parent.parent.resolve() / "README.md"

    st.markdown(readme.read_text(encoding="utf-8"), unsafe_allow_html=True)


if __name__ == "__main__":
    main()
class BasePage(ABC):
    """Template for a doccano-mini task page.

    Subclasses supply the seed examples (``example_path``), how annotated
    examples become a few-shot prompt (``make_prompt``), and which widgets
    collect the test-time inputs (``prepare_inputs``); ``render`` drives the
    whole Streamlit page.
    """

    # Filename (under doccano_mini/examples/) holding the seed examples.
    example_path: str = ""

    def __init__(self, title: str) -> None:
        # Page title shown at the top of the Streamlit page.
        self.title = title

    @property
    def columns(self) -> List[str]:
        """Column names for the example editor; empty means the example file's own columns are used."""
        return []

    def load_examples(self, filename: str) -> pd.DataFrame:
        """Load the seed examples shipped with the package as a DataFrame."""
        filepath = Path(__file__).parent.resolve().joinpath("examples", filename)
        return pd.read_json(filepath)

    def make_examples(self, columns: List[str]) -> List[Dict]:
        """Show the editable example table and return the edited rows.

        ``columns`` is unused here; subclasses (e.g. the task-free page)
        override this method to honour user-defined columns.
        """
        df = self.load_examples(self.example_path)
        edited_df = st.experimental_data_editor(df, num_rows="dynamic", width=1000)
        examples = edited_df.to_dict(orient="records")
        return examples

    @abstractmethod
    def make_prompt(self, examples: List[Dict]) -> FewShotPromptTemplate:
        """Build the few-shot prompt from the annotated examples."""
        raise NotImplementedError()

    @abstractmethod
    def prepare_inputs(self, columns: List[str]) -> Dict:
        """Render input widgets and return the prompt's input variables."""
        raise NotImplementedError()

    def annotate(self, examples: List[Dict]) -> List[Dict]:
        """Hook for extra annotation UI (e.g. NER span labelling); identity by default."""
        return examples

    def render(self) -> None:
        """Render the full page: annotation table, prompt editor, test form, prediction."""
        st.title(self.title)
        st.header("Annotate your data")
        columns = self.columns
        examples = self.make_examples(columns)
        examples = self.annotate(examples)

        prompt = self.make_prompt(examples)
        prompt = task_instruction_editor(prompt)

        st.header("Test")
        col1, col2 = st.columns([3, 1])

        with col1:
            inputs = self.prepare_inputs(columns)

        with col2:
            llm = openai_model_form()

        with st.expander("See your prompt"):
            st.markdown(f"```\n{prompt.format(**inputs)}\n```")

        if llm is None:
            st.error("Enter your API key.")

        if st.button("Predict", disabled=llm is None):
            chain = LLMChain(llm=llm, prompt=prompt)  # type:ignore
            response = chain.run(**inputs)
            # NOTE(review): markdown hard line breaks need two trailing spaces
            # before "\n" — verify this literal was not whitespace-mangled.
            st.markdown(escape_markdown(response).replace("\n", " \n"))

            # Persist the chain so the download button below can serve it.
            chain.save("config.yaml")
            display_download_button()
        usage()
class Stepper:
    """Cursor over a fixed-size sequence with wrap-around navigation.

    Used by the annotation pages to walk Prev/Next through the examples.
    """

    def __init__(self, step: int = 0) -> None:
        # Current zero-based position.
        self._step = step

    @property
    def step(self) -> int:
        """Current zero-based position."""
        return self._step

    def fit(self, total: int) -> None:
        """Clamp the cursor into ``[0, total - 1]`` after the sequence shrinks.

        Bug fix: the old code set ``total - 1`` unconditionally, which left a
        negative index (-1) behind when ``total`` was 0; clamp to 0 instead.
        """
        if self._step >= total:
            self._step = max(total - 1, 0)

    def at(self, step: int, total: int) -> None:
        """Jump to ``step``.

        Raises:
            ValueError: if ``step`` is outside ``[0, total)``.
        """
        if step >= total:
            raise ValueError(f"step must be less than {total}")
        if step < 0:
            # Bug fix: the old message said "greater than 0" although 0 is a
            # valid position; the actual check rejects only negatives.
            raise ValueError("step must be greater than or equal to 0")
        self._step = step

    def increment(self, total: int) -> None:
        """Advance by one, wrapping back to 0 past the last position."""
        self._step += 1
        if self._step >= total:
            self._step = 0

    def decrement(self, total: int) -> None:
        """Go back by one, wrapping to ``total - 1`` before the first position."""
        self._step -= 1
        if self._step < 0:
            self._step = total - 1
class QuestionAnsweringPage(BasePage):
    """Few-shot question answering: each example pairs a context and a question with its answer."""

    # Seed examples bundled under doccano_mini/examples/.
    example_path = "question_answering.json"

    def make_prompt(self, examples: List[Dict]):
        """Build the few-shot QA prompt from the annotated rows."""
        return make_question_answering_prompt(examples)

    def prepare_inputs(self, columns: List[str]):
        """Collect test-time inputs; keys must match the prompt's input variables."""
        return {
            "context": st.text_area(label="Context.", value="", height=300),
            "question": st.text_input(label="Question.", value=""),
        }


page = QuestionAnsweringPage(title="Question Answering")
page.render()
class NamedEntityRecognitionPage(BasePage):
    """NER annotation page: label entity spans on each example text, one text at a time."""

    # Seed examples bundled under doccano_mini/examples/.
    example_path = "named_entity_recognition.json"

    def __init__(self, title: str) -> None:
        super().__init__(title)
        # Entity types defined by the user; filled in define_entity_types().
        # NOTE(review): define_entity_types assigns a numpy array (.values)
        # despite this List[str] hint — confirm downstream consumers cope.
        self.types: List[str] = []
        # Session-backed stores for per-text entity spans and the current step.
        self.entity_repository = EntitySessionStorage()
        self.stepper_repository = StepperSessionStorage()

    def define_entity_types(self):
        """Render the editable entity-type table and return the chosen types."""
        st.subheader("Define entity types")
        default_types = pd.DataFrame([{"type": entity_type} for entity_type in ["ORG", "LOC", "PER"]])
        edited_df = st.experimental_data_editor(default_types, num_rows="dynamic", width=1000)
        types = edited_df["type"].values
        self.types = types
        return types

    def annotate(self, examples: List[Dict]) -> List[Dict]:
        """Step through the examples one at a time and record labelled spans."""
        if len(examples) == 0:
            return []

        types = self.define_entity_types()
        selected_type = st.selectbox("Select an entity type", types)

        col1, col2, _ = st.columns([1, 1, 8])
        # Prev/Next move the stepper with wrap-around over the examples.
        col1.button("Prev", on_click=self.stepper_repository.decrement, args=(len(examples),))
        col2.button("Next", on_click=self.stepper_repository.increment, args=(len(examples),))

        # Re-clamp the step in case rows were deleted in the example editor.
        self.stepper_repository.fit(len(examples))
        step = self.stepper_repository.get_step()
        text = examples[step]["text"]
        entities = self.entity_repository.find_by_text(text)
        # key=text keeps one annotation-widget instance per distinct text.
        entities = st_ner_annotate(selected_type, text, entities, key=text)
        self.entity_repository.store_by_text(text, entities)
        return examples

    def make_prompt(self, examples: List[Dict]):
        """Attach each example's stored entity spans before building the prompt."""
        examples = [
            {**example, "entities": self.entity_repository.find_by_text(example["text"])} for example in examples
        ]
        return make_named_entity_recognition_prompt(examples, types=self.types)

    def prepare_inputs(self, columns: List[str]):
        """Single free-text input to run the NER prompt against."""
        return {"text": st.text_area(label="Please enter your text.", value="", height=300)}


page = NamedEntityRecognitionPage(title="Named Entity Recognition")
page.render()
dataset = load_dataset(dataset_id, split="train") 21 | # Splitting dataset 22 | dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="label", shuffle=True) 23 | 24 | # Preparing indices 25 | indices_by_label = defaultdict(list) 26 | for i, x in enumerate(dataset["train"]): 27 | indices_by_label[x["label"]].append(i) 28 | 29 | return dataset, list(interleave_longest(*indices_by_label.values())) 30 | 31 | 32 | st.title("Text Classification Evaluation on 🤗 datasets") 33 | 34 | st.header("Setup your data") 35 | 36 | dataset_id = st.selectbox("Select a dataset", options=AVAILABLE_DATASETS) 37 | 38 | dataset, train_indices = prepare_dataset(dataset_id) 39 | 40 | train_dataset = dataset["train"] 41 | validation_dataset = dataset["test"] 42 | 43 | label_info = train_dataset.features["label"] 44 | num_classes = label_info.num_classes 45 | few_shot_example_size = int( 46 | st.number_input("Number of examples", min_value=num_classes, max_value=num_classes * 5, value=num_classes) 47 | ) 48 | 49 | subset = [] 50 | for i in range(few_shot_example_size): 51 | example = train_dataset[train_indices[i]] 52 | subset.append({"text": example["text"], "label": label_info.int2str(example["label"])}) 53 | 54 | 55 | df = pd.DataFrame(subset) 56 | 57 | st.write(df) 58 | 59 | prompt = make_classification_prompt(df.to_dict("records")) 60 | prompt = task_instruction_editor(prompt) 61 | 62 | 63 | st.header("Test") 64 | col1, col2 = st.columns([3, 1]) 65 | 66 | with col1: 67 | inputs = {"input": st.text_area(label="Please enter your text.", value="", height=300)} 68 | 69 | with col2: 70 | llm = openai_model_form() 71 | 72 | with st.expander("See your prompt"): 73 | st.markdown(f"```\n{prompt.format(**inputs)}\n```") 74 | 75 | if llm is None: 76 | st.error("Enter your API key.") 77 | 78 | if st.button("Predict", disabled=llm is None): 79 | chain = LLMChain(llm=llm, prompt=prompt) # type:ignore 80 | response = chain.run(**inputs) 81 | 
st.markdown(escape_markdown(response).replace("\n", " \n")) 82 | 83 | st.subheader("Evaluation") 84 | 85 | evaluation_size = int(st.number_input("Number of examples", min_value=5, max_value=validation_dataset.dataset_size)) 86 | 87 | if llm is None: 88 | st.error("Enter your API key.") 89 | 90 | if st.button("Evaluate", disabled=llm is None): 91 | chain = LLMChain(llm=llm, prompt=prompt) # type:ignore 92 | y_true = [] 93 | y_pred = [] 94 | for i in range(evaluation_size): 95 | example = validation_dataset[i] 96 | response = chain.run(input=example["text"]) 97 | y_true.append(label_info.int2str(example["label"])) 98 | y_pred.append(response.split(":")[-1].strip()) 99 | 100 | st.text(classification_report(y_true, y_pred, digits=3)) 101 | -------------------------------------------------------------------------------- /doccano_mini/pages/09_Task_Free.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import streamlit as st 4 | 5 | from doccano_mini.layout import BasePage 6 | from doccano_mini.prompts import make_task_free_prompt 7 | 8 | 9 | class TaskFreePage(BasePage): 10 | @property 11 | def columns(self) -> List[str]: 12 | num_cols = st.number_input("Set the number of columns", min_value=2, max_value=10) 13 | columns = [st.text_input(f"Column {i + 1}:", value=f"column {i + 1}") for i in range(int(num_cols))] 14 | return columns 15 | 16 | def make_examples(self, columns: List[str]): 17 | df = self.load_examples("task_free.json") 18 | df = df.reindex(columns, axis="columns", fill_value="") 19 | edited_df = st.experimental_data_editor(df, num_rows="dynamic", width=1000) 20 | examples = edited_df.to_dict(orient="records") 21 | return examples 22 | 23 | def make_prompt(self, examples: List[Dict]): 24 | return make_task_free_prompt(examples) 25 | 26 | def prepare_inputs(self, columns: List[str]): 27 | return {column: st.text_area(label=f"Input for {column}:", value="", height=300) for column in 
def make_classification_prompt(examples: List[dict]) -> FewShotPromptTemplate:
    """Build a few-shot text-classification prompt from labeled examples.

    The instruction enumerates every distinct label in sorted order, so the
    prompt text is deterministic regardless of the order of *examples*.
    """
    labels = sorted({example["label"] for example in examples})
    header = ["Classify the text into one of the following labels:"]
    header.extend(f"- {label}" for label in labels)
    task_instruction = "\n".join(header) + "\n"

    example_prompt = PromptTemplate(input_variables=["text", "label"], template="text: {text}\nlabel: {label}")
    return FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="text: {input}",
        input_variables=["input"],
    )


def make_question_answering_prompt(examples: List[dict]) -> FewShotPromptTemplate:
    """Build a few-shot extractive question-answering prompt."""
    task_instruction = (
        "You are a highly intelligent question answering bot. "
        "You take context and question as input and return the answer from the context. "
        "Retain as much information as needed to answer the question at a later time. "
        "If you don't know the answer, you should return N/A."
    )
    example_prompt = PromptTemplate(
        input_variables=["context", "question", "answer"],
        template="context: {context}\nquestion: {question}\nanswer: {answer}",
    )
    return FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="context: {context}\nquestion: {question}",
        input_variables=["context", "question"],
    )


def make_summarization_prompt(examples: List[dict]) -> FewShotPromptTemplate:
    """Build a few-shot summarization prompt."""
    example_prompt = PromptTemplate(
        input_variables=["passage", "summary"], template="passage: {passage}\nsummary: {summary}"
    )
    task_instruction = (
        "You are a highly intelligent Summarization system. "
        "You take Passage as input and summarize the passage as an expert."
    )
    return FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="passage: {passage}",
        input_variables=["passage"],
    )


def make_paraphrase_prompt(examples: List[dict]) -> FewShotPromptTemplate:
    """Build a few-shot paraphrasing prompt."""
    example_prompt = PromptTemplate(
        input_variables=["text", "paraphrase"], template="text: {text}\nparaphrase: {paraphrase}"
    )
    task_instruction = (
        "You are a highly intelligent paraphrasing system. You take text as input and paraphrase it as an expert."
    )
    return FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="text: {text}",
        input_variables=["text"],
    )
def make_named_entity_recognition_prompt(examples: List[dict], **kwargs) -> FewShotPromptTemplate:
    """Build a few-shot named-entity-recognition prompt.

    Args:
        examples: dicts with a "text" string and an "entities" list of dicts
            carrying character offsets "start"/"end" and a "label".
        **kwargs: ``types`` — the list of allowed entity-type names to list
            in the instruction (defaults to an empty list).

    Returns:
        A FewShotPromptTemplate whose examples render entities as a JSON list
        of ``{"mention": ..., "type": ...}`` objects.
    """
    task_instruction = (
        "You are a highly intelligent and accurate Named-entity recognition(NER) system. "
        "You take Passage as input and your task is to recognize and extract specific types of "
        "named entities in that given passage and classify into a set of entity types.\n"
    )
    types = kwargs.get("types", [])
    task_instruction += "The following entity types are allowed:\n"
    # Renamed loop variable from `type` so the builtin is not shadowed.
    for entity_type in types:
        task_instruction += f"- {entity_type}\n"

    # Bug fix: the original wrote json.dumps(...) back into the caller's
    # example dicts, destroying the structured entity data — a second call
    # with the same examples would crash because "entities" is then a str.
    # Build fresh dicts instead of mutating the input.
    rendered_examples = []
    for example in examples:
        entities = [
            {"mention": example["text"][entity["start"] : entity["end"]], "type": entity["label"]}
            for entity in example["entities"]
        ]
        rendered_examples.append({**example, "entities": json.dumps(entities)})

    example_prompt = PromptTemplate(
        input_variables=["text", "entities"],
        template="text: {text}\nentities: {entities}",
    )
    # jinja2 templating keeps the literal JSON braces in the rendered examples
    # from colliding with str.format placeholders; hence the "{{text}}" suffix.
    prompt = FewShotPromptTemplate(
        examples=rendered_examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="text: {{text}}",
        input_variables=["text"],
        template_format="jinja2",
    )
    return prompt
class SessionStorage:
    """Thin wrapper around Streamlit's session state.

    Adds init-once semantics on top of the mapping interface: values set via
    ``init_state`` are only written when the key is not present yet.
    """

    def __init__(self, state: "SessionStateProxy") -> None:
        self.state = state

    def init_state(self, key: str, value: Any) -> None:
        """Store *value* under *key* only if the key is absent."""
        if key in self.state:
            return
        self.state[key] = value

    def set_state(self, key: str, value: Any, *, do_init: bool = False) -> None:
        """Unconditionally store *value* under *key*.

        When *do_init* is true the key is initialized first; the subsequent
        write still overwrites it, matching the original semantics.
        """
        if do_init:
            self.init_state(key, value)
        self.state[key] = value

    def get_state(self, key: str) -> Any:
        """Return the value stored under *key*, or None when absent."""
        return self.state.get(key, None)
def escape_markdown(text: str) -> str:
    """Backslash-escape every Markdown special character in *text*.

    Escape set adapted from python-telegram-bot's helpers
    (https://github.com/python-telegram-bot/python-telegram-bot/blob/v20.2/telegram/helpers.py#L66).
    """
    specials = r"\_*[]()~`>#+-=|{}.!$"
    pattern = "([" + re.escape(specials) + "])"
    return re.sub(pattern, r"\\\1", text)
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment 10 | include: 11 | 12 | * Using welcoming and inclusive language 13 | * Being respectful of differing viewpoints and experiences 14 | * Gracefully accepting constructive criticism 15 | * Focusing on what is best for the community 16 | * Showing empathy towards other community members 17 | 18 | Examples of unacceptable behavior by participants include: 19 | 20 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 21 | * Trolling, insulting/derogatory comments, and personal or political attacks 22 | * Public or private harassment 23 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 24 | * Other conduct which could reasonably be considered inappropriate in a professional setting 25 | 26 | ## Our Responsibilities 27 | 28 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 29 | 30 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 31 | 32 | ## Scope 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
35 | 36 | ## Enforcement 37 | 38 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at hiroki.nakayama.py@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 39 | 40 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 41 | 42 | ## Attribution 43 | 44 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 45 | 46 | [homepage]: https://www.contributor-covenant.org 47 | 48 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq 49 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue with the owners of this repository before making a change. 4 | 5 | Please note we have a code of conduct, please follow it in all your interactions with the project. 6 | 7 | ## How to contribute 8 | 9 | ### Reporting Bugs 10 | 11 | #### Before submitting a bug report 12 | 13 | * Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/doccano/doccano-mini/issues). 14 | * [Open a new issue](https://github.com/doccano/doccano-mini/issues/new/choose) if you're unable to find an open one addressing the problem. 
15 | * Use the relevant bug report templates to create the issue. 16 | 17 | #### How do I submit a good bug report? 18 | 19 | Explain the problem and include additional details to help maintainers reproduce the problem: 20 | 21 | * Use a clear and descriptive title for the issue to identify the problem. 22 | * Describe the exact steps which reproduce the problem in as many details as possible. 23 | * Provide specific examples to demonstrate the steps. 24 | * Describe the behavior you observed after following the steps and point out what exactly is the problem with that behavior. 25 | * Explain which behavior you expected to see instead and why. 26 | * Include screenshots and animated GIFs which show you following the described steps and clearly demonstrate the problem. 27 | * If the problem is related to performance or memory, include a CPU profile capture with your report. 28 | * If the problem is related to network, include a network activity in Chrome/Firefox/Safari DevTools. 29 | * If the problem wasn't triggered by a specific action, describe what you were doing before the problem happened and share more information using the guidelines below. 30 | 31 | ### Suggesting Enhancements 32 | 33 | #### Before submitting an enhancement suggestion 34 | 35 | * Ensure the suggestion was not already reported by searching on GitHub under [Issues](https://github.com/doccano/doccano-mini/issues). 36 | * [Open a new issue](https://github.com/doccano/doccano-mini/issues/new/choose) if you're unable to find an open one addressing the suggestion. 37 | * Use the relevant issue templates to create one. 38 | 39 | #### How do I submit a good enhancement suggestion? 40 | 41 | Explain the suggestion and include additional details to help developers understand it: 42 | 43 | * Use a clear and descriptive title for the issue to identify the suggestion. 44 | * Provide a step-by-step description of the suggested enhancement in as many details as possible. 
45 | * Provide specific examples to demonstrate the steps. 46 | * Describe the current behavior and explain which behavior you expected to see instead and why. 47 | * Include screenshots and animated GIFs which help you demonstrate the steps or point out the part of doccano-mini which the suggestion is related to. 48 | * Explain why this enhancement would be useful to most users. 49 | * List some other annotation tools or applications where this enhancement exists. 50 | * Specify which version you're using. 51 | * Specify the name and version of the OS you're using. 52 | 53 | ## development workflow 54 | 55 | 1. **Fork the project & clone it locally:** Click the "Fork" button in the header of the [GitHub repository](https://github.com/doccano/doccano-mini), creating a copy of `doccano-mini` in your GitHub account. To get a working copy on your local machine, you have to clone your fork. Click the "Clone or Download" button in the right-hand side bar, then append its output to the `git clone` command. 56 | 57 | $ git clone https://github.com/YOUR_USERNAME/doccano-mini.git 58 | 59 | 1. **Create an upstream remote and sync your local copy:** Connect your local copy to the original "upstream" repository by adding it as a remote. 60 | 61 | $ cd doccano-mini 62 | $ git remote add upstream https://github.com:doccano/doccano-mini.git 63 | 64 | You should now have two remotes: read/write-able `origin` points to your GitHub fork, and a read-only `upstream` points to the original repo. Be sure to [keep your fork in sync](https://help.github.com/en/articles/syncing-a-fork) with the original, reducing the likelihood of merge conflicts later on. 65 | 66 | 1. **Create a branch for each piece of work:** Branch off `develop` for each bugfix or feature that you're working on. Give your branch a descriptive, meaningful name like `bugfix-for-issue-1234` or `improve-io-performance`, so others know at a glance what you're working on. 
67 | 68 | $ git checkout develop 69 | $ git pull develop master && git push origin develop 70 | $ git checkout -b my-descriptive-branch-name 71 | 72 | At this point, you may want to install your version. It's usually best to do this within a dedicated virtual environment; We recomment to use `poetry`: 73 | 74 | $ poetry install 75 | $ poetry shell 76 | 77 | Then run the `streamlit` command to serve: 78 | 79 | $ streamlit run doccano_mini/app.py 80 | 81 | Now, you can access to the frontend at . 82 | 83 | 2. **Implement your changes:** Use your preferred text editor to modify the source code. Be sure to keep your changes focused and in scope, and follow the coding conventions described below! Document your code as you write it. Run your changes against any existing tests and add new ones as needed to validate your changes; make sure you don’t accidentally break existing functionality! Several common commands can be accessed via the `make`: 84 | 85 | $ make lint 86 | 87 | 3. **Push commits to your forked repository:** Group changes into atomic git commits, then push them to your `origin` repository. There's no need to wait until all changes are final before pushing — it's always good to have a backup, in case something goes wrong in your local copy. 88 | 89 | $ git push origin my-descriptive-branch-name 90 | 91 | 4. **Open a new Pull Request in GitHub:** When you're ready to submit your changes to the main repo, navigate to your forked repository on GitHub. Switch to your working branch then click "New pull request"; alternatively, if you recently pushed, you may see a banner at the top of the repo with a "Compare & pull request" button, which you can click on to initiate the same process. Fill out the PR template completely and clearly, confirm that the code "diff" is as expected, then submit the PR. A number of processes will run automatically via GitHub Workflows (see `.github/workflows/`); we'll want to make sure everything passes before the PR gets merged. 
92 | 93 | 5. **Respond to any code review feedback:** At this point, @Hironsan will review your work and either request additional changes/clarification or approve your work. There may be some necessary back-and-forth; please do your best to be responsive. If you haven’t gotten a response in a week or so, please politely nudge him in the same thread — thanks in advance for your patience! 94 | 95 | ## Styleguides 96 | 97 | ### Git Commit Messages 98 | 99 | * Use the present tense ("Add feature" not "Added feature") 100 | * Use the imperative mood ("Move cursor to..." not "Moves cursor to...") 101 | * Limit the first line to 72 characters or less 102 | * Reference issues and pull requests liberally after the first line 103 | -------------------------------------------------------------------------------- /docs/images/annotation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doccano/doccano-mini/0ef6c3368499eb172ff5c8c446d61c7f240baf8f/docs/images/annotation.gif -------------------------------------------------------------------------------- /docs/images/copy_and_paste.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doccano/doccano-mini/0ef6c3368499eb172ff5c8c446d61c7f240baf8f/docs/images/copy_and_paste.gif -------------------------------------------------------------------------------- /docs/images/download_config.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doccano/doccano-mini/0ef6c3368499eb172ff5c8c446d61c7f240baf8f/docs/images/download_config.jpg -------------------------------------------------------------------------------- /docs/images/test_new_example.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/doccano/doccano-mini/0ef6c3368499eb172ff5c8c446d61c7f240baf8f/docs/images/test_new_example.jpg -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "doccano-mini" 3 | version = "0.0.10" 4 | description = "Annotation meets Large Language Models." 5 | authors = ["Hironsan "] 6 | license = "MIT" 7 | readme = "README.md" 8 | homepage = "https://github.com/doccano/doccano-mini" 9 | repository = "https://github.com/doccano/doccano-mini" 10 | classifiers = [ 11 | "Programming Language :: Python", 12 | "Programming Language :: Python :: 3.8", 13 | "Programming Language :: Python :: 3.9", 14 | "Programming Language :: Python :: 3.10", 15 | ] 16 | 17 | [tool.poetry.scripts] 18 | doccano-mini = 'doccano_mini.cli:main' 19 | 20 | [tool.poetry.dependencies] 21 | python = ">=3.8.1,<3.9.7 || >3.9.7,<4.0" 22 | streamlit = "^1.20.0" 23 | langchain = "^0.0.113" 24 | openai = "^0.27.2" 25 | st-ner-annotate = "^0.1.0" 26 | scikit-learn = "^1.2.2" 27 | datasets = "^2.11.0" 28 | more-itertools = "^9.1.0" 29 | 30 | [tool.poetry.dev-dependencies] 31 | taskipy = "^1.10.3" 32 | black = "^23.1.0" 33 | isort = "^5.12.0" 34 | mypy = "^1.1.1" 35 | pyproject-flake8 = "^6.0.0" 36 | pytest = "^7.2.2" 37 | pytest-cov = "^4.0.0" 38 | 39 | [build-system] 40 | requires = ["poetry-core>=1.0.0"] 41 | build-backend = "poetry.core.masonry.api" 42 | 43 | [tool.black] 44 | line-length = 120 45 | target-version = ['py38', 'py39'] 46 | include = '\.pyi?$' 47 | 48 | [tool.flake8] 49 | max-line-length = 120 50 | max-complexity = 18 51 | ignore = "E203,E266,W503," 52 | filename = "*.py" 53 | 54 | [tool.mypy] 55 | python_version = "3.8" 56 | ignore_missing_imports = true 57 | show_error_codes = true 58 | 59 | [tool.isort] 60 | profile = "black" 61 | include_trailing_comma = true 62 | multi_line_output = 3 63 | 64 | 
def test_make_classification_prompt():
    """Labels are listed sorted in the instruction; examples keep input order."""
    few_shot = [
        {"text": "That would be awesome!", "label": "positive"},
        {"text": "This is awful!", "label": "negative"},
    ]

    expected = """\
Classify the text into one of the following labels:
- negative
- positive


text: That would be awesome!
label: positive

text: This is awful!
label: negative

text: It's very hot."""

    prompt = make_classification_prompt(few_shot)

    assert prompt.format(input="It's very hot.") == expected


def test_make_task_free_prompt():
    """The last column becomes the prediction target; earlier ones are inputs."""
    few_shot = [
        {"English": "I like sushi.", "Japanese": "寿司が好きです。"},
        {"English": "I live in Japan.", "Japanese": "日本に住んでいます。"},
    ]

    expected = """\
Predict Japanese based on English.

English: I like sushi.
Japanese: 寿司が好きです。

English: I live in Japan.
Japanese: 日本に住んでいます。

English: I'm developing doccano-mini."""

    prompt = make_task_free_prompt(few_shot)

    assert prompt.format(English="I'm developing doccano-mini.") == expected