├── .gitignore ├── data └── desc.md ├── database └── desc.md ├── figs └── overview.png ├── readme.md └── src └── sources ├── data_preprocess.py ├── dataset ├── ppl_train_other.json └── train_schema-linking.jsonl ├── llms ├── __init__.py ├── codellama.py ├── gpt.py ├── llama2.py ├── puyu.py ├── sqlcoder.py └── vicuna.py ├── nltk_data └── corpora │ └── stopwords │ ├── README │ ├── arabic │ ├── azerbaijani │ ├── basque │ ├── bengali │ ├── catalan │ ├── chinese │ ├── danish │ ├── dutch │ ├── english │ ├── finnish │ ├── french │ ├── german │ ├── greek │ ├── hebrew │ ├── hinglish │ ├── hungarian │ ├── indonesian │ ├── italian │ ├── kazakh │ ├── nepali │ ├── norwegian │ ├── portuguese │ ├── romanian │ ├── russian │ ├── slovene │ ├── spanish │ ├── swedish │ ├── tajik │ └── turkish ├── post_process.py ├── ppl_dev.json ├── ppl_dev_add_sl.json ├── run_all.sh ├── run_cross_voting.sh ├── run_one_model.sh ├── schemalink.py ├── sql_gen ├── __init__.py ├── get_example_modules.py ├── main.py └── sql_gen_utils.py └── utils ├── data_builder.py ├── datasets └── spider.py ├── enums.py ├── linking_process.py ├── linking_utils ├── abstract_preproc.py ├── application.py ├── corenlp.py ├── serialization.py └── spider_match_utils.py ├── post_process.py ├── pretrained_embeddings.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | #*.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pdm 86 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 87 | pdm.lock 88 | 89 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 90 | # in version control. 91 | # https://pdm.fming.dev/#use-with-ide 92 | .pdm.toml 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # pytype static type analyzer 132 | .pytype/ 133 | 134 | # Cython debug symbols 135 | cython_debug/ 136 | 137 | # PyCharm 138 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 139 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 140 | # and can be added to the global gitignore or merged into this file. For a more nuclear 141 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 142 | .idea 143 | 144 | # VSCode 145 | .vscode 146 | glove 147 | 148 | data/spider/ 149 | sentence_* 150 | *.txt 151 | *.out 152 | *.zip 153 | data_*.json 154 | *_vt.py 155 | sentence_transformers/ 156 | data_*.json 157 | data/ 158 | # *.jsonl 159 | -------------------------------------------------------------------------------- /data/desc.md: -------------------------------------------------------------------------------- 1 | # spider official test data here 2 | -------------------------------------------------------------------------------- /database/desc.md: -------------------------------------------------------------------------------- 1 | # spider official test database here 2 | -------------------------------------------------------------------------------- /figs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhshLii/PETSQL/3d73b36ab42525e593c4d70edb74987eac741744/figs/overview.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # The implementation code of "PET-SQL: A Prompt-enhanced Two-stage Text-to-SQL Framework with Cross-consistency" 2 | 3 | ## Overview 4 | ![The proposed PETSQL framework](./figs/overview.png) 5 | 6 | ## RUN 7 | ```bash 8 | docker pull zhishuailii/spider_envs:latest 9 | 10 | docker run --ipc=host -itd -v /your-local-path/PETSQL/src:/root/src -v /your-local-path/data:/root/data -v /your-local-path/test_database:/root/database --name spider_db_test_offline the-docker-iamge-ID 11 | # If you want to run it in current directory 12 | mkdir -p src data test_database && docker run --ipc=host -itd -v $(pwd)/src:/root/src -v $(pwd)/data:/root/data -v $(pwd)/test_database:/root/database --name spider_db_test_offline zhishuailii/spider_envs:latest 13 | 14 | docker exec -it spider_db_test_offline bash run_all.sh 15 | ``` 16 | 17 | ## Citation 18 | ```bibtex 19 | @article{li2024pet, 20 | title={PET-SQL: A Prompt-enhanced Two-stage Text-to-SQL Framework with Cross-consistency}, 21 | author={Li, Zhishuai and Wang, Xiang and Zhao, Jingjing and Yang, Sun and Du, Guoqing and Hu, Xiaoru and Zhang, Bin and Ye, Yuxiao and Li, Ziyue and Zhao, Rui and others}, 22 | journal={arXiv preprint arXiv:2403.09732}, 23 | year={2024} 24 | } 25 | ``` 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/sources/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import pickle 5 | from pathlib import Path 6 | import sqlite3 7 | from tqdm import tqdm 8 | import random 9 | import sys, os 10 | sys.path.append(os.path.dirname(__file__)) 11 | from utils.linking_process import SpiderEncoderV2Preproc 12 | from utils.pretrained_embeddings import GloVe 13 | from utils.datasets.spider import load_tables 14 | # from dataset.process.preprocess_kaggle import gather_questions 15 | 16 | 17 | def schema_linking_producer(test, train, table, db, dataset_dir, compute_cv_link=True): 18 | 19 | # load data 20 | test_data = json.load(open(os.path.join(dataset_dir, test))) 21 | # train_data = json.load(open(os.path.join(dataset_dir, train))) 22 | 23 | # load schemas 24 | schemas, _ = load_tables([os.path.join(dataset_dir, table)]) 25 | 26 | # Backup in-memory copies of all the DBs and create the live connections 27 | for db_id, schema in schemas.items(): 28 | # sqlite_path = Path(dataset_dir) / db / db_id / f"{db_id}.sqlite" 29 | sqlite_path = f"{db}/{db_id}/{db_id}.sqlite" 30 | # print(sqlite_path) 31 | source: sqlite3.Connection 32 | with sqlite3.connect(str(sqlite_path)) as source: 33 | dest = sqlite3.connect(':memory:') 34 | dest.row_factory = sqlite3.Row 35 | source.backup(dest) 36 | schema.connection = dest 37 | 38 | word_emb = GloVe(kind='42B', lemmatize=True) 39 | linking_processor = SpiderEncoderV2Preproc(dataset_dir, 40 | min_freq=4, 41 | max_count=5000, 42 | include_table_name_in_column=False, 43 | word_emb=word_emb, 44 | fix_issue_16_primary_keys=True, 45 | compute_sc_link=True, 46 | compute_cv_link=compute_cv_link) 47 | 48 | # build schema-linking 49 | print("Build test schema-linking ...") 50 | for data, section in zip([test_data],['test']): 51 | for item in data: 52 | db_id = item["db_id"] 53 | schema = schemas[db_id] 54 | to_add, validation_info = linking_processor.validate_item(item, schema, section) 55 | if to_add: 56 | linking_processor.add_item(item, schema, section, validation_info) 57 | 58 | # save 59 | linking_processor.save() 60 | 61 | 62 | import re, json, os 63 | 64 | from sql_metadata import Parser 65 | 66 | proj_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) 67 | 68 | 69 | def add_fk(ppl_test): 70 | tables_data = json.load( 71 | open("data/spider/test_data/tables.json", 'r', encoding='utf-8')) 72 | forekeys = {} 73 | anno_simddl = {} 74 | for db in tables_data: 75 | tables = db["table_names_original"] 76 | column_names = db["column_names_original"] 77 | column_types = db["column_types"] 78 | foreign_keys = db["foreign_keys"] 79 | sql_tem_all = [] 80 | sql_tem_sim_ddl = [] 81 | for idx, data in enumerate(tables): 82 | for j, column in enumerate(column_names): 83 | sql_tem = [] 84 | if idx == column[0]: 85 | sql_tem.append(column[0]) 86 | sql_tem.append( 87 | str(column[1]) + " " + str(column_types[j]).upper()) 88 | sql_tem_all.append(sql_tem) 89 | sql_tem_sim_ddl.append([column[0], column[1]]) 90 | 91 | simddl_all = [] 92 | for idx, data in enumerate(tables): 93 | sql_01 = "# " + str(data) + "(" 94 | sql_final_tem = [] 95 | for j, sql_final in enumerate(sql_tem_sim_ddl): 96 | if idx == sql_final[0]: 97 | sql_final_tem.append(sql_final[1]) 98 | sql_01 += ",".join(sql_final_tem) + ")" 99 | simddl_all.append(sql_01) 100 | anno_simddl[db["db_id"]] = simddl_all 101 | forkey = [] 102 | for foreign in foreign_keys: 103 | vlaus = str(tables[int( 104 | column_names[foreign[0]][0])]) + "(" + str( 105 | column_names[foreign[0]][1]) + ") REFERENCES " + str(tables[int( 106 | column_names[foreign[1]][0])]) + "(" + str( 107 | column_names[foreign[1]][1]) + ")" 108 | forkey.append(vlaus) 109 | forekeys[db["db_id"]] = forkey 110 | for i in range(len(ppl_test)): 111 | ppl_test[i]['foreign_key'] = ["\n".join(forekeys[ppl_test[i]["db"]])] 112 | return ppl_test 113 | 114 | 115 | 116 | def gen_ppl_from_json(ppl_filename='data/ppl_dev.json', model=None): 117 | tables_data = json.load( 118 | open(proj_dir + "/data/spider/test_data/tables.json", 'r', encoding='utf-8')) 119 | dev_data = json.load( 120 | open(proj_dir + "/data/spider/test_data/dev.json", 'r', encoding='utf-8')) 121 | ppl_test = [] 122 | for ix, it in enumerate(dev_data): 123 | ppl_test.append({ 124 | "id": ix, 125 | "db": it['db_id'], 126 | "question": it['question'], 127 | "gold_sql": it['query'] 128 | }) 129 | anno_simddl = {} 130 | anno = {} 131 | for db in tables_data: 132 | tables = db["table_names_original"] 133 | column_names = db["column_names_original"] 134 | column_types = db["column_types"] 135 | sql_tem_all = [] 136 | sql_tem_sim_ddl = [] 137 | for idx, data in enumerate(tables): 138 | for j, column in enumerate(column_names): 139 | sql_tem = [] 140 | if idx == column[0]: 141 | sql_tem.append(column[0]) 142 | sql_tem.append( 143 | str(column[1]) + " " + str(column_types[j]).upper()) 144 | sql_tem_all.append(sql_tem) 145 | sql_tem_sim_ddl.append([column[0], column[1]]) 146 | 147 | # 外键 148 | for foreign in db["foreign_keys"]: 149 | vlaus = str(tables[int( 150 | column_names[foreign[0]][0])]) + "(" + str( 151 | column_names[foreign[0]][1]) + ") REFERENCES " + str(tables[int( 152 | column_names[foreign[1]][0])]) + "(" + str( 153 | column_names[foreign[1]][1]) + ")" 154 | # print(vlaus) 155 | sql_tem_all.append([column_names[foreign[0]][0], vlaus]) 156 | # DDL语句 157 | ddl_all = [] 158 | for idx, data in enumerate(tables): 159 | # 表名 160 | sql_01 = "\nCREATE TABLE " + str(data) + "(" 161 | sql_final_tem = [] 162 | for j, sql_final in enumerate(sql_tem_all): 163 | if idx == sql_final[0]: 164 | sql_final_tem.append(sql_final[1]) 165 | sql_01 += ", ".join(sql_final_tem) + ");" 166 | ddl_all.append(sql_01) 167 | anno[db["db_id"]] = ddl_all 168 | 169 | simddl_all = [] 170 | for idx, data in enumerate(tables): 171 | sql_01 = "# " + str(data) + "(" 172 | sql_final_tem = [] 173 | for j, sql_final in enumerate(sql_tem_sim_ddl): 174 | if idx == sql_final[0]: 175 | sql_final_tem.append(sql_final[1]) 176 | sql_01 += ", ".join(sql_final_tem) + ")" 177 | simddl_all.append(sql_01) 178 | anno_simddl[db["db_id"]] = simddl_all 179 | for i in range(len(ppl_test)): 180 | ppl_test[i]['simplified_ddl'] = ";\n".join( 181 | anno_simddl[ppl_test[i]["db"]]) + ".\n" 182 | ppl_test[i]['full_ddl'] = "\n".join(anno[ppl_test[i]["db"]]) + '\n' 183 | ppl_test = add_fk(ppl_test) 184 | json.dump(ppl_test, 185 | open(ppl_filename, 'w', encoding='utf-8'), 186 | ensure_ascii=False, 187 | indent=4) 188 | return ppl_test 189 | 190 | 191 | 192 | import re, json, os 193 | 194 | from sql_metadata import Parser 195 | 196 | 197 | 198 | if __name__ == '__main__': 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument("--data_dir", type=str, default="data/") 201 | args = parser.parse_args() 202 | 203 | # merge two training split of Spider 204 | spider_dir = args.data_dir 205 | # split1 = "train_spider.json" 206 | # split2 = "train_others.json" 207 | # total_train = [] 208 | # for item in json.load(open(os.path.join(spider_dir, split1))): 209 | # total_train.append(item) 210 | # for item in json.load(open(os.path.join(spider_dir, split2))): 211 | # total_train.append(item) 212 | # with open(os.path.join(spider_dir, 'train_spider_and_others.json'), 'w') as f: 213 | # json.dump(total_train, f) 214 | 215 | # schema-linking between questions and databases for Spider 216 | if "test.json" in os.listdir('data/'): 217 | dev_name = "test.json" 218 | elif "dev.json" in os.listdir('data/'): 219 | dev_name = "dev.json" 220 | else: 221 | raise Exception("There is no 'test.json' or 'dev.json' in dataset. Please check the file path.") 222 | spider_dev = dev_name 223 | print(spider_dev) 224 | spider_train = '' 225 | spider_table = 'tables.json' 226 | spider_db = 'database' 227 | schema_linking_producer(spider_dev, spider_train, spider_table, spider_db, spider_dir) 228 | -------------------------------------------------------------------------------- /src/sources/llms/__init__.py: -------------------------------------------------------------------------------- 1 | from .codellama import codellama 2 | from .llama2 import Llama2 3 | from .puyu import Puyu 4 | from .sqlcoder import SQLCoder 5 | from .gpt import GPT -------------------------------------------------------------------------------- /src/sources/llms/codellama.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import random 4 | import requests 5 | import json 6 | import time 7 | import os, sys 8 | 9 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 10 | # from settings import SECRETKEYS 11 | 12 | 13 | class codellama: 14 | 15 | def __init__(self, keys=None): 16 | self._key = None 17 | 18 | def __call__(self, 19 | prompt, 20 | core_pod_ip='10.119.26.210', 21 | temperature=0.001, 22 | top_p=0.9, 23 | max_new_tokens=256, 24 | repetition_penalty=1.05, 25 | stream=False, 26 | *args, 27 | **kwargs): 28 | 29 | server = "http://cluster-proxy.sh.sensetime.com:19906/generate" 30 | # headers = {"Content-Type": "application/json"} 31 | # endpoint = f"http://{core_pod_ip}:2345/generate" # cci tgi-gpu8 32 | 33 | # request_body = { 34 | # # "endpoint": endpoint, 35 | # "inputs": prompt, 36 | # "parameters": { 37 | # "temperature": temperature, 38 | # "top_p": top_p, 39 | # "do_sample": True, 40 | # "max_new_tokens": max_new_tokens, 41 | # "top_k": 4, 42 | # "repetition_penalty": repetition_penalty, 43 | # "stop": [ 44 | # # "", 45 | # # "User:", 46 | # ] 47 | # } 48 | # } 49 | data = { 50 | "inputs": prompt, 51 | "parameters": { 52 | "temperature": temperature, 53 | "top_p": top_p, 54 | "do_sample": True, 55 | "max_new_tokens": max_new_tokens, 56 | "repetition_penalty": repetition_penalty, 57 | } 58 | } 59 | response = requests.post(server, json=data) 60 | 61 | time.sleep(0.1) 62 | if response.status_code == 200: 63 | try: 64 | res = response.json() 65 | except: 66 | raise Exception('Response can not be parsed !') 67 | 68 | return res["generated_text"][0].rstrip("") 69 | else: 70 | raise Exception('No response returned by SenseNova !') 71 | 72 | 73 | def parallel_call(inps): 74 | querys, api = inps[0], inps[1] 75 | llm = codellama() 76 | res = [] 77 | for q in querys: 78 | out = llm(q, core_pod_ip=api) 79 | res.append(out) 80 | return res 81 | 82 | 83 | if __name__ == '__main__': 84 | llm = codellama() 85 | print(llm('请用一句话解释万有引力')) -------------------------------------------------------------------------------- /src/sources/llms/gpt.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import os 4 | import openai 5 | 6 | openai.api_type = "azure" 7 | openai.api_base = "$your-proxy" 8 | openai.api_key = "$your-key" 9 | openai.api_version = "2023-05-15" 10 | 11 | 12 | class GPT: 13 | 14 | def __init__(self, keys=None): 15 | self._key = None 16 | 17 | def __call__(self, prompt, *args, **kwargs): 18 | 19 | response = openai.ChatCompletion.create( 20 | engine="gpt-4-0613", # engine = "deployment_name". 21 | messages=[{ 22 | "role": "user", 23 | "content": prompt 24 | }], 25 | temperature=0, 26 | max_tokens=200, 27 | ) 28 | 29 | time.sleep(25) 30 | try: 31 | print(f"prompt: {prompt}") 32 | print(f"result: {response['choices'][0]['message']['content']}") 33 | return response['choices'][0]['message']['content'] 34 | except: 35 | time.sleep(30) 36 | response = openai.ChatCompletion.create( 37 | engine="gpt-4-0613", # engine = "deployment_name". 38 | messages=[{ 39 | "role": "user", 40 | "content": prompt 41 | }], 42 | temperature=0, 43 | max_tokens=200, 44 | ) 45 | print(f"prompt: {prompt}") 46 | print(f"result: {response['choices'][0]['message']['content']}") 47 | return response['choices'][0]['message']['content'] 48 | 49 | 50 | if __name__ == '__main__': 51 | llm = GPT() 52 | print(llm('请用一句话解释万有引力')) -------------------------------------------------------------------------------- /src/sources/llms/llama2.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import random 4 | import requests 5 | import json 6 | import time 7 | import os, sys 8 | 9 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 10 | # from settings import SECRETKEYS 11 | 12 | 13 | class Llama2: 14 | 15 | def __init__(self, keys=None): 16 | self._key = None 17 | 18 | def __call__(self, 19 | prompt, 20 | core_pod_ip='10.119.25.63', 21 | temperature=0.001, 22 | top_p=0.9, 23 | max_new_tokens=256, 24 | repetition_penalty=1.05, 25 | stream=False, 26 | *args, 27 | **kwargs): 28 | #替换敏感词 29 | # prompt_desen=prompt.replace('Master', 'Bachelor') 30 | # print('-*-S'*20) 31 | # print(f"Llama2 Prompt:\n{prompt}") 32 | # print('--' * 30+' Fin Prompt ') 33 | 34 | server = "http://103.177.28.206:8000/api/generate" 35 | headers = {"Content-Type": "application/json"} 36 | endpoint = f"http://{core_pod_ip}:2345/generate" # cci tgi-gpu8 37 | 38 | request_body = { 39 | "endpoint": endpoint, 40 | "inputs": prompt, 41 | "parameters": { 42 | "temperature": temperature, 43 | "top_p": top_p, 44 | "do_sample": True, 45 | "max_new_tokens": max_new_tokens, 46 | "top_k": 4, 47 | "repetition_penalty": repetition_penalty, 48 | "stop": [ 49 | # "", 50 | # "User:", 51 | ] 52 | } 53 | } 54 | response = requests.post(server, 55 | headers=headers, 56 | json=request_body, 57 | stream=stream) 58 | 59 | time.sleep(0.1) 60 | if response.status_code == 200: 61 | try: 62 | res = response.json() 63 | except: 64 | raise Exception('Response can not be parsed !') 65 | 66 | return res["generated_text"] 67 | else: 68 | raise Exception('No response returned by SenseNova !') 69 | 70 | 71 | def parallel_call(inps): 72 | querys, api = inps[0], inps[1] 73 | llm = Llama2() 74 | res = [] 75 | for q in querys: 76 | out = llm(q, core_pod_ip=api) 77 | res.append(out) 78 | return res 79 | 80 | 81 | if __name__ == '__main__': 82 | llm = Llama2() 83 | print(llm('请用一句话解释万有引力')) -------------------------------------------------------------------------------- /src/sources/llms/puyu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import ast 3 | import time 4 | import jsonlines 5 | import requests 6 | import json 7 | import time 8 | 9 | 10 | class Puyu: 11 | def __init__(self): 12 | pass 13 | 14 | def __call__(self, 15 | prompt, 16 | model= "nova-ptc-xl-v1", # "nova-ptc-xl-v2-1-0-8k-internal", 17 | temperature=1e-7, 18 | top_p=0.9, 19 | max_new_tokens=256, 20 | repetition_penalty=1., 21 | stream=False, 22 | *args, **kwargs): 23 | 24 | url = 'http://cluster-proxy.sh.sensetime.com:19904/generate' 25 | # context = "<|User|>:"+prompt+"\n<|Bot|>:" 26 | data = { 27 | "inputs": prompt, 28 | "parameters": { 29 | "do_sample": False, 30 | "temperature": temperature, 31 | "top_k": 1, 32 | "max_new_tokens": max_new_tokens, 33 | "repetition_penalty": repetition_penalty, 34 | } 35 | } 36 | response = requests.post(url, json=data) 37 | 38 | time.sleep(0.1) 39 | if response.status_code == 200: 40 | try: 41 | res = response.json() 42 | except: 43 | raise Exception('Response can not be parsed !') 44 | 45 | return res["generated_text"][0].rstrip("") 46 | else: 47 | raise Exception('No response returned by SenseNova !') 48 | 49 | 50 | if __name__=='__main__': 51 | llm=Puyu() 52 | print(llm('请用一句话解释万有引力', max_new_tokens=512)) 53 | -------------------------------------------------------------------------------- /src/sources/llms/sqlcoder.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import random 4 | import requests 5 | import json 6 | import time 7 | import os, sys 8 | 9 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 10 | 11 | class SQLCoder: 12 | 13 | def __init__(self, keys=None): 14 | self._key = None 15 | 16 | def __call__(self, 17 | prompt, 18 | core_pod_ip='10.119.28.126', 19 | temperature=0.001, 20 | top_p=0.9, 21 | max_new_tokens=256, 22 | repetition_penalty=1.05, 23 | stream=True, 24 | *args, 25 | **kwargs): 26 | 27 | url = 'http://cluster-proxy.sh.sensetime.com:19939/generate' 28 | # context = "<|User|>:"+prompt+"\n<|Bot|>:" 29 | data = { 30 | "inputs": prompt, 31 | "parameters": { 32 | "do_sample": False, 33 | "temperature": temperature, 34 | "top_k": 1, 35 | "max_new_tokens": max_new_tokens, 36 | "repetition_penalty": repetition_penalty, 37 | } 38 | } 39 | response = requests.post(url, json=data) 40 | 41 | time.sleep(0.1) 42 | if response.status_code == 200: 43 | try: 44 | res = response.json() 45 | except: 46 | raise Exception('Response can not be parsed !') 47 | return res["generated_text"][0].rstrip("") 48 | else: 49 | raise Exception('No response returned by SQLCoder !') 50 | 51 | 52 | if __name__ == '__main__': 53 | llm = SQLCoder() 54 | print(llm('请用一句话解释万有引力')) -------------------------------------------------------------------------------- /src/sources/llms/vicuna.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import random 4 | import requests 5 | import json 6 | import time 7 | import os, sys 8 | 9 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 10 | # from settings import SECRETKEYS 11 | 12 | 13 | class vicuna: 14 | 15 | def __init__(self, keys=None): 16 | self._key = None 17 | 18 | def __call__(self, 19 | prompt, 20 | core_pod_ip='10.119.27.61', 21 | temperature=0.001, 22 | top_p=0.9, 23 | max_new_tokens=256, 24 | repetition_penalty=1.05, 25 | stream=True, 26 | *args, 27 | **kwargs): 28 | 29 | server = "http://cluster-proxy.sh.sensetime.com:19905/generate" 30 | # headers = {"Content-Type": "application/json"} 31 | # endpoint = f"http://{core_pod_ip}:2345/generate" # cci tgi-gpu8 32 | 33 | # request_body = { 34 | # # "endpoint": endpoint, 35 | # "inputs": prompt, 36 | # "parameters": { 37 | # "temperature": temperature, 38 | # "top_p": top_p, 39 | # "do_sample": True, 40 | # "max_new_tokens": max_new_tokens, 41 | # "top_k": 4, 42 | # "repetition_penalty": repetition_penalty, 43 | # "stop": [ 44 | # # "", 45 | # # "User:", 46 | # ] 47 | # } 48 | # } 49 | data = { 50 | "inputs": prompt, 51 | "parameters": { 52 | "temperature": temperature, 53 | "top_p": top_p, 54 | "do_sample": True, 55 | "max_new_tokens": max_new_tokens, 56 | "repetition_penalty": repetition_penalty, 57 | } 58 | } 59 | response = requests.post(server, json=data) 60 | # response = requests.post(server, headers=headers, data=json.dumps(request_body), timeout=(10, 5)) 61 | time.sleep(0.1) 62 | # print(response.status_code) 63 | if response.status_code == 200: 64 | try: 65 | res = response.json() 66 | except: 67 | raise Exception('Response can not be parsed !') 68 | 69 | return res["generated_text"] 70 | else: 71 | raise Exception('No response returned by SQLCoder !') 72 | 73 | 74 | if __name__ == '__main__': 75 | llm = vicuna() 76 | print(llm('请用一句话解释万有引力')) -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/README: -------------------------------------------------------------------------------- 1 | Stopwords Corpus 2 | 3 | This corpus contains lists of stop words for several languages. These 4 | are high-frequency grammatical words which are usually ignored in text 5 | retrieval applications. 6 | 7 | They were obtained from: 8 | http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ 9 | 10 | The stop words for the Romanian language were obtained from: 11 | http://arlc.ro/resources/ 12 | 13 | The English list has been augmented 14 | https://github.com/nltk/nltk_data/issues/22 15 | 16 | The German list has been corrected 17 | https://github.com/nltk/nltk_data/pull/49 18 | 19 | A Kazakh list has been added 20 | https://github.com/nltk/nltk_data/pull/52 21 | 22 | A Nepali list has been added 23 | https://github.com/nltk/nltk_data/pull/83 24 | 25 | An Azerbaijani list has been added 26 | https://github.com/nltk/nltk_data/pull/100 27 | 28 | A Greek list has been added 29 | https://github.com/nltk/nltk_data/pull/103 30 | 31 | An Indonesian list has been added 32 | https://github.com/nltk/nltk_data/pull/112 33 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/arabic: -------------------------------------------------------------------------------- 1 | إذ 2 | إذا 3 | إذما 4 | إذن 5 | أف 6 | أقل 7 | أكثر 8 | ألا 9 | إلا 10 | التي 11 | الذي 12 | الذين 13 | اللاتي 14 | اللائي 15 | اللتان 16 | اللتيا 17 | اللتين 18 | اللذان 19 | اللذين 20 | اللواتي 21 | إلى 22 | إليك 23 | إليكم 24 | إليكما 25 | إليكن 26 | أم 27 | أما 28 | أما 29 | إما 30 | أن 31 | إن 32 | إنا 33 | أنا 34 | أنت 35 | أنتم 36 | أنتما 37 | أنتن 38 | إنما 39 | إنه 40 | أنى 41 | أنى 42 | آه 43 | آها 44 | أو 45 | أولاء 46 | أولئك 47 | أوه 48 | آي 49 | أي 50 | أيها 51 | إي 52 | أين 53 | أين 54 | أينما 55 | إيه 56 | بخ 57 | بس 58 | بعد 59 | بعض 60 | بك 61 | بكم 62 | بكم 63 | بكما 64 | بكن 65 | بل 66 | بلى 67 | بما 68 | بماذا 69 | بمن 70 | بنا 71 | به 72 | بها 73 | بهم 74 | بهما 75 | بهن 76 | بي 77 | بين 78 | بيد 79 | تلك 80 | تلكم 81 | تلكما 82 | ته 83 | تي 84 | تين 85 | تينك 86 | ثم 87 | ثمة 88 | حاشا 89 | حبذا 90 | حتى 91 | حيث 92 | حيثما 93 | حين 94 | خلا 95 | دون 96 | ذا 97 | ذات 98 | ذاك 99 | ذان 100 | ذانك 101 | ذلك 102 | ذلكم 103 | ذلكما 104 | ذلكن 105 | ذه 106 | ذو 107 | ذوا 108 | ذواتا 109 | ذواتي 110 | ذي 111 | ذين 112 | ذينك 113 | ريث 114 | سوف 115 | سوى 116 | شتان 117 | عدا 118 | عسى 119 | عل 120 | على 121 | عليك 122 | عليه 123 | عما 124 | عن 125 | عند 126 | غير 127 | فإذا 128 | فإن 129 | فلا 130 | فمن 131 | في 132 | فيم 133 | فيما 134 | فيه 135 | فيها 136 | قد 137 | كأن 138 | كأنما 139 | كأي 140 | كأين 141 | كذا 142 | كذلك 143 | كل 144 | كلا 145 | كلاهما 146 | كلتا 147 | كلما 148 | كليكما 149 | كليهما 150 | كم 151 | كم 152 | كما 153 | كي 154 | كيت 155 | كيف 156 | كيفما 157 | لا 158 | لاسيما 159 | لدى 160 | لست 161 | لستم 162 | لستما 163 | لستن 164 | لسن 165 | لسنا 166 | لعل 167 | لك 168 | لكم 169 | لكما 170 | لكن 171 | لكنما 172 | لكي 173 | لكيلا 174 | لم 175 | لما 176 | لن 177 | لنا 178 | له 179 | لها 180 | لهم 181 | لهما 182 | لهن 183 | لو 184 | لولا 185 | لوما 186 | لي 187 | لئن 188 | ليت 189 | ليس 190 | ليسا 191 | ليست 192 | ليستا 193 | ليسوا 194 | ما 195 | ماذا 196 | متى 197 | مذ 198 | مع 199 | مما 200 | ممن 201 | من 202 | منه 203 | منها 204 | منذ 205 | مه 206 | مهما 207 | نحن 208 | نحو 209 | نعم 210 | ها 211 | هاتان 212 | هاته 213 | هاتي 214 | هاتين 215 | هاك 216 | هاهنا 217 | هذا 218 | هذان 219 | هذه 220 | هذي 221 | هذين 222 | هكذا 223 | هل 224 | هلا 225 | هم 226 | هما 227 | هن 228 | هنا 229 | هناك 230 | هنالك 231 | هو 232 | هؤلاء 233 | هي 234 | هيا 235 | هيت 236 | هيهات 237 | والذي 238 | والذين 239 | وإذ 240 | وإذا 241 | وإن 242 | ولا 243 | ولكن 244 | ولو 245 | وما 246 | ومن 247 | وهو 248 | يا 249 | أبٌ 250 | أخٌ 251 | حمٌ 252 | فو 253 | أنتِ 254 | يناير 255 | فبراير 256 | مارس 257 | أبريل 258 | مايو 259 | يونيو 260 | يوليو 261 | أغسطس 262 | سبتمبر 263 | أكتوبر 264 | نوفمبر 265 | ديسمبر 266 | جانفي 267 | فيفري 268 | مارس 269 | أفريل 270 | ماي 271 | جوان 272 | جويلية 273 | أوت 274 | كانون 275 | شباط 276 | آذار 277 | نيسان 278 | أيار 279 | حزيران 280 | تموز 281 | آب 282 | أيلول 283 | تشرين 284 | دولار 285 | دينار 286 | ريال 287 | درهم 288 | ليرة 289 | جنيه 290 | قرش 291 | مليم 292 | فلس 293 | هللة 294 | سنتيم 295 | يورو 296 | ين 297 | يوان 298 | شيكل 299 | واحد 300 | اثنان 301 | ثلاثة 302 | أربعة 303 | خمسة 304 | ستة 305 | سبعة 306 | ثمانية 307 | تسعة 308 | عشرة 309 | أحد 310 | اثنا 311 | اثني 312 | إحدى 313 | ثلاث 314 | أربع 315 | خمس 316 | ست 317 | سبع 318 | ثماني 319 | تسع 320 | عشر 321 | ثمان 322 | سبت 323 | أحد 324 | اثنين 325 | ثلاثاء 326 | أربعاء 327 | خميس 328 | جمعة 329 | أول 330 | ثان 331 | ثاني 332 | ثالث 333 | رابع 334 | خامس 335 | سادس 336 | سابع 337 | ثامن 338 | تاسع 339 | عاشر 340 | حادي 341 | أ 342 | ب 343 | ت 344 | ث 345 | ج 346 | ح 347 | خ 348 | د 349 | ذ 350 | ر 351 | ز 352 | س 353 | ش 354 | ص 355 | ض 356 | ط 357 | ظ 358 | ع 359 | غ 360 | ف 361 | ق 362 | ك 363 | ل 364 | م 365 | ن 366 | ه 367 | و 368 | ي 369 | ء 370 | ى 371 | آ 372 | ؤ 373 | ئ 374 | أ 375 | ة 376 | ألف 377 | باء 378 | تاء 379 | ثاء 380 | جيم 381 | حاء 382 | خاء 383 | دال 384 | ذال 385 | راء 386 | زاي 387 | سين 388 | شين 389 | صاد 390 | ضاد 391 | طاء 392 | ظاء 393 | عين 394 | غين 395 | فاء 396 | قاف 397 | كاف 398 | لام 399 | ميم 400 | نون 401 | هاء 402 | واو 403 | ياء 404 | همزة 405 | ي 406 | نا 407 | ك 408 | كن 409 | ه 410 | إياه 411 | إياها 412 | إياهما 413 | إياهم 414 | إياهن 415 | إياك 416 | إياكما 417 | إياكم 418 | إياك 419 | إياكن 420 | إياي 421 | إيانا 422 | أولالك 423 | تانِ 424 | تانِك 425 | تِه 426 | تِي 427 | تَيْنِ 428 | ثمّ 429 | ثمّة 430 | ذانِ 431 | ذِه 432 | ذِي 433 | ذَيْنِ 434 | هَؤلاء 435 | هَاتانِ 436 | هَاتِه 437 | هَاتِي 438 | هَاتَيْنِ 439 | هَذا 440 | هَذانِ 441 | هَذِه 442 | هَذِي 443 | هَذَيْنِ 444 | الألى 445 | الألاء 446 | أل 447 | أنّى 448 | أيّ 449 | ّأيّان 450 | أنّى 451 | أيّ 452 | ّأيّان 453 | ذيت 454 | كأيّ 455 | كأيّن 456 | بضع 457 | فلان 458 | وا 459 | آمينَ 460 | آهِ 461 | آهٍ 462 | آهاً 463 | أُفٍّ 464 | أُفٍّ 465 | أفٍّ 466 | أمامك 467 | أمامكَ 468 | أوّهْ 469 | إلَيْكَ 470 | إلَيْكَ 471 | إليكَ 472 | إليكنّ 473 | إيهٍ 474 | بخٍ 475 | بسّ 476 | بَسْ 477 | بطآن 478 | بَلْهَ 479 | حاي 480 | حَذارِ 481 | حيَّ 482 | حيَّ 483 | دونك 484 | رويدك 485 | سرعان 486 | شتانَ 487 | شَتَّانَ 488 | صهْ 489 | صهٍ 490 | طاق 491 | طَق 492 | عَدَسْ 493 | كِخ 494 | مكانَك 495 | مكانَك 496 | مكانَك 497 | مكانكم 498 | مكانكما 499 | مكانكنّ 500 | نَخْ 501 | هاكَ 502 | هَجْ 503 | هلم 504 | هيّا 505 | هَيْهات 506 | وا 507 | واهاً 508 | وراءَك 509 | وُشْكَانَ 510 | وَيْ 511 | يفعلان 512 | تفعلان 513 | يفعلون 514 | تفعلون 515 | تفعلين 516 | اتخذ 517 | ألفى 518 | تخذ 519 | ترك 520 | تعلَّم 521 | جعل 522 | حجا 523 | حبيب 524 | خال 525 | حسب 526 | خال 527 | درى 528 | رأى 529 | زعم 530 | صبر 531 | ظنَّ 532 | عدَّ 533 | علم 534 | غادر 535 | ذهب 536 | وجد 537 | ورد 538 | وهب 539 | أسكن 540 | أطعم 541 | أعطى 542 | رزق 543 | زود 544 | سقى 545 | كسا 546 | أخبر 547 | أرى 548 | أعلم 549 | أنبأ 550 | حدَث 551 | خبَّر 552 | نبَّا 553 | أفعل به 554 | ما أفعله 555 | بئس 556 | ساء 557 | طالما 558 | قلما 559 | لات 560 | لكنَّ 561 | ءَ 562 | أجل 563 | إذاً 564 | أمّا 565 | إمّا 566 | إنَّ 567 | أنًّ 568 | أى 569 | إى 570 | أيا 571 | ب 572 | ثمَّ 573 | جلل 574 | جير 575 | رُبَّ 576 | س 577 | علًّ 578 | ف 579 | كأنّ 580 | كلَّا 581 | كى 582 | ل 583 | لات 584 | لعلَّ 585 | لكنَّ 586 | لكنَّ 587 | م 588 | نَّ 589 | هلّا 590 | وا 591 | أل 592 | إلّا 593 | ت 594 | ك 595 | لمّا 596 | ن 597 | ه 598 | و 599 | ا 600 | ي 601 | تجاه 602 | تلقاء 603 | جميع 604 | حسب 605 | سبحان 606 | شبه 607 | لعمر 608 | مثل 609 | معاذ 610 | أبو 611 | أخو 612 | حمو 613 | فو 614 | مئة 615 | مئتان 616 | ثلاثمئة 617 | أربعمئة 618 | خمسمئة 619 | ستمئة 620 | سبعمئة 621 | ثمنمئة 622 | تسعمئة 623 | مائة 624 | ثلاثمائة 625 | أربعمائة 626 | خمسمائة 627 | ستمائة 628 | سبعمائة 629 | ثمانمئة 630 | تسعمائة 631 | عشرون 632 | ثلاثون 633 | اربعون 634 | خمسون 635 | ستون 636 | سبعون 637 | ثمانون 638 | تسعون 639 | عشرين 640 | ثلاثين 641 | اربعين 642 | خمسين 643 | ستين 644 | سبعين 645 | ثمانين 646 | تسعين 647 | بضع 648 | نيف 649 | أجمع 650 | جميع 651 | عامة 652 | عين 653 | نفس 654 | لا سيما 655 | أصلا 656 | أهلا 657 | أيضا 658 | بؤسا 659 | بعدا 660 | بغتة 661 | تعسا 662 | حقا 663 | حمدا 664 | خلافا 665 | خاصة 666 | دواليك 667 | سحقا 668 | سرا 669 | سمعا 670 | صبرا 671 | صدقا 672 | صراحة 673 | طرا 674 | عجبا 675 | عيانا 676 | غالبا 677 | فرادى 678 | فضلا 679 | قاطبة 680 | كثيرا 681 | لبيك 682 | معاذ 683 | أبدا 684 | إزاء 685 | أصلا 686 | الآن 687 | أمد 688 | أمس 689 | آنفا 690 | آناء 691 | أنّى 692 | أول 693 | أيّان 694 | تارة 695 | ثمّ 696 | ثمّة 697 | حقا 698 | صباح 699 | مساء 700 | ضحوة 701 | عوض 702 | غدا 703 | غداة 704 | قطّ 705 | كلّما 706 | لدن 707 | لمّا 708 | مرّة 709 | قبل 710 | خلف 711 | أمام 712 | فوق 713 | تحت 714 | يمين 715 | شمال 716 | ارتدّ 717 | استحال 718 | أصبح 719 | أضحى 720 | آض 721 | أمسى 722 | انقلب 723 | بات 724 | تبدّل 725 | تحوّل 726 | حار 727 | رجع 728 | راح 729 | صار 730 | ظلّ 731 | عاد 732 | غدا 733 | كان 734 | ما انفك 735 | ما برح 736 | مادام 737 | مازال 738 | مافتئ 739 | ابتدأ 740 | أخذ 741 | اخلولق 742 | أقبل 743 | انبرى 744 | أنشأ 745 | أوشك 746 | جعل 747 | حرى 748 | شرع 749 | طفق 750 | علق 751 | قام 752 | كرب 753 | كاد 754 | هبّ -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/azerbaijani: -------------------------------------------------------------------------------- 1 | a 2 | ad 3 | altı 4 | altmış 5 | amma 6 | arasında 7 | artıq 8 | ay 9 | az 10 | bax 11 | belə 12 | bəli 13 | bəlkə 14 | beş 15 | bəy 16 | bəzən 17 | bəzi 18 | bilər 19 | bir 20 | biraz 21 | biri 22 | birşey 23 | biz 24 | bizim 25 | bizlər 26 | bu 27 | buna 28 | bundan 29 | bunların 30 | bunu 31 | bunun 32 | buradan 33 | bütün 34 | ci 35 | cı 36 | çox 37 | cu 38 | cü 39 | çünki 40 | da 41 | daha 42 | də 43 | dedi 44 | dək 45 | dən 46 | dəqiqə 47 | deyil 48 | dir 49 | doqquz 50 | doqsan 51 | dörd 52 | düz 53 | ə 54 | edən 55 | edir 56 | əgər 57 | əlbəttə 58 | elə 59 | əlli 60 | ən 61 | əslində 62 | et 63 | etdi 64 | etmə 65 | etmək 66 | faiz 67 | gilə 68 | görə 69 | ha 70 | haqqında 71 | harada 72 | hə 73 | heç 74 | həm 75 | həmin 76 | həmişə 77 | hər 78 | ı 79 | idi 80 | iki 81 | il 82 | ildə 83 | ilə 84 | ilk 85 | in 86 | indi 87 | isə 88 | istifadə 89 | iyirmi 90 | ki 91 | kim 92 | kimə 93 | kimi 94 | lakin 95 | lap 96 | məhz 97 | mən 98 | mənə 99 | mirşey 100 | nə 101 | nəhayət 102 | niyə 103 | o 104 | obirisi 105 | of 106 | olan 107 | olar 108 | olaraq 109 | oldu 110 | olduğu 111 | olmadı 112 | olmaz 113 | olmuşdur 114 | olsun 115 | olur 116 | on 117 | ona 118 | ondan 119 | onlar 120 | onlardan 121 | onların 122 | onsuzda 123 | onu 124 | onun 125 | oradan 126 | otuz 127 | öz 128 | özü 129 | qarşı 130 | qədər 131 | qırx 132 | saat 133 | sadəcə 134 | saniyə 135 | səhv 136 | səkkiz 137 | səksən 138 | sən 139 | sənə 140 | sənin 141 | siz 142 | sizin 143 | sizlər 144 | sonra 145 | təəssüf 146 | ü 147 | üç 148 | üçün 149 | var 150 | və 151 | xan 152 | xanım 153 | xeyr 154 | ya 155 | yalnız 156 | yaxşı 157 | yeddi 158 | yenə 159 | yəni 160 | yetmiş 161 | yox 162 | yoxdur 163 | yoxsa 164 | yüz 165 | zaman -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/basque: -------------------------------------------------------------------------------- 1 | ahala 2 | aitzitik 3 | al 4 | ala 5 | alabadere 6 | alabaina 7 | alabaina 8 | aldiz 9 | alta 10 | amaitu 11 | amaitzeko 12 | anitz 13 | antzina 14 | arabera 15 | arabera 16 | arabera 17 | argi 18 | arratsaldero 19 | arte 20 | artean 21 | asko 22 | aspaldiko 23 | aurrera 24 | aurrera 25 | azkenez 26 | azkenik 27 | azkenik 28 | ba 29 | bada 30 | bada 31 | bada 32 | bada 33 | badarik 34 | badarik 35 | badarik 36 | badere 37 | bai 38 | baina 39 | baina 40 | baina 41 | baino 42 | baino 43 | baino 44 | baino 45 | baita 46 | baizik 47 | baldin 48 | baldin 49 | barren 50 | bat 51 | batean 52 | batean 53 | batean 54 | batean 55 | batek 56 | baten 57 | batera 58 | batez 59 | bati 60 | batzuei 61 | batzuek 62 | batzuetan 63 | batzuk 64 | bazen 65 | bederen 66 | bederik 67 | beharrez 68 | behiala 69 | behin 70 | behin 71 | behin 72 | behin 73 | behinik 74 | behinola 75 | behintzat 76 | bera 77 | beraiek 78 | beranduago 79 | berau 80 | berauek 81 | beraz 82 | beraz 83 | bere 84 | berean 85 | berebat 86 | berehala 87 | berori 88 | beroriek 89 | berriro 90 | berriz 91 | bertzalde 92 | bertzenaz 93 | bestalde 94 | beste 95 | bestela 96 | besterik 97 | bezain 98 | bezala 99 | bide 100 | bien 101 | bigarrenez 102 | bigarrenik 103 | bitartean 104 | bitartean 105 | bizkitartean 106 | bukaeran 107 | bukatzeko 108 | da 109 | dago 110 | dago 111 | dela 112 | dela 113 | dela 114 | delarik 115 | den 116 | dena 117 | dena 118 | dezadan 119 | dira 120 | ditu 121 | du 122 | dute 123 | edo 124 | edo 125 | edota 126 | egin 127 | egin 128 | egun 129 | egun 130 | egunean 131 | emateko 132 | era 133 | erdi 134 | ere 135 | ere 136 | ere 137 | ere 138 | ere 139 | esan 140 | esan 141 | esanak 142 | esandakoaren 143 | eta 144 | eta 145 | eta 146 | eta 147 | eta 148 | eta 149 | eurak 150 | ez 151 | ez 152 | ez 153 | eze 154 | ezen 155 | ezer 156 | ezezik 157 | ezik 158 | ezpabere 159 | ezpada 160 | ezpere 161 | ezperen 162 | ezta 163 | funtsean 164 | gabe 165 | gain 166 | gainera 167 | gainera 168 | gainerontzean 169 | gaur 170 | gero 171 | gero 172 | gero 173 | geroago 174 | gisa 175 | gu 176 | gutxi 177 | guzti 178 | guztia 179 | guztiz 180 | haatik 181 | haiei 182 | haiek 183 | haietan 184 | hain 185 | hainbeste 186 | hainbestez 187 | hala 188 | hala 189 | hala 190 | halaber 191 | halako 192 | halatan 193 | han 194 | handik 195 | hango 196 | hara 197 | hargatik 198 | hari 199 | hark 200 | hartan 201 | hartan 202 | hasi 203 | hasi 204 | hasiera 205 | hasieran 206 | hasteaz 207 | hasteko 208 | hasteko 209 | hau 210 | hau 211 | hau 212 | hau 213 | hau 214 | hau 215 | hauei 216 | hauek 217 | hauetan 218 | hemen 219 | hemendik 220 | hemengo 221 | hi 222 | hona 223 | honebestez 224 | honek 225 | honela 226 | honela 227 | honela 228 | honen 229 | honen 230 | honetan 231 | honetaz 232 | honi 233 | hor 234 | hori 235 | hori 236 | hori 237 | horiei 238 | horiek 239 | horietan 240 | horko 241 | horra 242 | horratik 243 | horregatik 244 | horregatik 245 | horrek 246 | horrela 247 | horrela 248 | horrela 249 | horren 250 | horrenbestez 251 | horretan 252 | horri 253 | hortaz 254 | hortaz 255 | hortik 256 | hura 257 | ikusi 258 | ikusi 259 | izan 260 | izan 261 | izan 262 | jarraituz 263 | kariaz 264 | kasuaz 265 | kontuan 266 | laburbilduz 267 | laburki 268 | laster 269 | laster 270 | lehen 271 | lehen 272 | lehen 273 | lehen 274 | lehenengo 275 | lehenengo 276 | lehenik 277 | lehen-lehenik 278 | litzateke 279 | medio 280 | mendean 281 | mundura 282 | nahiz 283 | ni 284 | noiz 285 | nola 286 | non 287 | nondik 288 | nongo 289 | nor 290 | nora 291 | on 292 | ondoren 293 | ondorio 294 | ondorioz 295 | ondorioz 296 | orain 297 | ordea 298 | orduan 299 | orduan 300 | orduan 301 | orduko 302 | ordura 303 | orobat 304 | ostean 305 | ostera 306 | osterantzean 307 | pentsatuz 308 | ustez 309 | ze 310 | zein 311 | zein 312 | zen 313 | zen 314 | zenbait 315 | zenbat 316 | zer 317 | zeren 318 | zergatik 319 | zergatik 320 | ziren 321 | zituen 322 | zu 323 | zuek 324 | zuen 325 | zuten 326 | zuzen 327 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/bengali: -------------------------------------------------------------------------------- 1 | অতএব 2 | অথচ 3 | অথবা 4 | অনুযায়ী 5 | অনেক 6 | অনেকে 7 | অনেকেই 8 | অন্তত 9 | অন্য 10 | অবধি 11 | অবশ্য 12 | অর্থাত 13 | আই 14 | আগামী 15 | আগে 16 | আগেই 17 | আছে 18 | আজ 19 | আদ্যভাগে 20 | আপনার 21 | আপনি 22 | আবার 23 | আমরা 24 | আমাকে 25 | আমাদের 26 | আমার 27 | আমি 28 | আর 29 | আরও 30 | ই 31 | ইত্যাদি 32 | ইহা 33 | উচিত 34 | উত্তর 35 | উনি 36 | উপর 37 | উপরে 38 | এ 39 | এঁদের 40 | এঁরা 41 | এই 42 | একই 43 | একটি 44 | একবার 45 | একে 46 | এক্ 47 | এখন 48 | এখনও 49 | এখানে 50 | এখানেই 51 | এটা 52 | এটাই 53 | এটি 54 | এত 55 | এতটাই 56 | এতে 57 | এদের 58 | এব 59 | এবং 60 | এবার 61 | এমন 62 | এমনকী 63 | এমনি 64 | এর 65 | এরা 66 | এল 67 | এস 68 | এসে 69 | ঐ 70 | ও 71 | ওঁদের 72 | ওঁর 73 | ওঁরা 74 | ওই 75 | ওকে 76 | ওখানে 77 | ওদের 78 | ওর 79 | ওরা 80 | কখনও 81 | কত 82 | কবে 83 | কমনে 84 | কয়েক 85 | কয়েকটি 86 | করছে 87 | করছেন 88 | করতে 89 | করবে 90 | করবেন 91 | করলে 92 | করলেন 93 | করা 94 | করাই 95 | করায় 96 | করার 97 | করি 98 | করিতে 99 | করিয়া 100 | করিয়ে 101 | করে 102 | করেই 103 | করেছিলেন 104 | করেছে 105 | করেছেন 106 | করেন 107 | কাউকে 108 | কাছ 109 | কাছে 110 | কাজ 111 | কাজে 112 | কারও 113 | কারণ 114 | কি 115 | কিংবা 116 | কিছু 117 | কিছুই 118 | কিন্তু 119 | কী 120 | কে 121 | কেউ 122 | কেউই 123 | কেখা 124 | কেন 125 | কোটি 126 | কোন 127 | কোনও 128 | কোনো 129 | ক্ষেত্রে 130 | কয়েক 131 | খুব 132 | গিয়ে 133 | গিয়েছে 134 | গিয়ে 135 | গুলি 136 | গেছে 137 | গেল 138 | গেলে 139 | গোটা 140 | চলে 141 | চান 142 | চায় 143 | চার 144 | চালু 145 | চেয়ে 146 | চেষ্টা 147 | ছাড়া 148 | ছাড়াও 149 | ছিল 150 | ছিলেন 151 | জন 152 | জনকে 153 | জনের 154 | জন্য 155 | জন্যওজে 156 | জানতে 157 | জানা 158 | জানানো 159 | জানায় 160 | জানিয়ে 161 | জানিয়েছে 162 | জে 163 | জ্নজন 164 | টি 165 | ঠিক 166 | তখন 167 | তত 168 | তথা 169 | তবু 170 | তবে 171 | তা 172 | তাঁকে 173 | তাঁদের 174 | তাঁর 175 | তাঁরা 176 | তাঁাহারা 177 | তাই 178 | তাও 179 | তাকে 180 | তাতে 181 | তাদের 182 | তার 183 | তারপর 184 | তারা 185 | তারৈ 186 | তাহলে 187 | তাহা 188 | তাহাতে 189 | তাহার 190 | তিনঐ 191 | তিনি 192 | তিনিও 193 | তুমি 194 | তুলে 195 | তেমন 196 | তো 197 | তোমার 198 | থাকবে 199 | থাকবেন 200 | থাকা 201 | থাকায় 202 | থাকে 203 | থাকেন 204 | থেকে 205 | থেকেই 206 | থেকেও 207 | দিকে 208 | দিতে 209 | দিন 210 | দিয়ে 211 | দিয়েছে 212 | দিয়েছেন 213 | দিলেন 214 | দু 215 | দুই 216 | দুটি 217 | দুটো 218 | দেওয়া 219 | দেওয়ার 220 | দেওয়া 221 | দেখতে 222 | দেখা 223 | দেখে 224 | দেন 225 | দেয় 226 | দ্বারা 227 | ধরা 228 | ধরে 229 | ধামার 230 | নতুন 231 | নয় 232 | না 233 | নাই 234 | নাকি 235 | নাগাদ 236 | নানা 237 | নিজে 238 | নিজেই 239 | নিজেদের 240 | নিজের 241 | নিতে 242 | নিয়ে 243 | নিয়ে 244 | নেই 245 | নেওয়া 246 | নেওয়ার 247 | নেওয়া 248 | নয় 249 | পক্ষে 250 | পর 251 | পরে 252 | পরেই 253 | পরেও 254 | পর্যন্ত 255 | পাওয়া 256 | পাচ 257 | পারি 258 | পারে 259 | পারেন 260 | পি 261 | পেয়ে 262 | পেয়্র্ 263 | প্রতি 264 | প্রথম 265 | প্রভৃতি 266 | প্রযন্ত 267 | প্রাথমিক 268 | প্রায় 269 | প্রায় 270 | ফলে 271 | ফিরে 272 | ফের 273 | বক্তব্য 274 | বদলে 275 | বন 276 | বরং 277 | বলতে 278 | বলল 279 | বললেন 280 | বলা 281 | বলে 282 | বলেছেন 283 | বলেন 284 | বসে 285 | বহু 286 | বা 287 | বাদে 288 | বার 289 | বি 290 | বিনা 291 | বিভিন্ন 292 | বিশেষ 293 | বিষয়টি 294 | বেশ 295 | বেশি 296 | ব্যবহার 297 | ব্যাপারে 298 | ভাবে 299 | ভাবেই 300 | মতো 301 | মতোই 302 | মধ্যভাগে 303 | মধ্যে 304 | মধ্যেই 305 | মধ্যেও 306 | মনে 307 | মাত্র 308 | মাধ্যমে 309 | মোট 310 | মোটেই 311 | যখন 312 | যত 313 | যতটা 314 | যথেষ্ট 315 | যদি 316 | যদিও 317 | যা 318 | যাঁর 319 | যাঁরা 320 | যাওয়া 321 | যাওয়ার 322 | যাওয়া 323 | যাকে 324 | যাচ্ছে 325 | যাতে 326 | যাদের 327 | যান 328 | যাবে 329 | যায় 330 | যার 331 | যারা 332 | যিনি 333 | যে 334 | যেখানে 335 | যেতে 336 | যেন 337 | যেমন 338 | র 339 | রকম 340 | রয়েছে 341 | রাখা 342 | রেখে 343 | লক্ষ 344 | শুধু 345 | শুরু 346 | সঙ্গে 347 | সঙ্গেও 348 | সব 349 | সবার 350 | সমস্ত 351 | সম্প্রতি 352 | সহ 353 | সহিত 354 | সাধারণ 355 | সামনে 356 | সি 357 | সুতরাং 358 | সে 359 | সেই 360 | সেখান 361 | সেখানে 362 | সেটা 363 | সেটাই 364 | সেটাও 365 | সেটি 366 | স্পষ্ট 367 | স্বয়ং 368 | হইতে 369 | হইবে 370 | হইয়া 371 | হওয়া 372 | হওয়ায় 373 | হওয়ার 374 | হচ্ছে 375 | হত 376 | হতে 377 | হতেই 378 | হন 379 | হবে 380 | হবেন 381 | হয় 382 | হয়তো 383 | হয়নি 384 | হয়ে 385 | হয়েই 386 | হয়েছিল 387 | হয়েছে 388 | হয়েছেন 389 | হল 390 | হলে 391 | হলেই 392 | হলেও 393 | হলো 394 | হাজার 395 | হিসাবে 396 | হৈলে 397 | হোক 398 | হয় -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/catalan: -------------------------------------------------------------------------------- 1 | a 2 | abans 3 | ací 4 | ah 5 | així 6 | això 7 | al 8 | aleshores 9 | algun 10 | alguna 11 | algunes 12 | alguns 13 | alhora 14 | allà 15 | allí 16 | allò 17 | als 18 | altra 19 | altre 20 | altres 21 | amb 22 | ambdues 23 | ambdós 24 | anar 25 | ans 26 | apa 27 | aquell 28 | aquella 29 | aquelles 30 | aquells 31 | aquest 32 | aquesta 33 | aquestes 34 | aquests 35 | aquí 36 | baix 37 | bastant 38 | bé 39 | cada 40 | cadascuna 41 | cadascunes 42 | cadascuns 43 | cadascú 44 | com 45 | consegueixo 46 | conseguim 47 | conseguir 48 | consigueix 49 | consigueixen 50 | consigueixes 51 | contra 52 | d'un 53 | d'una 54 | d'unes 55 | d'uns 56 | dalt 57 | de 58 | del 59 | dels 60 | des 61 | des de 62 | després 63 | dins 64 | dintre 65 | donat 66 | doncs 67 | durant 68 | e 69 | eh 70 | el 71 | elles 72 | ells 73 | els 74 | em 75 | en 76 | encara 77 | ens 78 | entre 79 | era 80 | erem 81 | eren 82 | eres 83 | es 84 | esta 85 | estan 86 | estat 87 | estava 88 | estaven 89 | estem 90 | esteu 91 | estic 92 | està 93 | estàvem 94 | estàveu 95 | et 96 | etc 97 | ets 98 | fa 99 | faig 100 | fan 101 | fas 102 | fem 103 | fer 104 | feu 105 | fi 106 | fins 107 | fora 108 | gairebé 109 | ha 110 | han 111 | has 112 | haver 113 | havia 114 | he 115 | hem 116 | heu 117 | hi 118 | ho 119 | i 120 | igual 121 | iguals 122 | inclòs 123 | ja 124 | jo 125 | l'hi 126 | la 127 | les 128 | li 129 | li'n 130 | llarg 131 | llavors 132 | m'he 133 | ma 134 | mal 135 | malgrat 136 | mateix 137 | mateixa 138 | mateixes 139 | mateixos 140 | me 141 | mentre 142 | meu 143 | meus 144 | meva 145 | meves 146 | mode 147 | molt 148 | molta 149 | moltes 150 | molts 151 | mon 152 | mons 153 | més 154 | n'he 155 | n'hi 156 | ne 157 | ni 158 | no 159 | nogensmenys 160 | només 161 | nosaltres 162 | nostra 163 | nostre 164 | nostres 165 | o 166 | oh 167 | oi 168 | on 169 | pas 170 | pel 171 | pels 172 | per 173 | per que 174 | perquè 175 | però 176 | poc 177 | poca 178 | pocs 179 | podem 180 | poden 181 | poder 182 | podeu 183 | poques 184 | potser 185 | primer 186 | propi 187 | puc 188 | qual 189 | quals 190 | quan 191 | quant 192 | que 193 | quelcom 194 | qui 195 | quin 196 | quina 197 | quines 198 | quins 199 | què 200 | s'ha 201 | s'han 202 | sa 203 | sabem 204 | saben 205 | saber 206 | sabeu 207 | sap 208 | saps 209 | semblant 210 | semblants 211 | sense 212 | ser 213 | ses 214 | seu 215 | seus 216 | seva 217 | seves 218 | si 219 | sobre 220 | sobretot 221 | soc 222 | solament 223 | sols 224 | som 225 | son 226 | sons 227 | sota 228 | sou 229 | sóc 230 | són 231 | t'ha 232 | t'han 233 | t'he 234 | ta 235 | tal 236 | també 237 | tampoc 238 | tan 239 | tant 240 | tanta 241 | tantes 242 | te 243 | tene 244 | tenim 245 | tenir 246 | teniu 247 | teu 248 | teus 249 | teva 250 | teves 251 | tinc 252 | ton 253 | tons 254 | tot 255 | tota 256 | totes 257 | tots 258 | un 259 | una 260 | unes 261 | uns 262 | us 263 | va 264 | vaig 265 | vam 266 | van 267 | vas 268 | veu 269 | vosaltres 270 | vostra 271 | vostre 272 | vostres 273 | érem 274 | éreu 275 | és 276 | éssent 277 | últim 278 | ús -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/chinese: -------------------------------------------------------------------------------- 1 | 一 2 | 一下 3 | 一些 4 | 一切 5 | 一则 6 | 一天 7 | 一定 8 | 一方面 9 | 一旦 10 | 一时 11 | 一来 12 | 一样 13 | 一次 14 | 一片 15 | 一直 16 | 一致 17 | 一般 18 | 一起 19 | 一边 20 | 一面 21 | 万一 22 | 上下 23 | 上升 24 | 上去 25 | 上来 26 | 上述 27 | 上面 28 | 下列 29 | 下去 30 | 下来 31 | 下面 32 | 不一 33 | 不久 34 | 不仅 35 | 不会 36 | 不但 37 | 不光 38 | 不单 39 | 不变 40 | 不只 41 | 不可 42 | 不同 43 | 不够 44 | 不如 45 | 不得 46 | 不怕 47 | 不惟 48 | 不成 49 | 不拘 50 | 不敢 51 | 不断 52 | 不是 53 | 不比 54 | 不然 55 | 不特 56 | 不独 57 | 不管 58 | 不能 59 | 不要 60 | 不论 61 | 不足 62 | 不过 63 | 不问 64 | 与 65 | 与其 66 | 与否 67 | 与此同时 68 | 专门 69 | 且 70 | 两者 71 | 严格 72 | 严重 73 | 个 74 | 个人 75 | 个别 76 | 中小 77 | 中间 78 | 丰富 79 | 临 80 | 为 81 | 为主 82 | 为了 83 | 为什么 84 | 为什麽 85 | 为何 86 | 为着 87 | 主张 88 | 主要 89 | 举行 90 | 乃 91 | 乃至 92 | 么 93 | 之 94 | 之一 95 | 之前 96 | 之后 97 | 之後 98 | 之所以 99 | 之类 100 | 乌乎 101 | 乎 102 | 乘 103 | 也 104 | 也好 105 | 也是 106 | 也罢 107 | 了 108 | 了解 109 | 争取 110 | 于 111 | 于是 112 | 于是乎 113 | 云云 114 | 互相 115 | 产生 116 | 人们 117 | 人家 118 | 什么 119 | 什么样 120 | 什麽 121 | 今后 122 | 今天 123 | 今年 124 | 今後 125 | 仍然 126 | 从 127 | 从事 128 | 从而 129 | 他 130 | 他人 131 | 他们 132 | 他的 133 | 代替 134 | 以 135 | 以上 136 | 以下 137 | 以为 138 | 以便 139 | 以免 140 | 以前 141 | 以及 142 | 以后 143 | 以外 144 | 以後 145 | 以来 146 | 以至 147 | 以至于 148 | 以致 149 | 们 150 | 任 151 | 任何 152 | 任凭 153 | 任务 154 | 企图 155 | 伟大 156 | 似乎 157 | 似的 158 | 但 159 | 但是 160 | 何 161 | 何况 162 | 何处 163 | 何时 164 | 作为 165 | 你 166 | 你们 167 | 你的 168 | 使得 169 | 使用 170 | 例如 171 | 依 172 | 依照 173 | 依靠 174 | 促进 175 | 保持 176 | 俺 177 | 俺们 178 | 倘 179 | 倘使 180 | 倘或 181 | 倘然 182 | 倘若 183 | 假使 184 | 假如 185 | 假若 186 | 做到 187 | 像 188 | 允许 189 | 充分 190 | 先后 191 | 先後 192 | 先生 193 | 全部 194 | 全面 195 | 兮 196 | 共同 197 | 关于 198 | 其 199 | 其一 200 | 其中 201 | 其二 202 | 其他 203 | 其余 204 | 其它 205 | 其实 206 | 其次 207 | 具体 208 | 具体地说 209 | 具体说来 210 | 具有 211 | 再者 212 | 再说 213 | 冒 214 | 冲 215 | 决定 216 | 况且 217 | 准备 218 | 几 219 | 几乎 220 | 几时 221 | 凭 222 | 凭借 223 | 出去 224 | 出来 225 | 出现 226 | 分别 227 | 则 228 | 别 229 | 别的 230 | 别说 231 | 到 232 | 前后 233 | 前者 234 | 前进 235 | 前面 236 | 加之 237 | 加以 238 | 加入 239 | 加强 240 | 十分 241 | 即 242 | 即令 243 | 即使 244 | 即便 245 | 即或 246 | 即若 247 | 却不 248 | 原来 249 | 又 250 | 及 251 | 及其 252 | 及时 253 | 及至 254 | 双方 255 | 反之 256 | 反应 257 | 反映 258 | 反过来 259 | 反过来说 260 | 取得 261 | 受到 262 | 变成 263 | 另 264 | 另一方面 265 | 另外 266 | 只是 267 | 只有 268 | 只要 269 | 只限 270 | 叫 271 | 叫做 272 | 召开 273 | 叮咚 274 | 可 275 | 可以 276 | 可是 277 | 可能 278 | 可见 279 | 各 280 | 各个 281 | 各人 282 | 各位 283 | 各地 284 | 各种 285 | 各级 286 | 各自 287 | 合理 288 | 同 289 | 同一 290 | 同时 291 | 同样 292 | 后来 293 | 后面 294 | 向 295 | 向着 296 | 吓 297 | 吗 298 | 否则 299 | 吧 300 | 吧哒 301 | 吱 302 | 呀 303 | 呃 304 | 呕 305 | 呗 306 | 呜 307 | 呜呼 308 | 呢 309 | 周围 310 | 呵 311 | 呸 312 | 呼哧 313 | 咋 314 | 和 315 | 咚 316 | 咦 317 | 咱 318 | 咱们 319 | 咳 320 | 哇 321 | 哈 322 | 哈哈 323 | 哉 324 | 哎 325 | 哎呀 326 | 哎哟 327 | 哗 328 | 哟 329 | 哦 330 | 哩 331 | 哪 332 | 哪个 333 | 哪些 334 | 哪儿 335 | 哪天 336 | 哪年 337 | 哪怕 338 | 哪样 339 | 哪边 340 | 哪里 341 | 哼 342 | 哼唷 343 | 唉 344 | 啊 345 | 啐 346 | 啥 347 | 啦 348 | 啪达 349 | 喂 350 | 喏 351 | 喔唷 352 | 嗡嗡 353 | 嗬 354 | 嗯 355 | 嗳 356 | 嘎 357 | 嘎登 358 | 嘘 359 | 嘛 360 | 嘻 361 | 嘿 362 | 因 363 | 因为 364 | 因此 365 | 因而 366 | 固然 367 | 在 368 | 在下 369 | 地 370 | 坚决 371 | 坚持 372 | 基本 373 | 处理 374 | 复杂 375 | 多 376 | 多少 377 | 多数 378 | 多次 379 | 大力 380 | 大多数 381 | 大大 382 | 大家 383 | 大批 384 | 大约 385 | 大量 386 | 失去 387 | 她 388 | 她们 389 | 她的 390 | 好的 391 | 好象 392 | 如 393 | 如上所述 394 | 如下 395 | 如何 396 | 如其 397 | 如果 398 | 如此 399 | 如若 400 | 存在 401 | 宁 402 | 宁可 403 | 宁愿 404 | 宁肯 405 | 它 406 | 它们 407 | 它们的 408 | 它的 409 | 安全 410 | 完全 411 | 完成 412 | 实现 413 | 实际 414 | 宣布 415 | 容易 416 | 密切 417 | 对 418 | 对于 419 | 对应 420 | 将 421 | 少数 422 | 尔后 423 | 尚且 424 | 尤其 425 | 就 426 | 就是 427 | 就是说 428 | 尽 429 | 尽管 430 | 属于 431 | 岂但 432 | 左右 433 | 巨大 434 | 巩固 435 | 己 436 | 已经 437 | 帮助 438 | 常常 439 | 并 440 | 并不 441 | 并不是 442 | 并且 443 | 并没有 444 | 广大 445 | 广泛 446 | 应当 447 | 应用 448 | 应该 449 | 开外 450 | 开始 451 | 开展 452 | 引起 453 | 强烈 454 | 强调 455 | 归 456 | 当 457 | 当前 458 | 当时 459 | 当然 460 | 当着 461 | 形成 462 | 彻底 463 | 彼 464 | 彼此 465 | 往 466 | 往往 467 | 待 468 | 後来 469 | 後面 470 | 得 471 | 得出 472 | 得到 473 | 心里 474 | 必然 475 | 必要 476 | 必须 477 | 怎 478 | 怎么 479 | 怎么办 480 | 怎么样 481 | 怎样 482 | 怎麽 483 | 总之 484 | 总是 485 | 总的来看 486 | 总的来说 487 | 总的说来 488 | 总结 489 | 总而言之 490 | 恰恰相反 491 | 您 492 | 意思 493 | 愿意 494 | 慢说 495 | 成为 496 | 我 497 | 我们 498 | 我的 499 | 或 500 | 或是 501 | 或者 502 | 战斗 503 | 所 504 | 所以 505 | 所有 506 | 所谓 507 | 打 508 | 扩大 509 | 把 510 | 抑或 511 | 拿 512 | 按 513 | 按照 514 | 换句话说 515 | 换言之 516 | 据 517 | 掌握 518 | 接着 519 | 接著 520 | 故 521 | 故此 522 | 整个 523 | 方便 524 | 方面 525 | 旁人 526 | 无宁 527 | 无法 528 | 无论 529 | 既 530 | 既是 531 | 既然 532 | 时候 533 | 明显 534 | 明确 535 | 是 536 | 是否 537 | 是的 538 | 显然 539 | 显著 540 | 普通 541 | 普遍 542 | 更加 543 | 曾经 544 | 替 545 | 最后 546 | 最大 547 | 最好 548 | 最後 549 | 最近 550 | 最高 551 | 有 552 | 有些 553 | 有关 554 | 有利 555 | 有力 556 | 有所 557 | 有效 558 | 有时 559 | 有点 560 | 有的 561 | 有着 562 | 有著 563 | 望 564 | 朝 565 | 朝着 566 | 本 567 | 本着 568 | 来 569 | 来着 570 | 极了 571 | 构成 572 | 果然 573 | 果真 574 | 某 575 | 某个 576 | 某些 577 | 根据 578 | 根本 579 | 欢迎 580 | 正在 581 | 正如 582 | 正常 583 | 此 584 | 此外 585 | 此时 586 | 此间 587 | 毋宁 588 | 每 589 | 每个 590 | 每天 591 | 每年 592 | 每当 593 | 比 594 | 比如 595 | 比方 596 | 比较 597 | 毫不 598 | 没有 599 | 沿 600 | 沿着 601 | 注意 602 | 深入 603 | 清楚 604 | 满足 605 | 漫说 606 | 焉 607 | 然则 608 | 然后 609 | 然後 610 | 然而 611 | 照 612 | 照着 613 | 特别是 614 | 特殊 615 | 特点 616 | 现代 617 | 现在 618 | 甚么 619 | 甚而 620 | 甚至 621 | 用 622 | 由 623 | 由于 624 | 由此可见 625 | 的 626 | 的话 627 | 目前 628 | 直到 629 | 直接 630 | 相似 631 | 相信 632 | 相反 633 | 相同 634 | 相对 635 | 相对而言 636 | 相应 637 | 相当 638 | 相等 639 | 省得 640 | 看出 641 | 看到 642 | 看来 643 | 看看 644 | 看见 645 | 真是 646 | 真正 647 | 着 648 | 着呢 649 | 矣 650 | 知道 651 | 确定 652 | 离 653 | 积极 654 | 移动 655 | 突出 656 | 突然 657 | 立即 658 | 第 659 | 等 660 | 等等 661 | 管 662 | 紧接着 663 | 纵 664 | 纵令 665 | 纵使 666 | 纵然 667 | 练习 668 | 组成 669 | 经 670 | 经常 671 | 经过 672 | 结合 673 | 结果 674 | 给 675 | 绝对 676 | 继续 677 | 继而 678 | 维持 679 | 综上所述 680 | 罢了 681 | 考虑 682 | 者 683 | 而 684 | 而且 685 | 而况 686 | 而外 687 | 而已 688 | 而是 689 | 而言 690 | 联系 691 | 能 692 | 能否 693 | 能够 694 | 腾 695 | 自 696 | 自个儿 697 | 自从 698 | 自各儿 699 | 自家 700 | 自己 701 | 自身 702 | 至 703 | 至于 704 | 良好 705 | 若 706 | 若是 707 | 若非 708 | 范围 709 | 莫若 710 | 获得 711 | 虽 712 | 虽则 713 | 虽然 714 | 虽说 715 | 行为 716 | 行动 717 | 表明 718 | 表示 719 | 被 720 | 要 721 | 要不 722 | 要不是 723 | 要不然 724 | 要么 725 | 要是 726 | 要求 727 | 规定 728 | 觉得 729 | 认为 730 | 认真 731 | 认识 732 | 让 733 | 许多 734 | 论 735 | 设使 736 | 设若 737 | 该 738 | 说明 739 | 诸位 740 | 谁 741 | 谁知 742 | 赶 743 | 起 744 | 起来 745 | 起见 746 | 趁 747 | 趁着 748 | 越是 749 | 跟 750 | 转动 751 | 转变 752 | 转贴 753 | 较 754 | 较之 755 | 边 756 | 达到 757 | 迅速 758 | 过 759 | 过去 760 | 过来 761 | 运用 762 | 还是 763 | 还有 764 | 这 765 | 这个 766 | 这么 767 | 这么些 768 | 这么样 769 | 这么点儿 770 | 这些 771 | 这会儿 772 | 这儿 773 | 这就是说 774 | 这时 775 | 这样 776 | 这点 777 | 这种 778 | 这边 779 | 这里 780 | 这麽 781 | 进入 782 | 进步 783 | 进而 784 | 进行 785 | 连 786 | 连同 787 | 适应 788 | 适当 789 | 适用 790 | 逐步 791 | 逐渐 792 | 通常 793 | 通过 794 | 造成 795 | 遇到 796 | 遭到 797 | 避免 798 | 那 799 | 那个 800 | 那么 801 | 那么些 802 | 那么样 803 | 那些 804 | 那会儿 805 | 那儿 806 | 那时 807 | 那样 808 | 那边 809 | 那里 810 | 那麽 811 | 部分 812 | 鄙人 813 | 采取 814 | 里面 815 | 重大 816 | 重新 817 | 重要 818 | 鉴于 819 | 问题 820 | 防止 821 | 阿 822 | 附近 823 | 限制 824 | 除 825 | 除了 826 | 除此之外 827 | 除非 828 | 随 829 | 随着 830 | 随著 831 | 集中 832 | 需要 833 | 非但 834 | 非常 835 | 非徒 836 | 靠 837 | 顺 838 | 顺着 839 | 首先 840 | 高兴 841 | 是不是 842 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/danish: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | den 8 | til 9 | er 10 | som 11 | på 12 | de 13 | med 14 | han 15 | af 16 | for 17 | ikke 18 | der 19 | var 20 | mig 21 | sig 22 | men 23 | et 24 | har 25 | om 26 | vi 27 | min 28 | havde 29 | ham 30 | hun 31 | nu 32 | over 33 | da 34 | fra 35 | du 36 | ud 37 | sin 38 | dem 39 | os 40 | op 41 | man 42 | hans 43 | hvor 44 | eller 45 | hvad 46 | skal 47 | selv 48 | her 49 | alle 50 | vil 51 | blev 52 | kunne 53 | ind 54 | når 55 | være 56 | dog 57 | noget 58 | ville 59 | jo 60 | deres 61 | efter 62 | ned 63 | skulle 64 | denne 65 | end 66 | dette 67 | mit 68 | også 69 | under 70 | have 71 | dig 72 | anden 73 | hende 74 | mine 75 | alt 76 | meget 77 | sit 78 | sine 79 | vor 80 | mod 81 | disse 82 | hvis 83 | din 84 | nogle 85 | hos 86 | blive 87 | mange 88 | ad 89 | bliver 90 | hendes 91 | været 92 | thi 93 | jer 94 | sådan 95 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/dutch: -------------------------------------------------------------------------------- 1 | de 2 | en 3 | van 4 | ik 5 | te 6 | dat 7 | die 8 | in 9 | een 10 | hij 11 | het 12 | niet 13 | zijn 14 | is 15 | was 16 | op 17 | aan 18 | met 19 | als 20 | voor 21 | had 22 | er 23 | maar 24 | om 25 | hem 26 | dan 27 | zou 28 | of 29 | wat 30 | mijn 31 | men 32 | dit 33 | zo 34 | door 35 | over 36 | ze 37 | zich 38 | bij 39 | ook 40 | tot 41 | je 42 | mij 43 | uit 44 | der 45 | daar 46 | haar 47 | naar 48 | heb 49 | hoe 50 | heeft 51 | hebben 52 | deze 53 | u 54 | want 55 | nog 56 | zal 57 | me 58 | zij 59 | nu 60 | ge 61 | geen 62 | omdat 63 | iets 64 | worden 65 | toch 66 | al 67 | waren 68 | veel 69 | meer 70 | doen 71 | toen 72 | moet 73 | ben 74 | zonder 75 | kan 76 | hun 77 | dus 78 | alles 79 | onder 80 | ja 81 | eens 82 | hier 83 | wie 84 | werd 85 | altijd 86 | doch 87 | wordt 88 | wezen 89 | kunnen 90 | ons 91 | zelf 92 | tegen 93 | na 94 | reeds 95 | wil 96 | kon 97 | niets 98 | uw 99 | iemand 100 | geweest 101 | andere 102 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/english: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | you're 11 | you've 12 | you'll 13 | you'd 14 | your 15 | yours 16 | yourself 17 | yourselves 18 | he 19 | him 20 | his 21 | himself 22 | she 23 | she's 24 | her 25 | hers 26 | herself 27 | it 28 | it's 29 | its 30 | itself 31 | they 32 | them 33 | their 34 | theirs 35 | themselves 36 | what 37 | which 38 | who 39 | whom 40 | this 41 | that 42 | that'll 43 | these 44 | those 45 | am 46 | is 47 | are 48 | was 49 | were 50 | be 51 | been 52 | being 53 | have 54 | has 55 | had 56 | having 57 | do 58 | does 59 | did 60 | doing 61 | a 62 | an 63 | the 64 | and 65 | but 66 | if 67 | or 68 | because 69 | as 70 | until 71 | while 72 | of 73 | at 74 | by 75 | for 76 | with 77 | about 78 | against 79 | between 80 | into 81 | through 82 | during 83 | before 84 | after 85 | above 86 | below 87 | to 88 | from 89 | up 90 | down 91 | in 92 | out 93 | on 94 | off 95 | over 96 | under 97 | again 98 | further 99 | then 100 | once 101 | here 102 | there 103 | when 104 | where 105 | why 106 | how 107 | all 108 | any 109 | both 110 | each 111 | few 112 | more 113 | most 114 | other 115 | some 116 | such 117 | no 118 | nor 119 | not 120 | only 121 | own 122 | same 123 | so 124 | than 125 | too 126 | very 127 | s 128 | t 129 | can 130 | will 131 | just 132 | don 133 | don't 134 | should 135 | should've 136 | now 137 | d 138 | ll 139 | m 140 | o 141 | re 142 | ve 143 | y 144 | ain 145 | aren 146 | aren't 147 | couldn 148 | couldn't 149 | didn 150 | didn't 151 | doesn 152 | doesn't 153 | hadn 154 | hadn't 155 | hasn 156 | hasn't 157 | haven 158 | haven't 159 | isn 160 | isn't 161 | ma 162 | mightn 163 | mightn't 164 | mustn 165 | mustn't 166 | needn 167 | needn't 168 | shan 169 | shan't 170 | shouldn 171 | shouldn't 172 | wasn 173 | wasn't 174 | weren 175 | weren't 176 | won 177 | won't 178 | wouldn 179 | wouldn't 180 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/finnish: -------------------------------------------------------------------------------- 1 | olla 2 | olen 3 | olet 4 | on 5 | olemme 6 | olette 7 | ovat 8 | ole 9 | oli 10 | olisi 11 | olisit 12 | olisin 13 | olisimme 14 | olisitte 15 | olisivat 16 | olit 17 | olin 18 | olimme 19 | olitte 20 | olivat 21 | ollut 22 | olleet 23 | en 24 | et 25 | ei 26 | emme 27 | ette 28 | eivät 29 | minä 30 | minun 31 | minut 32 | minua 33 | minussa 34 | minusta 35 | minuun 36 | minulla 37 | minulta 38 | minulle 39 | sinä 40 | sinun 41 | sinut 42 | sinua 43 | sinussa 44 | sinusta 45 | sinuun 46 | sinulla 47 | sinulta 48 | sinulle 49 | hän 50 | hänen 51 | hänet 52 | häntä 53 | hänessä 54 | hänestä 55 | häneen 56 | hänellä 57 | häneltä 58 | hänelle 59 | me 60 | meidän 61 | meidät 62 | meitä 63 | meissä 64 | meistä 65 | meihin 66 | meillä 67 | meiltä 68 | meille 69 | te 70 | teidän 71 | teidät 72 | teitä 73 | teissä 74 | teistä 75 | teihin 76 | teillä 77 | teiltä 78 | teille 79 | he 80 | heidän 81 | heidät 82 | heitä 83 | heissä 84 | heistä 85 | heihin 86 | heillä 87 | heiltä 88 | heille 89 | tämä 90 | tämän 91 | tätä 92 | tässä 93 | tästä 94 | tähän 95 | tallä 96 | tältä 97 | tälle 98 | tänä 99 | täksi 100 | tuo 101 | tuon 102 | tuotä 103 | tuossa 104 | tuosta 105 | tuohon 106 | tuolla 107 | tuolta 108 | tuolle 109 | tuona 110 | tuoksi 111 | se 112 | sen 113 | sitä 114 | siinä 115 | siitä 116 | siihen 117 | sillä 118 | siltä 119 | sille 120 | sinä 121 | siksi 122 | nämä 123 | näiden 124 | näitä 125 | näissä 126 | näistä 127 | näihin 128 | näillä 129 | näiltä 130 | näille 131 | näinä 132 | näiksi 133 | nuo 134 | noiden 135 | noita 136 | noissa 137 | noista 138 | noihin 139 | noilla 140 | noilta 141 | noille 142 | noina 143 | noiksi 144 | ne 145 | niiden 146 | niitä 147 | niissä 148 | niistä 149 | niihin 150 | niillä 151 | niiltä 152 | niille 153 | niinä 154 | niiksi 155 | kuka 156 | kenen 157 | kenet 158 | ketä 159 | kenessä 160 | kenestä 161 | keneen 162 | kenellä 163 | keneltä 164 | kenelle 165 | kenenä 166 | keneksi 167 | ketkä 168 | keiden 169 | ketkä 170 | keitä 171 | keissä 172 | keistä 173 | keihin 174 | keillä 175 | keiltä 176 | keille 177 | keinä 178 | keiksi 179 | mikä 180 | minkä 181 | minkä 182 | mitä 183 | missä 184 | mistä 185 | mihin 186 | millä 187 | miltä 188 | mille 189 | minä 190 | miksi 191 | mitkä 192 | joka 193 | jonka 194 | jota 195 | jossa 196 | josta 197 | johon 198 | jolla 199 | jolta 200 | jolle 201 | jona 202 | joksi 203 | jotka 204 | joiden 205 | joita 206 | joissa 207 | joista 208 | joihin 209 | joilla 210 | joilta 211 | joille 212 | joina 213 | joiksi 214 | että 215 | ja 216 | jos 217 | koska 218 | kuin 219 | mutta 220 | niin 221 | sekä 222 | sillä 223 | tai 224 | vaan 225 | vai 226 | vaikka 227 | kanssa 228 | mukaan 229 | noin 230 | poikki 231 | yli 232 | kun 233 | niin 234 | nyt 235 | itse 236 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/french: -------------------------------------------------------------------------------- 1 | au 2 | aux 3 | avec 4 | ce 5 | ces 6 | dans 7 | de 8 | des 9 | du 10 | elle 11 | en 12 | et 13 | eux 14 | il 15 | ils 16 | je 17 | la 18 | le 19 | les 20 | leur 21 | lui 22 | ma 23 | mais 24 | me 25 | même 26 | mes 27 | moi 28 | mon 29 | ne 30 | nos 31 | notre 32 | nous 33 | on 34 | ou 35 | par 36 | pas 37 | pour 38 | qu 39 | que 40 | qui 41 | sa 42 | se 43 | ses 44 | son 45 | sur 46 | ta 47 | te 48 | tes 49 | toi 50 | ton 51 | tu 52 | un 53 | une 54 | vos 55 | votre 56 | vous 57 | c 58 | d 59 | j 60 | l 61 | à 62 | m 63 | n 64 | s 65 | t 66 | y 67 | été 68 | étée 69 | étées 70 | étés 71 | étant 72 | étante 73 | étants 74 | étantes 75 | suis 76 | es 77 | est 78 | sommes 79 | êtes 80 | sont 81 | serai 82 | seras 83 | sera 84 | serons 85 | serez 86 | seront 87 | serais 88 | serait 89 | serions 90 | seriez 91 | seraient 92 | étais 93 | était 94 | étions 95 | étiez 96 | étaient 97 | fus 98 | fut 99 | fûmes 100 | fûtes 101 | furent 102 | sois 103 | soit 104 | soyons 105 | soyez 106 | soient 107 | fusse 108 | fusses 109 | fût 110 | fussions 111 | fussiez 112 | fussent 113 | ayant 114 | ayante 115 | ayantes 116 | ayants 117 | eu 118 | eue 119 | eues 120 | eus 121 | ai 122 | as 123 | avons 124 | avez 125 | ont 126 | aurai 127 | auras 128 | aura 129 | aurons 130 | aurez 131 | auront 132 | aurais 133 | aurait 134 | aurions 135 | auriez 136 | auraient 137 | avais 138 | avait 139 | avions 140 | aviez 141 | avaient 142 | eut 143 | eûmes 144 | eûtes 145 | eurent 146 | aie 147 | aies 148 | ait 149 | ayons 150 | ayez 151 | aient 152 | eusse 153 | eusses 154 | eût 155 | eussions 156 | eussiez 157 | eussent 158 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/german: -------------------------------------------------------------------------------- 1 | aber 2 | alle 3 | allem 4 | allen 5 | aller 6 | alles 7 | als 8 | also 9 | am 10 | an 11 | ander 12 | andere 13 | anderem 14 | anderen 15 | anderer 16 | anderes 17 | anderm 18 | andern 19 | anderr 20 | anders 21 | auch 22 | auf 23 | aus 24 | bei 25 | bin 26 | bis 27 | bist 28 | da 29 | damit 30 | dann 31 | der 32 | den 33 | des 34 | dem 35 | die 36 | das 37 | dass 38 | daß 39 | derselbe 40 | derselben 41 | denselben 42 | desselben 43 | demselben 44 | dieselbe 45 | dieselben 46 | dasselbe 47 | dazu 48 | dein 49 | deine 50 | deinem 51 | deinen 52 | deiner 53 | deines 54 | denn 55 | derer 56 | dessen 57 | dich 58 | dir 59 | du 60 | dies 61 | diese 62 | diesem 63 | diesen 64 | dieser 65 | dieses 66 | doch 67 | dort 68 | durch 69 | ein 70 | eine 71 | einem 72 | einen 73 | einer 74 | eines 75 | einig 76 | einige 77 | einigem 78 | einigen 79 | einiger 80 | einiges 81 | einmal 82 | er 83 | ihn 84 | ihm 85 | es 86 | etwas 87 | euer 88 | eure 89 | eurem 90 | euren 91 | eurer 92 | eures 93 | für 94 | gegen 95 | gewesen 96 | hab 97 | habe 98 | haben 99 | hat 100 | hatte 101 | hatten 102 | hier 103 | hin 104 | hinter 105 | ich 106 | mich 107 | mir 108 | ihr 109 | ihre 110 | ihrem 111 | ihren 112 | ihrer 113 | ihres 114 | euch 115 | im 116 | in 117 | indem 118 | ins 119 | ist 120 | jede 121 | jedem 122 | jeden 123 | jeder 124 | jedes 125 | jene 126 | jenem 127 | jenen 128 | jener 129 | jenes 130 | jetzt 131 | kann 132 | kein 133 | keine 134 | keinem 135 | keinen 136 | keiner 137 | keines 138 | können 139 | könnte 140 | machen 141 | man 142 | manche 143 | manchem 144 | manchen 145 | mancher 146 | manches 147 | mein 148 | meine 149 | meinem 150 | meinen 151 | meiner 152 | meines 153 | mit 154 | muss 155 | musste 156 | nach 157 | nicht 158 | nichts 159 | noch 160 | nun 161 | nur 162 | ob 163 | oder 164 | ohne 165 | sehr 166 | sein 167 | seine 168 | seinem 169 | seinen 170 | seiner 171 | seines 172 | selbst 173 | sich 174 | sie 175 | ihnen 176 | sind 177 | so 178 | solche 179 | solchem 180 | solchen 181 | solcher 182 | solches 183 | soll 184 | sollte 185 | sondern 186 | sonst 187 | über 188 | um 189 | und 190 | uns 191 | unsere 192 | unserem 193 | unseren 194 | unser 195 | unseres 196 | unter 197 | viel 198 | vom 199 | von 200 | vor 201 | während 202 | war 203 | waren 204 | warst 205 | was 206 | weg 207 | weil 208 | weiter 209 | welche 210 | welchem 211 | welchen 212 | welcher 213 | welches 214 | wenn 215 | werde 216 | werden 217 | wie 218 | wieder 219 | will 220 | wir 221 | wird 222 | wirst 223 | wo 224 | wollen 225 | wollte 226 | würde 227 | würden 228 | zu 229 | zum 230 | zur 231 | zwar 232 | zwischen 233 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/greek: -------------------------------------------------------------------------------- 1 | αλλα 2 | αν 3 | αντι 4 | απο 5 | αυτα 6 | αυτεσ 7 | αυτη 8 | αυτο 9 | αυτοι 10 | αυτοσ 11 | αυτουσ 12 | αυτων 13 | αἱ 14 | αἳ 15 | αἵ 16 | αὐτόσ 17 | αὐτὸς 18 | αὖ 19 | γάρ 20 | γα 21 | γα^ 22 | γε 23 | για 24 | γοῦν 25 | γὰρ 26 | δ' 27 | δέ 28 | δή 29 | δαί 30 | δαίσ 31 | δαὶ 32 | δαὶς 33 | δε 34 | δεν 35 | δι' 36 | διά 37 | διὰ 38 | δὲ 39 | δὴ 40 | δ’ 41 | εαν 42 | ειμαι 43 | ειμαστε 44 | ειναι 45 | εισαι 46 | ειστε 47 | εκεινα 48 | εκεινεσ 49 | εκεινη 50 | εκεινο 51 | εκεινοι 52 | εκεινοσ 53 | εκεινουσ 54 | εκεινων 55 | ενω 56 | επ 57 | επι 58 | εἰ 59 | εἰμί 60 | εἰμὶ 61 | εἰς 62 | εἰσ 63 | εἴ 64 | εἴμι 65 | εἴτε 66 | η 67 | θα 68 | ισωσ 69 | κ 70 | καί 71 | καίτοι 72 | καθ 73 | και 74 | κατ 75 | κατά 76 | κατα 77 | κατὰ 78 | καὶ 79 | κι 80 | κἀν 81 | κἂν 82 | μέν 83 | μή 84 | μήτε 85 | μα 86 | με 87 | μεθ 88 | μετ 89 | μετά 90 | μετα 91 | μετὰ 92 | μη 93 | μην 94 | μἐν 95 | μὲν 96 | μὴ 97 | μὴν 98 | να 99 | ο 100 | οι 101 | ομωσ 102 | οπωσ 103 | οσο 104 | οτι 105 | οἱ 106 | οἳ 107 | οἷς 108 | οὐ 109 | οὐδ 110 | οὐδέ 111 | οὐδείσ 112 | οὐδεὶς 113 | οὐδὲ 114 | οὐδὲν 115 | οὐκ 116 | οὐχ 117 | οὐχὶ 118 | οὓς 119 | οὔτε 120 | οὕτω 121 | οὕτως 122 | οὕτωσ 123 | οὖν 124 | οὗ 125 | οὗτος 126 | οὗτοσ 127 | παρ 128 | παρά 129 | παρα 130 | παρὰ 131 | περί 132 | περὶ 133 | ποια 134 | ποιεσ 135 | ποιο 136 | ποιοι 137 | ποιοσ 138 | ποιουσ 139 | ποιων 140 | ποτε 141 | που 142 | ποῦ 143 | προ 144 | προσ 145 | πρόσ 146 | πρὸ 147 | πρὸς 148 | πως 149 | πωσ 150 | σε 151 | στη 152 | στην 153 | στο 154 | στον 155 | σόσ 156 | σύ 157 | σύν 158 | σὸς 159 | σὺ 160 | σὺν 161 | τά 162 | τήν 163 | τί 164 | τίς 165 | τίσ 166 | τα 167 | ταῖς 168 | τε 169 | την 170 | τησ 171 | τι 172 | τινα 173 | τις 174 | τισ 175 | το 176 | τοί 177 | τοι 178 | τοιοῦτος 179 | τοιοῦτοσ 180 | τον 181 | τοτε 182 | του 183 | τούσ 184 | τοὺς 185 | τοῖς 186 | τοῦ 187 | των 188 | τό 189 | τόν 190 | τότε 191 | τὰ 192 | τὰς 193 | τὴν 194 | τὸ 195 | τὸν 196 | τῆς 197 | τῆσ 198 | τῇ 199 | τῶν 200 | τῷ 201 | ωσ 202 | ἀλλ' 203 | ἀλλά 204 | ἀλλὰ 205 | ἀλλ’ 206 | ἀπ 207 | ἀπό 208 | ἀπὸ 209 | ἀφ 210 | ἂν 211 | ἃ 212 | ἄλλος 213 | ἄλλοσ 214 | ἄν 215 | ἄρα 216 | ἅμα 217 | ἐάν 218 | ἐγώ 219 | ἐγὼ 220 | ἐκ 221 | ἐμόσ 222 | ἐμὸς 223 | ἐν 224 | ἐξ 225 | ἐπί 226 | ἐπεὶ 227 | ἐπὶ 228 | ἐστι 229 | ἐφ 230 | ἐὰν 231 | ἑαυτοῦ 232 | ἔτι 233 | ἡ 234 | ἢ 235 | ἣ 236 | ἤ 237 | ἥ 238 | ἧς 239 | ἵνα 240 | ὁ 241 | ὃ 242 | ὃν 243 | ὃς 244 | ὅ 245 | ὅδε 246 | ὅθεν 247 | ὅπερ 248 | ὅς 249 | ὅσ 250 | ὅστις 251 | ὅστισ 252 | ὅτε 253 | ὅτι 254 | ὑμόσ 255 | ὑπ 256 | ὑπέρ 257 | ὑπό 258 | ὑπὲρ 259 | ὑπὸ 260 | ὡς 261 | ὡσ 262 | ὥς 263 | ὥστε 264 | ὦ 265 | ᾧ 266 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/hebrew: -------------------------------------------------------------------------------- 1 | אני 2 | את 3 | אתה 4 | אנחנו 5 | אתן 6 | אתם 7 | הם 8 | הן 9 | היא 10 | הוא 11 | שלי 12 | שלו 13 | שלך 14 | שלה 15 | שלנו 16 | שלכם 17 | שלכן 18 | שלהם 19 | שלהן 20 | לי 21 | לו 22 | לה 23 | לנו 24 | לכם 25 | לכן 26 | להם 27 | להן 28 | אותה 29 | אותו 30 | זה 31 | זאת 32 | אלה 33 | אלו 34 | תחת 35 | מתחת 36 | מעל 37 | בין 38 | עם 39 | עד 40 | נגר 41 | על 42 | אל 43 | מול 44 | של 45 | אצל 46 | כמו 47 | אחר 48 | אותו 49 | בלי 50 | לפני 51 | אחרי 52 | מאחורי 53 | עלי 54 | עליו 55 | עליה 56 | עליך 57 | עלינו 58 | עליכם 59 | לעיכן 60 | עליהם 61 | עליהן 62 | כל 63 | כולם 64 | כולן 65 | כך 66 | ככה 67 | כזה 68 | זה 69 | זות 70 | אותי 71 | אותה 72 | אותם 73 | אותך 74 | אותו 75 | אותן 76 | אותנו 77 | ואת 78 | את 79 | אתכם 80 | אתכן 81 | איתי 82 | איתו 83 | איתך 84 | איתה 85 | איתם 86 | איתן 87 | איתנו 88 | איתכם 89 | איתכן 90 | יהיה 91 | תהיה 92 | היתי 93 | היתה 94 | היה 95 | להיות 96 | עצמי 97 | עצמו 98 | עצמה 99 | עצמם 100 | עצמן 101 | עצמנו 102 | עצמהם 103 | עצמהן 104 | מי 105 | מה 106 | איפה 107 | היכן 108 | במקום שבו 109 | אם 110 | לאן 111 | למקום שבו 112 | מקום בו 113 | איזה 114 | מהיכן 115 | איך 116 | כיצד 117 | באיזו מידה 118 | מתי 119 | בשעה ש 120 | כאשר 121 | כש 122 | למרות 123 | לפני 124 | אחרי 125 | מאיזו סיבה 126 | הסיבה שבגללה 127 | למה 128 | מדוע 129 | לאיזו תכלית 130 | כי 131 | יש 132 | אין 133 | אך 134 | מנין 135 | מאין 136 | מאיפה 137 | יכל 138 | יכלה 139 | יכלו 140 | יכול 141 | יכולה 142 | יכולים 143 | יכולות 144 | יוכלו 145 | יוכל 146 | מסוגל 147 | לא 148 | רק 149 | אולי 150 | אין 151 | לאו 152 | אי 153 | כלל 154 | נגד 155 | אם 156 | עם 157 | אל 158 | אלה 159 | אלו 160 | אף 161 | על 162 | מעל 163 | מתחת 164 | מצד 165 | בשביל 166 | לבין 167 | באמצע 168 | בתוך 169 | דרך 170 | מבעד 171 | באמצעות 172 | למעלה 173 | למטה 174 | מחוץ 175 | מן 176 | לעבר 177 | מכאן 178 | כאן 179 | הנה 180 | הרי 181 | פה 182 | שם 183 | אך 184 | ברם 185 | שוב 186 | אבל 187 | מבלי 188 | בלי 189 | מלבד 190 | רק 191 | בגלל 192 | מכיוון 193 | עד 194 | אשר 195 | ואילו 196 | למרות 197 | אס 198 | כמו 199 | כפי 200 | אז 201 | אחרי 202 | כן 203 | לכן 204 | לפיכך 205 | מאד 206 | עז 207 | מעט 208 | מעטים 209 | במידה 210 | שוב 211 | יותר 212 | מדי 213 | גם 214 | כן 215 | נו 216 | אחר 217 | אחרת 218 | אחרים 219 | אחרות 220 | אשר 221 | או -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/hinglish: -------------------------------------------------------------------------------- 1 | a 2 | aadi 3 | aaj 4 | aap 5 | aapne 6 | aata 7 | aati 8 | aaya 9 | aaye 10 | ab 11 | abbe 12 | abbey 13 | abe 14 | abhi 15 | able 16 | about 17 | above 18 | accha 19 | according 20 | accordingly 21 | acha 22 | achcha 23 | across 24 | actually 25 | after 26 | afterwards 27 | again 28 | against 29 | agar 30 | ain 31 | aint 32 | ain't 33 | aisa 34 | aise 35 | aisi 36 | alag 37 | all 38 | allow 39 | allows 40 | almost 41 | alone 42 | along 43 | already 44 | also 45 | although 46 | always 47 | am 48 | among 49 | amongst 50 | an 51 | and 52 | andar 53 | another 54 | any 55 | anybody 56 | anyhow 57 | anyone 58 | anything 59 | anyway 60 | anyways 61 | anywhere 62 | ap 63 | apan 64 | apart 65 | apna 66 | apnaa 67 | apne 68 | apni 69 | appear 70 | are 71 | aren 72 | arent 73 | aren't 74 | around 75 | arre 76 | as 77 | aside 78 | ask 79 | asking 80 | at 81 | aur 82 | avum 83 | aya 84 | aye 85 | baad 86 | baar 87 | bad 88 | bahut 89 | bana 90 | banae 91 | banai 92 | banao 93 | banaya 94 | banaye 95 | banayi 96 | banda 97 | bande 98 | bandi 99 | bane 100 | bani 101 | bas 102 | bata 103 | batao 104 | bc 105 | be 106 | became 107 | because 108 | become 109 | becomes 110 | becoming 111 | been 112 | before 113 | beforehand 114 | behind 115 | being 116 | below 117 | beside 118 | besides 119 | best 120 | better 121 | between 122 | beyond 123 | bhai 124 | bheetar 125 | bhi 126 | bhitar 127 | bht 128 | bilkul 129 | bohot 130 | bol 131 | bola 132 | bole 133 | boli 134 | bolo 135 | bolta 136 | bolte 137 | bolti 138 | both 139 | brief 140 | bro 141 | btw 142 | but 143 | by 144 | came 145 | can 146 | cannot 147 | cant 148 | can't 149 | cause 150 | causes 151 | certain 152 | certainly 153 | chahiye 154 | chaiye 155 | chal 156 | chalega 157 | chhaiye 158 | clearly 159 | c'mon 160 | com 161 | come 162 | comes 163 | could 164 | couldn 165 | couldnt 166 | couldn't 167 | d 168 | de 169 | dede 170 | dega 171 | degi 172 | dekh 173 | dekha 174 | dekhe 175 | dekhi 176 | dekho 177 | denge 178 | dhang 179 | di 180 | did 181 | didn 182 | didnt 183 | didn't 184 | dijiye 185 | diya 186 | diyaa 187 | diye 188 | diyo 189 | do 190 | does 191 | doesn 192 | doesnt 193 | doesn't 194 | doing 195 | done 196 | dono 197 | dont 198 | don't 199 | doosra 200 | doosre 201 | down 202 | downwards 203 | dude 204 | dunga 205 | dungi 206 | during 207 | dusra 208 | dusre 209 | dusri 210 | dvaara 211 | dvara 212 | dwaara 213 | dwara 214 | each 215 | edu 216 | eg 217 | eight 218 | either 219 | ek 220 | else 221 | elsewhere 222 | enough 223 | etc 224 | even 225 | ever 226 | every 227 | everybody 228 | everyone 229 | everything 230 | everywhere 231 | ex 232 | exactly 233 | example 234 | except 235 | far 236 | few 237 | fifth 238 | fir 239 | first 240 | five 241 | followed 242 | following 243 | follows 244 | for 245 | forth 246 | four 247 | from 248 | further 249 | furthermore 250 | gaya 251 | gaye 252 | gayi 253 | get 254 | gets 255 | getting 256 | ghar 257 | given 258 | gives 259 | go 260 | goes 261 | going 262 | gone 263 | good 264 | got 265 | gotten 266 | greetings 267 | haan 268 | had 269 | hadd 270 | hadn 271 | hadnt 272 | hadn't 273 | hai 274 | hain 275 | hamara 276 | hamare 277 | hamari 278 | hamne 279 | han 280 | happens 281 | har 282 | hardly 283 | has 284 | hasn 285 | hasnt 286 | hasn't 287 | have 288 | haven 289 | havent 290 | haven't 291 | having 292 | he 293 | hello 294 | help 295 | hence 296 | her 297 | here 298 | hereafter 299 | hereby 300 | herein 301 | here's 302 | hereupon 303 | hers 304 | herself 305 | he's 306 | hi 307 | him 308 | himself 309 | his 310 | hither 311 | hm 312 | hmm 313 | ho 314 | hoga 315 | hoge 316 | hogi 317 | hona 318 | honaa 319 | hone 320 | honge 321 | hongi 322 | honi 323 | hopefully 324 | hota 325 | hotaa 326 | hote 327 | hoti 328 | how 329 | howbeit 330 | however 331 | hoyenge 332 | hoyengi 333 | hu 334 | hua 335 | hue 336 | huh 337 | hui 338 | hum 339 | humein 340 | humne 341 | hun 342 | huye 343 | huyi 344 | i 345 | i'd 346 | idk 347 | ie 348 | if 349 | i'll 350 | i'm 351 | imo 352 | in 353 | inasmuch 354 | inc 355 | inhe 356 | inhi 357 | inho 358 | inka 359 | inkaa 360 | inke 361 | inki 362 | inn 363 | inner 364 | inse 365 | insofar 366 | into 367 | inward 368 | is 369 | ise 370 | isi 371 | iska 372 | iskaa 373 | iske 374 | iski 375 | isme 376 | isn 377 | isne 378 | isnt 379 | isn't 380 | iss 381 | isse 382 | issi 383 | isski 384 | it 385 | it'd 386 | it'll 387 | itna 388 | itne 389 | itni 390 | itno 391 | its 392 | it's 393 | itself 394 | ityaadi 395 | ityadi 396 | i've 397 | ja 398 | jaa 399 | jab 400 | jabh 401 | jaha 402 | jahaan 403 | jahan 404 | jaisa 405 | jaise 406 | jaisi 407 | jata 408 | jayega 409 | jidhar 410 | jin 411 | jinhe 412 | jinhi 413 | jinho 414 | jinhone 415 | jinka 416 | jinke 417 | jinki 418 | jinn 419 | jis 420 | jise 421 | jiska 422 | jiske 423 | jiski 424 | jisme 425 | jiss 426 | jisse 427 | jitna 428 | jitne 429 | jitni 430 | jo 431 | just 432 | jyaada 433 | jyada 434 | k 435 | ka 436 | kaafi 437 | kab 438 | kabhi 439 | kafi 440 | kaha 441 | kahaa 442 | kahaan 443 | kahan 444 | kahi 445 | kahin 446 | kahte 447 | kaisa 448 | kaise 449 | kaisi 450 | kal 451 | kam 452 | kar 453 | kara 454 | kare 455 | karega 456 | karegi 457 | karen 458 | karenge 459 | kari 460 | karke 461 | karna 462 | karne 463 | karni 464 | karo 465 | karta 466 | karte 467 | karti 468 | karu 469 | karun 470 | karunga 471 | karungi 472 | kaun 473 | kaunsa 474 | kayi 475 | kch 476 | ke 477 | keep 478 | keeps 479 | keh 480 | kehte 481 | kept 482 | khud 483 | ki 484 | kin 485 | kine 486 | kinhe 487 | kinho 488 | kinka 489 | kinke 490 | kinki 491 | kinko 492 | kinn 493 | kino 494 | kis 495 | kise 496 | kisi 497 | kiska 498 | kiske 499 | kiski 500 | kisko 501 | kisliye 502 | kisne 503 | kitna 504 | kitne 505 | kitni 506 | kitno 507 | kiya 508 | kiye 509 | know 510 | known 511 | knows 512 | ko 513 | koi 514 | kon 515 | konsa 516 | koyi 517 | krna 518 | krne 519 | kuch 520 | kuchch 521 | kuchh 522 | kul 523 | kull 524 | kya 525 | kyaa 526 | kyu 527 | kyuki 528 | kyun 529 | kyunki 530 | lagta 531 | lagte 532 | lagti 533 | last 534 | lately 535 | later 536 | le 537 | least 538 | lekar 539 | lekin 540 | less 541 | lest 542 | let 543 | let's 544 | li 545 | like 546 | liked 547 | likely 548 | little 549 | liya 550 | liye 551 | ll 552 | lo 553 | log 554 | logon 555 | lol 556 | look 557 | looking 558 | looks 559 | ltd 560 | lunga 561 | m 562 | maan 563 | maana 564 | maane 565 | maani 566 | maano 567 | magar 568 | mai 569 | main 570 | maine 571 | mainly 572 | mana 573 | mane 574 | mani 575 | mano 576 | many 577 | mat 578 | may 579 | maybe 580 | me 581 | mean 582 | meanwhile 583 | mein 584 | mera 585 | mere 586 | merely 587 | meri 588 | might 589 | mightn 590 | mightnt 591 | mightn't 592 | mil 593 | mjhe 594 | more 595 | moreover 596 | most 597 | mostly 598 | much 599 | mujhe 600 | must 601 | mustn 602 | mustnt 603 | mustn't 604 | my 605 | myself 606 | na 607 | naa 608 | naah 609 | nahi 610 | nahin 611 | nai 612 | name 613 | namely 614 | nd 615 | ne 616 | near 617 | nearly 618 | necessary 619 | neeche 620 | need 621 | needn 622 | neednt 623 | needn't 624 | needs 625 | neither 626 | never 627 | nevertheless 628 | new 629 | next 630 | nhi 631 | nine 632 | no 633 | nobody 634 | non 635 | none 636 | noone 637 | nope 638 | nor 639 | normally 640 | not 641 | nothing 642 | novel 643 | now 644 | nowhere 645 | o 646 | obviously 647 | of 648 | off 649 | often 650 | oh 651 | ok 652 | okay 653 | old 654 | on 655 | once 656 | one 657 | ones 658 | only 659 | onto 660 | or 661 | other 662 | others 663 | otherwise 664 | ought 665 | our 666 | ours 667 | ourselves 668 | out 669 | outside 670 | over 671 | overall 672 | own 673 | par 674 | pata 675 | pe 676 | pehla 677 | pehle 678 | pehli 679 | people 680 | per 681 | perhaps 682 | phla 683 | phle 684 | phli 685 | placed 686 | please 687 | plus 688 | poora 689 | poori 690 | provides 691 | pura 692 | puri 693 | q 694 | que 695 | quite 696 | raha 697 | rahaa 698 | rahe 699 | rahi 700 | rakh 701 | rakha 702 | rakhe 703 | rakhen 704 | rakhi 705 | rakho 706 | rather 707 | re 708 | really 709 | reasonably 710 | regarding 711 | regardless 712 | regards 713 | rehte 714 | rha 715 | rhaa 716 | rhe 717 | rhi 718 | ri 719 | right 720 | s 721 | sa 722 | saara 723 | saare 724 | saath 725 | sab 726 | sabhi 727 | sabse 728 | sahi 729 | said 730 | sakta 731 | saktaa 732 | sakte 733 | sakti 734 | same 735 | sang 736 | sara 737 | sath 738 | saw 739 | say 740 | saying 741 | says 742 | se 743 | second 744 | secondly 745 | see 746 | seeing 747 | seem 748 | seemed 749 | seeming 750 | seems 751 | seen 752 | self 753 | selves 754 | sensible 755 | sent 756 | serious 757 | seriously 758 | seven 759 | several 760 | shall 761 | shan 762 | shant 763 | shan't 764 | she 765 | she's 766 | should 767 | shouldn 768 | shouldnt 769 | shouldn't 770 | should've 771 | si 772 | since 773 | six 774 | so 775 | soch 776 | some 777 | somebody 778 | somehow 779 | someone 780 | something 781 | sometime 782 | sometimes 783 | somewhat 784 | somewhere 785 | soon 786 | still 787 | sub 788 | such 789 | sup 790 | sure 791 | t 792 | tab 793 | tabh 794 | tak 795 | take 796 | taken 797 | tarah 798 | teen 799 | teeno 800 | teesra 801 | teesre 802 | teesri 803 | tell 804 | tends 805 | tera 806 | tere 807 | teri 808 | th 809 | tha 810 | than 811 | thank 812 | thanks 813 | thanx 814 | that 815 | that'll 816 | thats 817 | that's 818 | the 819 | theek 820 | their 821 | theirs 822 | them 823 | themselves 824 | then 825 | thence 826 | there 827 | thereafter 828 | thereby 829 | therefore 830 | therein 831 | theres 832 | there's 833 | thereupon 834 | these 835 | they 836 | they'd 837 | they'll 838 | they're 839 | they've 840 | thi 841 | thik 842 | thing 843 | think 844 | thinking 845 | third 846 | this 847 | tho 848 | thoda 849 | thodi 850 | thorough 851 | thoroughly 852 | those 853 | though 854 | thought 855 | three 856 | through 857 | throughout 858 | thru 859 | thus 860 | tjhe 861 | to 862 | together 863 | toh 864 | too 865 | took 866 | toward 867 | towards 868 | tried 869 | tries 870 | true 871 | truly 872 | try 873 | trying 874 | tu 875 | tujhe 876 | tum 877 | tumhara 878 | tumhare 879 | tumhari 880 | tune 881 | twice 882 | two 883 | um 884 | umm 885 | un 886 | under 887 | unhe 888 | unhi 889 | unho 890 | unhone 891 | unka 892 | unkaa 893 | unke 894 | unki 895 | unko 896 | unless 897 | unlikely 898 | unn 899 | unse 900 | until 901 | unto 902 | up 903 | upar 904 | upon 905 | us 906 | use 907 | used 908 | useful 909 | uses 910 | usi 911 | using 912 | uska 913 | uske 914 | usne 915 | uss 916 | usse 917 | ussi 918 | usually 919 | vaala 920 | vaale 921 | vaali 922 | vahaan 923 | vahan 924 | vahi 925 | vahin 926 | vaisa 927 | vaise 928 | vaisi 929 | vala 930 | vale 931 | vali 932 | various 933 | ve 934 | very 935 | via 936 | viz 937 | vo 938 | waala 939 | waale 940 | waali 941 | wagaira 942 | wagairah 943 | wagerah 944 | waha 945 | wahaan 946 | wahan 947 | wahi 948 | wahin 949 | waisa 950 | waise 951 | waisi 952 | wala 953 | wale 954 | wali 955 | want 956 | wants 957 | was 958 | wasn 959 | wasnt 960 | wasn't 961 | way 962 | we 963 | we'd 964 | well 965 | we'll 966 | went 967 | were 968 | we're 969 | weren 970 | werent 971 | weren't 972 | we've 973 | what 974 | whatever 975 | what's 976 | when 977 | whence 978 | whenever 979 | where 980 | whereafter 981 | whereas 982 | whereby 983 | wherein 984 | where's 985 | whereupon 986 | wherever 987 | whether 988 | which 989 | while 990 | who 991 | whoever 992 | whole 993 | whom 994 | who's 995 | whose 996 | why 997 | will 998 | willing 999 | with 1000 | within 1001 | without 1002 | wo 1003 | woh 1004 | wohi 1005 | won 1006 | wont 1007 | won't 1008 | would 1009 | wouldn 1010 | wouldnt 1011 | wouldn't 1012 | y 1013 | ya 1014 | yadi 1015 | yah 1016 | yaha 1017 | yahaan 1018 | yahan 1019 | yahi 1020 | yahin 1021 | ye 1022 | yeah 1023 | yeh 1024 | yehi 1025 | yes 1026 | yet 1027 | you 1028 | you'd 1029 | you'll 1030 | your 1031 | you're 1032 | yours 1033 | yourself 1034 | yourselves 1035 | you've 1036 | yup 1037 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/hungarian: -------------------------------------------------------------------------------- 1 | a 2 | ahogy 3 | ahol 4 | aki 5 | akik 6 | akkor 7 | alatt 8 | által 9 | általában 10 | amely 11 | amelyek 12 | amelyekben 13 | amelyeket 14 | amelyet 15 | amelynek 16 | ami 17 | amit 18 | amolyan 19 | amíg 20 | amikor 21 | át 22 | abban 23 | ahhoz 24 | annak 25 | arra 26 | arról 27 | az 28 | azok 29 | azon 30 | azt 31 | azzal 32 | azért 33 | aztán 34 | azután 35 | azonban 36 | bár 37 | be 38 | belül 39 | benne 40 | cikk 41 | cikkek 42 | cikkeket 43 | csak 44 | de 45 | e 46 | eddig 47 | egész 48 | egy 49 | egyes 50 | egyetlen 51 | egyéb 52 | egyik 53 | egyre 54 | ekkor 55 | el 56 | elég 57 | ellen 58 | elõ 59 | elõször 60 | elõtt 61 | elsõ 62 | én 63 | éppen 64 | ebben 65 | ehhez 66 | emilyen 67 | ennek 68 | erre 69 | ez 70 | ezt 71 | ezek 72 | ezen 73 | ezzel 74 | ezért 75 | és 76 | fel 77 | felé 78 | hanem 79 | hiszen 80 | hogy 81 | hogyan 82 | igen 83 | így 84 | illetve 85 | ill. 86 | ill 87 | ilyen 88 | ilyenkor 89 | ison 90 | ismét 91 | itt 92 | jó 93 | jól 94 | jobban 95 | kell 96 | kellett 97 | keresztül 98 | keressünk 99 | ki 100 | kívül 101 | között 102 | közül 103 | legalább 104 | lehet 105 | lehetett 106 | legyen 107 | lenne 108 | lenni 109 | lesz 110 | lett 111 | maga 112 | magát 113 | majd 114 | majd 115 | már 116 | más 117 | másik 118 | meg 119 | még 120 | mellett 121 | mert 122 | mely 123 | melyek 124 | mi 125 | mit 126 | míg 127 | miért 128 | milyen 129 | mikor 130 | minden 131 | mindent 132 | mindenki 133 | mindig 134 | mint 135 | mintha 136 | mivel 137 | most 138 | nagy 139 | nagyobb 140 | nagyon 141 | ne 142 | néha 143 | nekem 144 | neki 145 | nem 146 | néhány 147 | nélkül 148 | nincs 149 | olyan 150 | ott 151 | össze 152 | õ 153 | õk 154 | õket 155 | pedig 156 | persze 157 | rá 158 | s 159 | saját 160 | sem 161 | semmi 162 | sok 163 | sokat 164 | sokkal 165 | számára 166 | szemben 167 | szerint 168 | szinte 169 | talán 170 | tehát 171 | teljes 172 | tovább 173 | továbbá 174 | több 175 | úgy 176 | ugyanis 177 | új 178 | újabb 179 | újra 180 | után 181 | utána 182 | utolsó 183 | vagy 184 | vagyis 185 | valaki 186 | valami 187 | valamint 188 | való 189 | vagyok 190 | van 191 | vannak 192 | volt 193 | voltam 194 | voltak 195 | voltunk 196 | vissza 197 | vele 198 | viszont 199 | volna 200 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/indonesian: -------------------------------------------------------------------------------- 1 | ada 2 | adalah 3 | adanya 4 | adapun 5 | agak 6 | agaknya 7 | agar 8 | akan 9 | akankah 10 | akhir 11 | akhiri 12 | akhirnya 13 | aku 14 | akulah 15 | amat 16 | amatlah 17 | anda 18 | andalah 19 | antar 20 | antara 21 | antaranya 22 | apa 23 | apaan 24 | apabila 25 | apakah 26 | apalagi 27 | apatah 28 | artinya 29 | asal 30 | asalkan 31 | atas 32 | atau 33 | ataukah 34 | ataupun 35 | awal 36 | awalnya 37 | bagai 38 | bagaikan 39 | bagaimana 40 | bagaimanakah 41 | bagaimanapun 42 | bagi 43 | bagian 44 | bahkan 45 | bahwa 46 | bahwasanya 47 | baik 48 | bakal 49 | bakalan 50 | balik 51 | banyak 52 | bapak 53 | baru 54 | bawah 55 | beberapa 56 | begini 57 | beginian 58 | beginikah 59 | beginilah 60 | begitu 61 | begitukah 62 | begitulah 63 | begitupun 64 | bekerja 65 | belakang 66 | belakangan 67 | belum 68 | belumlah 69 | benar 70 | benarkah 71 | benarlah 72 | berada 73 | berakhir 74 | berakhirlah 75 | berakhirnya 76 | berapa 77 | berapakah 78 | berapalah 79 | berapapun 80 | berarti 81 | berawal 82 | berbagai 83 | berdatangan 84 | beri 85 | berikan 86 | berikut 87 | berikutnya 88 | berjumlah 89 | berkali-kali 90 | berkata 91 | berkehendak 92 | berkeinginan 93 | berkenaan 94 | berlainan 95 | berlalu 96 | berlangsung 97 | berlebihan 98 | bermacam 99 | bermacam-macam 100 | bermaksud 101 | bermula 102 | bersama 103 | bersama-sama 104 | bersiap 105 | bersiap-siap 106 | bertanya 107 | bertanya-tanya 108 | berturut 109 | berturut-turut 110 | bertutur 111 | berujar 112 | berupa 113 | besar 114 | betul 115 | betulkah 116 | biasa 117 | biasanya 118 | bila 119 | bilakah 120 | bisa 121 | bisakah 122 | boleh 123 | bolehkah 124 | bolehlah 125 | buat 126 | bukan 127 | bukankah 128 | bukanlah 129 | bukannya 130 | bulan 131 | bung 132 | cara 133 | caranya 134 | cukup 135 | cukupkah 136 | cukuplah 137 | cuma 138 | dahulu 139 | dalam 140 | dan 141 | dapat 142 | dari 143 | daripada 144 | datang 145 | dekat 146 | demi 147 | demikian 148 | demikianlah 149 | dengan 150 | depan 151 | di 152 | dia 153 | diakhiri 154 | diakhirinya 155 | dialah 156 | diantara 157 | diantaranya 158 | diberi 159 | diberikan 160 | diberikannya 161 | dibuat 162 | dibuatnya 163 | didapat 164 | didatangkan 165 | digunakan 166 | diibaratkan 167 | diibaratkannya 168 | diingat 169 | diingatkan 170 | diinginkan 171 | dijawab 172 | dijelaskan 173 | dijelaskannya 174 | dikarenakan 175 | dikatakan 176 | dikatakannya 177 | dikerjakan 178 | diketahui 179 | diketahuinya 180 | dikira 181 | dilakukan 182 | dilalui 183 | dilihat 184 | dimaksud 185 | dimaksudkan 186 | dimaksudkannya 187 | dimaksudnya 188 | diminta 189 | dimintai 190 | dimisalkan 191 | dimulai 192 | dimulailah 193 | dimulainya 194 | dimungkinkan 195 | dini 196 | dipastikan 197 | diperbuat 198 | diperbuatnya 199 | dipergunakan 200 | diperkirakan 201 | diperlihatkan 202 | diperlukan 203 | diperlukannya 204 | dipersoalkan 205 | dipertanyakan 206 | dipunyai 207 | diri 208 | dirinya 209 | disampaikan 210 | disebut 211 | disebutkan 212 | disebutkannya 213 | disini 214 | disinilah 215 | ditambahkan 216 | ditandaskan 217 | ditanya 218 | ditanyai 219 | ditanyakan 220 | ditegaskan 221 | ditujukan 222 | ditunjuk 223 | ditunjuki 224 | ditunjukkan 225 | ditunjukkannya 226 | ditunjuknya 227 | dituturkan 228 | dituturkannya 229 | diucapkan 230 | diucapkannya 231 | diungkapkan 232 | dong 233 | dua 234 | dulu 235 | empat 236 | enggak 237 | enggaknya 238 | entah 239 | entahlah 240 | guna 241 | gunakan 242 | hal 243 | hampir 244 | hanya 245 | hanyalah 246 | hari 247 | harus 248 | haruslah 249 | harusnya 250 | hendak 251 | hendaklah 252 | hendaknya 253 | hingga 254 | ia 255 | ialah 256 | ibarat 257 | ibaratkan 258 | ibaratnya 259 | ibu 260 | ikut 261 | ingat 262 | ingat-ingat 263 | ingin 264 | inginkah 265 | inginkan 266 | ini 267 | inikah 268 | inilah 269 | itu 270 | itukah 271 | itulah 272 | jadi 273 | jadilah 274 | jadinya 275 | jangan 276 | jangankan 277 | janganlah 278 | jauh 279 | jawab 280 | jawaban 281 | jawabnya 282 | jelas 283 | jelaskan 284 | jelaslah 285 | jelasnya 286 | jika 287 | jikalau 288 | juga 289 | jumlah 290 | jumlahnya 291 | justru 292 | kala 293 | kalau 294 | kalaulah 295 | kalaupun 296 | kalian 297 | kami 298 | kamilah 299 | kamu 300 | kamulah 301 | kan 302 | kapan 303 | kapankah 304 | kapanpun 305 | karena 306 | karenanya 307 | kasus 308 | kata 309 | katakan 310 | katakanlah 311 | katanya 312 | ke 313 | keadaan 314 | kebetulan 315 | kecil 316 | kedua 317 | keduanya 318 | keinginan 319 | kelamaan 320 | kelihatan 321 | kelihatannya 322 | kelima 323 | keluar 324 | kembali 325 | kemudian 326 | kemungkinan 327 | kemungkinannya 328 | kenapa 329 | kepada 330 | kepadanya 331 | kesampaian 332 | keseluruhan 333 | keseluruhannya 334 | keterlaluan 335 | ketika 336 | khususnya 337 | kini 338 | kinilah 339 | kira 340 | kira-kira 341 | kiranya 342 | kita 343 | kitalah 344 | kok 345 | kurang 346 | lagi 347 | lagian 348 | lah 349 | lain 350 | lainnya 351 | lalu 352 | lama 353 | lamanya 354 | lanjut 355 | lanjutnya 356 | lebih 357 | lewat 358 | lima 359 | luar 360 | macam 361 | maka 362 | makanya 363 | makin 364 | malah 365 | malahan 366 | mampu 367 | mampukah 368 | mana 369 | manakala 370 | manalagi 371 | masa 372 | masalah 373 | masalahnya 374 | masih 375 | masihkah 376 | masing 377 | masing-masing 378 | mau 379 | maupun 380 | melainkan 381 | melakukan 382 | melalui 383 | melihat 384 | melihatnya 385 | memang 386 | memastikan 387 | memberi 388 | memberikan 389 | membuat 390 | memerlukan 391 | memihak 392 | meminta 393 | memintakan 394 | memisalkan 395 | memperbuat 396 | mempergunakan 397 | memperkirakan 398 | memperlihatkan 399 | mempersiapkan 400 | mempersoalkan 401 | mempertanyakan 402 | mempunyai 403 | memulai 404 | memungkinkan 405 | menaiki 406 | menambahkan 407 | menandaskan 408 | menanti 409 | menanti-nanti 410 | menantikan 411 | menanya 412 | menanyai 413 | menanyakan 414 | mendapat 415 | mendapatkan 416 | mendatang 417 | mendatangi 418 | mendatangkan 419 | menegaskan 420 | mengakhiri 421 | mengapa 422 | mengatakan 423 | mengatakannya 424 | mengenai 425 | mengerjakan 426 | mengetahui 427 | menggunakan 428 | menghendaki 429 | mengibaratkan 430 | mengibaratkannya 431 | mengingat 432 | mengingatkan 433 | menginginkan 434 | mengira 435 | mengucapkan 436 | mengucapkannya 437 | mengungkapkan 438 | menjadi 439 | menjawab 440 | menjelaskan 441 | menuju 442 | menunjuk 443 | menunjuki 444 | menunjukkan 445 | menunjuknya 446 | menurut 447 | menuturkan 448 | menyampaikan 449 | menyangkut 450 | menyatakan 451 | menyebutkan 452 | menyeluruh 453 | menyiapkan 454 | merasa 455 | mereka 456 | merekalah 457 | merupakan 458 | meski 459 | meskipun 460 | meyakini 461 | meyakinkan 462 | minta 463 | mirip 464 | misal 465 | misalkan 466 | misalnya 467 | mula 468 | mulai 469 | mulailah 470 | mulanya 471 | mungkin 472 | mungkinkah 473 | nah 474 | naik 475 | namun 476 | nanti 477 | nantinya 478 | nyaris 479 | nyatanya 480 | oleh 481 | olehnya 482 | pada 483 | padahal 484 | padanya 485 | pak 486 | paling 487 | panjang 488 | pantas 489 | para 490 | pasti 491 | pastilah 492 | penting 493 | pentingnya 494 | per 495 | percuma 496 | perlu 497 | perlukah 498 | perlunya 499 | pernah 500 | persoalan 501 | pertama 502 | pertama-tama 503 | pertanyaan 504 | pertanyakan 505 | pihak 506 | pihaknya 507 | pukul 508 | pula 509 | pun 510 | punya 511 | rasa 512 | rasanya 513 | rata 514 | rupanya 515 | saat 516 | saatnya 517 | saja 518 | sajalah 519 | saling 520 | sama 521 | sama-sama 522 | sambil 523 | sampai 524 | sampai-sampai 525 | sampaikan 526 | sana 527 | sangat 528 | sangatlah 529 | satu 530 | saya 531 | sayalah 532 | se 533 | sebab 534 | sebabnya 535 | sebagai 536 | sebagaimana 537 | sebagainya 538 | sebagian 539 | sebaik 540 | sebaik-baiknya 541 | sebaiknya 542 | sebaliknya 543 | sebanyak 544 | sebegini 545 | sebegitu 546 | sebelum 547 | sebelumnya 548 | sebenarnya 549 | seberapa 550 | sebesar 551 | sebetulnya 552 | sebisanya 553 | sebuah 554 | sebut 555 | sebutlah 556 | sebutnya 557 | secara 558 | secukupnya 559 | sedang 560 | sedangkan 561 | sedemikian 562 | sedikit 563 | sedikitnya 564 | seenaknya 565 | segala 566 | segalanya 567 | segera 568 | seharusnya 569 | sehingga 570 | seingat 571 | sejak 572 | sejauh 573 | sejenak 574 | sejumlah 575 | sekadar 576 | sekadarnya 577 | sekali 578 | sekali-kali 579 | sekalian 580 | sekaligus 581 | sekalipun 582 | sekarang 583 | sekarang 584 | sekecil 585 | seketika 586 | sekiranya 587 | sekitar 588 | sekitarnya 589 | sekurang-kurangnya 590 | sekurangnya 591 | sela 592 | selain 593 | selaku 594 | selalu 595 | selama 596 | selama-lamanya 597 | selamanya 598 | selanjutnya 599 | seluruh 600 | seluruhnya 601 | semacam 602 | semakin 603 | semampu 604 | semampunya 605 | semasa 606 | semasih 607 | semata 608 | semata-mata 609 | semaunya 610 | sementara 611 | semisal 612 | semisalnya 613 | sempat 614 | semua 615 | semuanya 616 | semula 617 | sendiri 618 | sendirian 619 | sendirinya 620 | seolah 621 | seolah-olah 622 | seorang 623 | sepanjang 624 | sepantasnya 625 | sepantasnyalah 626 | seperlunya 627 | seperti 628 | sepertinya 629 | sepihak 630 | sering 631 | seringnya 632 | serta 633 | serupa 634 | sesaat 635 | sesama 636 | sesampai 637 | sesegera 638 | sesekali 639 | seseorang 640 | sesuatu 641 | sesuatunya 642 | sesudah 643 | sesudahnya 644 | setelah 645 | setempat 646 | setengah 647 | seterusnya 648 | setiap 649 | setiba 650 | setibanya 651 | setidak-tidaknya 652 | setidaknya 653 | setinggi 654 | seusai 655 | sewaktu 656 | siap 657 | siapa 658 | siapakah 659 | siapapun 660 | sini 661 | sinilah 662 | soal 663 | soalnya 664 | suatu 665 | sudah 666 | sudahkah 667 | sudahlah 668 | supaya 669 | tadi 670 | tadinya 671 | tahu 672 | tahun 673 | tak 674 | tambah 675 | tambahnya 676 | tampak 677 | tampaknya 678 | tandas 679 | tandasnya 680 | tanpa 681 | tanya 682 | tanyakan 683 | tanyanya 684 | tapi 685 | tegas 686 | tegasnya 687 | telah 688 | tempat 689 | tengah 690 | tentang 691 | tentu 692 | tentulah 693 | tentunya 694 | tepat 695 | terakhir 696 | terasa 697 | terbanyak 698 | terdahulu 699 | terdapat 700 | terdiri 701 | terhadap 702 | terhadapnya 703 | teringat 704 | teringat-ingat 705 | terjadi 706 | terjadilah 707 | terjadinya 708 | terkira 709 | terlalu 710 | terlebih 711 | terlihat 712 | termasuk 713 | ternyata 714 | tersampaikan 715 | tersebut 716 | tersebutlah 717 | tertentu 718 | tertuju 719 | terus 720 | terutama 721 | tetap 722 | tetapi 723 | tiap 724 | tiba 725 | tiba-tiba 726 | tidak 727 | tidakkah 728 | tidaklah 729 | tiga 730 | tinggi 731 | toh 732 | tunjuk 733 | turut 734 | tutur 735 | tuturnya 736 | ucap 737 | ucapnya 738 | ujar 739 | ujarnya 740 | umum 741 | umumnya 742 | ungkap 743 | ungkapnya 744 | untuk 745 | usah 746 | usai 747 | waduh 748 | wah 749 | wahai 750 | waktu 751 | waktunya 752 | walau 753 | walaupun 754 | wong 755 | yaitu 756 | yakin 757 | yakni 758 | yang -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/italian: -------------------------------------------------------------------------------- 1 | ad 2 | al 3 | allo 4 | ai 5 | agli 6 | all 7 | agl 8 | alla 9 | alle 10 | con 11 | col 12 | coi 13 | da 14 | dal 15 | dallo 16 | dai 17 | dagli 18 | dall 19 | dagl 20 | dalla 21 | dalle 22 | di 23 | del 24 | dello 25 | dei 26 | degli 27 | dell 28 | degl 29 | della 30 | delle 31 | in 32 | nel 33 | nello 34 | nei 35 | negli 36 | nell 37 | negl 38 | nella 39 | nelle 40 | su 41 | sul 42 | sullo 43 | sui 44 | sugli 45 | sull 46 | sugl 47 | sulla 48 | sulle 49 | per 50 | tra 51 | contro 52 | io 53 | tu 54 | lui 55 | lei 56 | noi 57 | voi 58 | loro 59 | mio 60 | mia 61 | miei 62 | mie 63 | tuo 64 | tua 65 | tuoi 66 | tue 67 | suo 68 | sua 69 | suoi 70 | sue 71 | nostro 72 | nostra 73 | nostri 74 | nostre 75 | vostro 76 | vostra 77 | vostri 78 | vostre 79 | mi 80 | ti 81 | ci 82 | vi 83 | lo 84 | la 85 | li 86 | le 87 | gli 88 | ne 89 | il 90 | un 91 | uno 92 | una 93 | ma 94 | ed 95 | se 96 | perché 97 | anche 98 | come 99 | dov 100 | dove 101 | che 102 | chi 103 | cui 104 | non 105 | più 106 | quale 107 | quanto 108 | quanti 109 | quanta 110 | quante 111 | quello 112 | quelli 113 | quella 114 | quelle 115 | questo 116 | questi 117 | questa 118 | queste 119 | si 120 | tutto 121 | tutti 122 | a 123 | c 124 | e 125 | i 126 | l 127 | o 128 | ho 129 | hai 130 | ha 131 | abbiamo 132 | avete 133 | hanno 134 | abbia 135 | abbiate 136 | abbiano 137 | avrò 138 | avrai 139 | avrà 140 | avremo 141 | avrete 142 | avranno 143 | avrei 144 | avresti 145 | avrebbe 146 | avremmo 147 | avreste 148 | avrebbero 149 | avevo 150 | avevi 151 | aveva 152 | avevamo 153 | avevate 154 | avevano 155 | ebbi 156 | avesti 157 | ebbe 158 | avemmo 159 | aveste 160 | ebbero 161 | avessi 162 | avesse 163 | avessimo 164 | avessero 165 | avendo 166 | avuto 167 | avuta 168 | avuti 169 | avute 170 | sono 171 | sei 172 | è 173 | siamo 174 | siete 175 | sia 176 | siate 177 | siano 178 | sarò 179 | sarai 180 | sarà 181 | saremo 182 | sarete 183 | saranno 184 | sarei 185 | saresti 186 | sarebbe 187 | saremmo 188 | sareste 189 | sarebbero 190 | ero 191 | eri 192 | era 193 | eravamo 194 | eravate 195 | erano 196 | fui 197 | fosti 198 | fu 199 | fummo 200 | foste 201 | furono 202 | fossi 203 | fosse 204 | fossimo 205 | fossero 206 | essendo 207 | faccio 208 | fai 209 | facciamo 210 | fanno 211 | faccia 212 | facciate 213 | facciano 214 | farò 215 | farai 216 | farà 217 | faremo 218 | farete 219 | faranno 220 | farei 221 | faresti 222 | farebbe 223 | faremmo 224 | fareste 225 | farebbero 226 | facevo 227 | facevi 228 | faceva 229 | facevamo 230 | facevate 231 | facevano 232 | feci 233 | facesti 234 | fece 235 | facemmo 236 | faceste 237 | fecero 238 | facessi 239 | facesse 240 | facessimo 241 | facessero 242 | facendo 243 | sto 244 | stai 245 | sta 246 | stiamo 247 | stanno 248 | stia 249 | stiate 250 | stiano 251 | starò 252 | starai 253 | starà 254 | staremo 255 | starete 256 | staranno 257 | starei 258 | staresti 259 | starebbe 260 | staremmo 261 | stareste 262 | starebbero 263 | stavo 264 | stavi 265 | stava 266 | stavamo 267 | stavate 268 | stavano 269 | stetti 270 | stesti 271 | stette 272 | stemmo 273 | steste 274 | stettero 275 | stessi 276 | stesse 277 | stessimo 278 | stessero 279 | stando 280 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/kazakh: -------------------------------------------------------------------------------- 1 | ах 2 | ох 3 | эх 4 | ай 5 | эй 6 | ой 7 | тағы 8 | тағыда 9 | әрине 10 | жоқ 11 | сондай 12 | осындай 13 | осылай 14 | солай 15 | мұндай 16 | бұндай 17 | мен 18 | сен 19 | ол 20 | біз 21 | біздер 22 | олар 23 | сіз 24 | сіздер 25 | маған 26 | оған 27 | саған 28 | біздің 29 | сіздің 30 | оның 31 | бізге 32 | сізге 33 | оларға 34 | біздерге 35 | сіздерге 36 | оларға 37 | менімен 38 | сенімен 39 | онымен 40 | бізбен 41 | сізбен 42 | олармен 43 | біздермен 44 | сіздермен 45 | менің 46 | сенің 47 | біздің 48 | сіздің 49 | оның 50 | біздердің 51 | сіздердің 52 | олардың 53 | маған 54 | саған 55 | оған 56 | менен 57 | сенен 58 | одан 59 | бізден 60 | сізден 61 | олардан 62 | біздерден 63 | сіздерден 64 | олардан 65 | айтпақшы 66 | сонымен 67 | сондықтан 68 | бұл 69 | осы 70 | сол 71 | анау 72 | мынау 73 | сонау 74 | осынау 75 | ана 76 | мына 77 | сона 78 | әні 79 | міне 80 | өй 81 | үйт 82 | бүйт 83 | біреу 84 | кейбіреу 85 | кейбір 86 | қайсыбір 87 | әрбір 88 | бірнеше 89 | бірдеме 90 | бірнеше 91 | әркім 92 | әрне 93 | әрқайсы 94 | әрқалай 95 | әлдекім 96 | әлдене 97 | әлдеқайдан 98 | әлденеше 99 | әлдеқалай 100 | әлдеқашан 101 | алдақашан 102 | еш 103 | ешкім 104 | ешбір 105 | ештеме 106 | дәнеңе 107 | ешқашан 108 | ешқандай 109 | ешқайсы 110 | емес 111 | бәрі 112 | барлық 113 | барша 114 | бар 115 | күллі 116 | бүкіл 117 | түгел 118 | өз 119 | өзім 120 | өзің 121 | өзінің 122 | өзіме 123 | өзіне 124 | өзімнің 125 | өзі 126 | өзге 127 | менде 128 | сенде 129 | онда 130 | менен 131 | сенен онан 132 | одан 133 | ау 134 | па 135 | ей 136 | әй 137 | е 138 | уа 139 | уау 140 | уай 141 | я 142 | пай 143 | ә 144 | о 145 | оһо 146 | ой 147 | ие 148 | аһа 149 | ау 150 | беу 151 | мәссаған 152 | бәрекелді 153 | әттегенай 154 | жаракімалла 155 | масқарай 156 | астапыралла 157 | япырмай 158 | ойпырмай 159 | кәне 160 | кәнеки 161 | ал 162 | әйда 163 | кәні 164 | міне 165 | әні 166 | сорап 167 | қош-қош 168 | пфша 169 | пішә 170 | құрау-құрау 171 | шәйт 172 | шек 173 | моһ 174 | тәк 175 | құрау 176 | құр 177 | кә 178 | кәһ 179 | күшім 180 | күшім 181 | мышы 182 | пырс 183 | әукім 184 | алақай 185 | паһ-паһ 186 | бәрекелді 187 | ура 188 | әттең 189 | әттеген-ай 190 | қап 191 | түге 192 | пішту 193 | шіркін 194 | алатау 195 | пай-пай 196 | үшін 197 | сайын 198 | сияқты 199 | туралы 200 | арқылы 201 | бойы 202 | бойымен 203 | шамалы 204 | шақты 205 | қаралы 206 | ғұрлы 207 | ғұрлым 208 | шейін 209 | дейін 210 | қарай 211 | таман 212 | салым 213 | тарта 214 | жуық 215 | таяу 216 | гөрі 217 | бері 218 | кейін 219 | соң 220 | бұрын 221 | бетер 222 | қатар 223 | бірге 224 | қоса 225 | арс 226 | 227 | гүрс 228 | 229 | дүрс 230 | 231 | қорс 232 | 233 | тарс 234 | 235 | тырс 236 | 237 | ырс 238 | 239 | барқ 240 | 241 | борт 242 | 243 | күрт 244 | 245 | кірт 246 | 247 | морт 248 | 249 | сарт 250 | 251 | шырт 252 | 253 | дүңк 254 | 255 | күңк 256 | 257 | қыңқ 258 | 259 | мыңқ 260 | 261 | маңқ 262 | 263 | саңқ 264 | 265 | шаңқ 266 | 267 | шіңк 268 | 269 | сыңқ 270 | 271 | таңқ 272 | 273 | тыңқ 274 | 275 | ыңқ 276 | 277 | болп 278 | 279 | былп 280 | 281 | жалп 282 | 283 | желп 284 | 285 | қолп 286 | 287 | ірк 288 | 289 | ырқ 290 | 291 | сарт-сұрт 292 | 293 | тарс-тұрс 294 | 295 | арс-ұрс 296 | 297 | жалт-жалт 298 | 299 | жалт-жұлт 300 | 301 | қалт-қалт 302 | 303 | қалт-құлт 304 | 305 | қаңқ-қаңқ 306 | 307 | қаңқ-құңқ 308 | 309 | шаңқ-шаңқ 310 | 311 | шаңқ-шұңқ 312 | 313 | арбаң-арбаң 314 | 315 | бүгжең-бүгжең 316 | 317 | арсалаң-арсалаң 318 | 319 | ербелең-ербелең 320 | 321 | батыр-бұтыр 322 | 323 | далаң-далаң 324 | 325 | тарбаң-тарбаң 326 | 327 | қызараң-қызараң 328 | 329 | қаңғыр-күңгір 330 | 331 | қайқаң-құйқаң 332 | 333 | митың-митың 334 | 335 | салаң-сұлаң 336 | 337 | ыржың-тыржың 338 | бірақ 339 | алайда 340 | дегенмен 341 | әйтпесе 342 | әйткенмен 343 | себебі 344 | өйткені 345 | сондықтан 346 | үшін 347 | сайын 348 | сияқты 349 | туралы 350 | арқылы 351 | бойы 352 | бойымен 353 | шамалы 354 | шақты 355 | қаралы 356 | ғұрлы 357 | ғұрлым 358 | гөрі 359 | бері 360 | кейін 361 | соң 362 | бұрын 363 | бетер 364 | қатар 365 | бірге 366 | қоса 367 | шейін 368 | дейін 369 | қарай 370 | таман 371 | салым 372 | тарта 373 | жуық 374 | таяу 375 | арнайы 376 | осындай 377 | ғана 378 | қана 379 | тек 380 | әншейін 381 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/nepali: -------------------------------------------------------------------------------- 1 | छ 2 | र 3 | पनि 4 | छन् 5 | लागि 6 | भएको 7 | गरेको 8 | भने 9 | गर्न 10 | गर्ने 11 | हो 12 | तथा 13 | यो 14 | रहेको 15 | उनले 16 | थियो 17 | हुने 18 | गरेका 19 | थिए 20 | गर्दै 21 | तर 22 | नै 23 | को 24 | मा 25 | हुन् 26 | भन्ने 27 | हुन 28 | गरी 29 | त 30 | हुन्छ 31 | अब 32 | के 33 | रहेका 34 | गरेर 35 | छैन 36 | दिए 37 | भए 38 | यस 39 | ले 40 | गर्नु 41 | औं 42 | सो 43 | त्यो 44 | कि 45 | जुन 46 | यी 47 | का 48 | गरि 49 | ती 50 | न 51 | छु 52 | छौं 53 | लाई 54 | नि 55 | उप 56 | अक्सर 57 | आदि 58 | कसरी 59 | क्रमशः 60 | चाले 61 | अगाडी 62 | अझै 63 | अनुसार 64 | अन्तर्गत 65 | अन्य 66 | अन्यत्र 67 | अन्यथा 68 | अरु 69 | अरुलाई 70 | अर्को 71 | अर्थात 72 | अर्थात् 73 | अलग 74 | आए 75 | आजको 76 | ओठ 77 | आत्म 78 | आफू 79 | आफूलाई 80 | आफ्नै 81 | आफ्नो 82 | आयो 83 | उदाहरण 84 | उनको 85 | उहालाई 86 | एउटै 87 | एक 88 | एकदम 89 | कतै 90 | कम से कम 91 | कसै 92 | कसैले 93 | कहाँबाट 94 | कहिलेकाहीं 95 | का 96 | किन 97 | किनभने 98 | कुनै 99 | कुरा 100 | कृपया 101 | केही 102 | कोही 103 | गए 104 | गरौं 105 | गर्छ 106 | गर्छु 107 | गर्नुपर्छ 108 | गयौ 109 | गैर 110 | चार 111 | चाहनुहुन्छ 112 | चाहन्छु 113 | चाहिए 114 | छू 115 | जताततै 116 | जब 117 | जबकि 118 | जसको 119 | जसबाट 120 | जसमा 121 | जसलाई 122 | जसले 123 | जस्तै 124 | जस्तो 125 | जस्तोसुकै 126 | जहाँ 127 | जान 128 | जाहिर 129 | जे 130 | जो 131 | ठीक 132 | तत्काल 133 | तदनुसार 134 | तपाईको 135 | तपाई 136 | पर्याप्त 137 | पहिले 138 | पहिलो 139 | पहिल्यै 140 | पाँच 141 | पाँचौं 142 | तल 143 | तापनी 144 | तिनी 145 | तिनीहरू 146 | तिनीहरुको 147 | तिनिहरुलाई 148 | तिमी 149 | तिर 150 | तीन 151 | तुरुन्तै 152 | तेस्रो 153 | तेस्कारण 154 | पूर्व 155 | प्रति 156 | प्रतेक 157 | प्लस 158 | फेरी 159 | बने 160 | त्सपछि 161 | त्सैले 162 | त्यहाँ 163 | थिएन 164 | दिनुभएको 165 | दिनुहुन्छ 166 | दुई 167 | देखि 168 | बरु 169 | बारे 170 | बाहिर 171 | देखिन्छ 172 | देखियो 173 | देखे 174 | देखेको 175 | देखेर 176 | दोस्रो 177 | धेरै 178 | नजिकै 179 | नत्र 180 | नयाँ 181 | निम्ति 182 | बाहेक 183 | बीच 184 | बीचमा 185 | भन 186 | निम्न 187 | निम्नानुसार 188 | निर्दिष्ट 189 | नौ 190 | पक्का 191 | पक्कै 192 | पछि 193 | पछिल्लो 194 | पटक 195 | पर्छ 196 | पर्थ्यो 197 | भन्छन् 198 | भन् 199 | भन्छु 200 | भन्दा 201 | भन्नुभयो 202 | भर 203 | भित्र 204 | भित्री 205 | म 206 | मलाई 207 | मात्र 208 | माथि 209 | मुख्य 210 | मेरो 211 | यति 212 | यथोचित 213 | यदि 214 | यद्यपि 215 | यसको 216 | यसपछि 217 | यसबाहेक 218 | यसरी 219 | यसो 220 | यस्तो 221 | यहाँ 222 | यहाँसम्म 223 | या 224 | रही 225 | राखे 226 | राख्छ 227 | राम्रो 228 | रूप 229 | लगभग 230 | वरीपरी 231 | वास्तवमा 232 | बिरुद्ध 233 | बिशेष 234 | सायद 235 | शायद 236 | संग 237 | संगै 238 | सक्छ 239 | सट्टा 240 | सधै 241 | सबै 242 | सबैलाई 243 | समय 244 | सम्भव 245 | सम्म 246 | सही 247 | साँच्चै 248 | सात 249 | साथ 250 | साथै 251 | सारा 252 | सोही 253 | स्पष्ट 254 | हरे 255 | हरेक -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/norwegian: -------------------------------------------------------------------------------- 1 | og 2 | i 3 | jeg 4 | det 5 | at 6 | en 7 | et 8 | den 9 | til 10 | er 11 | som 12 | på 13 | de 14 | med 15 | han 16 | av 17 | ikke 18 | ikkje 19 | der 20 | så 21 | var 22 | meg 23 | seg 24 | men 25 | ett 26 | har 27 | om 28 | vi 29 | min 30 | mitt 31 | ha 32 | hadde 33 | hun 34 | nå 35 | over 36 | da 37 | ved 38 | fra 39 | du 40 | ut 41 | sin 42 | dem 43 | oss 44 | opp 45 | man 46 | kan 47 | hans 48 | hvor 49 | eller 50 | hva 51 | skal 52 | selv 53 | sjøl 54 | her 55 | alle 56 | vil 57 | bli 58 | ble 59 | blei 60 | blitt 61 | kunne 62 | inn 63 | når 64 | være 65 | kom 66 | noen 67 | noe 68 | ville 69 | dere 70 | som 71 | deres 72 | kun 73 | ja 74 | etter 75 | ned 76 | skulle 77 | denne 78 | for 79 | deg 80 | si 81 | sine 82 | sitt 83 | mot 84 | å 85 | meget 86 | hvorfor 87 | dette 88 | disse 89 | uten 90 | hvordan 91 | ingen 92 | din 93 | ditt 94 | blir 95 | samme 96 | hvilken 97 | hvilke 98 | sånn 99 | inni 100 | mellom 101 | vår 102 | hver 103 | hvem 104 | vors 105 | hvis 106 | både 107 | bare 108 | enn 109 | fordi 110 | før 111 | mange 112 | også 113 | slik 114 | vært 115 | være 116 | båe 117 | begge 118 | siden 119 | dykk 120 | dykkar 121 | dei 122 | deira 123 | deires 124 | deim 125 | di 126 | då 127 | eg 128 | ein 129 | eit 130 | eitt 131 | elles 132 | honom 133 | hjå 134 | ho 135 | hoe 136 | henne 137 | hennar 138 | hennes 139 | hoss 140 | hossen 141 | ikkje 142 | ingi 143 | inkje 144 | korleis 145 | korso 146 | kva 147 | kvar 148 | kvarhelst 149 | kven 150 | kvi 151 | kvifor 152 | me 153 | medan 154 | mi 155 | mine 156 | mykje 157 | no 158 | nokon 159 | noka 160 | nokor 161 | noko 162 | nokre 163 | si 164 | sia 165 | sidan 166 | so 167 | somt 168 | somme 169 | um 170 | upp 171 | vere 172 | vore 173 | verte 174 | vort 175 | varte 176 | vart 177 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/portuguese: -------------------------------------------------------------------------------- 1 | a 2 | à 3 | ao 4 | aos 5 | aquela 6 | aquelas 7 | aquele 8 | aqueles 9 | aquilo 10 | as 11 | às 12 | até 13 | com 14 | como 15 | da 16 | das 17 | de 18 | dela 19 | delas 20 | dele 21 | deles 22 | depois 23 | do 24 | dos 25 | e 26 | é 27 | ela 28 | elas 29 | ele 30 | eles 31 | em 32 | entre 33 | era 34 | eram 35 | éramos 36 | essa 37 | essas 38 | esse 39 | esses 40 | esta 41 | está 42 | estamos 43 | estão 44 | estar 45 | estas 46 | estava 47 | estavam 48 | estávamos 49 | este 50 | esteja 51 | estejam 52 | estejamos 53 | estes 54 | esteve 55 | estive 56 | estivemos 57 | estiver 58 | estivera 59 | estiveram 60 | estivéramos 61 | estiverem 62 | estivermos 63 | estivesse 64 | estivessem 65 | estivéssemos 66 | estou 67 | eu 68 | foi 69 | fomos 70 | for 71 | fora 72 | foram 73 | fôramos 74 | forem 75 | formos 76 | fosse 77 | fossem 78 | fôssemos 79 | fui 80 | há 81 | haja 82 | hajam 83 | hajamos 84 | hão 85 | havemos 86 | haver 87 | hei 88 | houve 89 | houvemos 90 | houver 91 | houvera 92 | houverá 93 | houveram 94 | houvéramos 95 | houverão 96 | houverei 97 | houverem 98 | houveremos 99 | houveria 100 | houveriam 101 | houveríamos 102 | houvermos 103 | houvesse 104 | houvessem 105 | houvéssemos 106 | isso 107 | isto 108 | já 109 | lhe 110 | lhes 111 | mais 112 | mas 113 | me 114 | mesmo 115 | meu 116 | meus 117 | minha 118 | minhas 119 | muito 120 | na 121 | não 122 | nas 123 | nem 124 | no 125 | nos 126 | nós 127 | nossa 128 | nossas 129 | nosso 130 | nossos 131 | num 132 | numa 133 | o 134 | os 135 | ou 136 | para 137 | pela 138 | pelas 139 | pelo 140 | pelos 141 | por 142 | qual 143 | quando 144 | que 145 | quem 146 | são 147 | se 148 | seja 149 | sejam 150 | sejamos 151 | sem 152 | ser 153 | será 154 | serão 155 | serei 156 | seremos 157 | seria 158 | seriam 159 | seríamos 160 | seu 161 | seus 162 | só 163 | somos 164 | sou 165 | sua 166 | suas 167 | também 168 | te 169 | tem 170 | tém 171 | temos 172 | tenha 173 | tenham 174 | tenhamos 175 | tenho 176 | terá 177 | terão 178 | terei 179 | teremos 180 | teria 181 | teriam 182 | teríamos 183 | teu 184 | teus 185 | teve 186 | tinha 187 | tinham 188 | tínhamos 189 | tive 190 | tivemos 191 | tiver 192 | tivera 193 | tiveram 194 | tivéramos 195 | tiverem 196 | tivermos 197 | tivesse 198 | tivessem 199 | tivéssemos 200 | tu 201 | tua 202 | tuas 203 | um 204 | uma 205 | você 206 | vocês 207 | vos 208 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/romanian: -------------------------------------------------------------------------------- 1 | a 2 | abia 3 | acea 4 | aceasta 5 | această 6 | aceea 7 | aceeasi 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acelasi 13 | acele 14 | acelea 15 | acest 16 | acesta 17 | aceste 18 | acestea 19 | acestei 20 | acestia 21 | acestui 22 | aceşti 23 | aceştia 24 | adica 25 | ai 26 | aia 27 | aibă 28 | aici 29 | al 30 | ala 31 | ale 32 | alea 33 | alt 34 | alta 35 | altceva 36 | altcineva 37 | alte 38 | altfel 39 | alti 40 | altii 41 | altul 42 | am 43 | anume 44 | apoi 45 | ar 46 | are 47 | as 48 | asa 49 | asta 50 | astea 51 | astfel 52 | asupra 53 | atare 54 | atat 55 | atata 56 | atatea 57 | atatia 58 | ati 59 | atit 60 | atita 61 | atitea 62 | atitia 63 | atunci 64 | au 65 | avea 66 | avem 67 | aveţi 68 | avut 69 | aş 70 | aţi 71 | ba 72 | ca 73 | cam 74 | cand 75 | care 76 | careia 77 | carora 78 | caruia 79 | cat 80 | catre 81 | ce 82 | cea 83 | ceea 84 | cei 85 | ceilalti 86 | cel 87 | cele 88 | celor 89 | ceva 90 | chiar 91 | ci 92 | cind 93 | cine 94 | cineva 95 | cit 96 | cita 97 | cite 98 | citeva 99 | citi 100 | citiva 101 | cu 102 | cui 103 | cum 104 | cumva 105 | cât 106 | câte 107 | câtva 108 | câţi 109 | cînd 110 | cît 111 | cîte 112 | cîtva 113 | cîţi 114 | că 115 | căci 116 | cărei 117 | căror 118 | cărui 119 | către 120 | da 121 | daca 122 | dacă 123 | dar 124 | dat 125 | dată 126 | dau 127 | de 128 | deasupra 129 | deci 130 | decit 131 | deja 132 | desi 133 | despre 134 | deşi 135 | din 136 | dintr 137 | dintr- 138 | dintre 139 | doar 140 | doi 141 | doilea 142 | două 143 | drept 144 | dupa 145 | după 146 | dă 147 | e 148 | ea 149 | ei 150 | el 151 | ele 152 | era 153 | eram 154 | este 155 | eu 156 | eşti 157 | face 158 | fara 159 | fata 160 | fel 161 | fi 162 | fie 163 | fiecare 164 | fii 165 | fim 166 | fiu 167 | fiţi 168 | foarte 169 | fost 170 | fără 171 | i 172 | ia 173 | iar 174 | ii 175 | il 176 | imi 177 | in 178 | inainte 179 | inapoi 180 | inca 181 | incit 182 | insa 183 | intr 184 | intre 185 | isi 186 | iti 187 | la 188 | le 189 | li 190 | lor 191 | lui 192 | lângă 193 | lîngă 194 | m 195 | ma 196 | mai 197 | mea 198 | mei 199 | mele 200 | mereu 201 | meu 202 | mi 203 | mie 204 | mine 205 | mod 206 | mult 207 | multa 208 | multe 209 | multi 210 | multă 211 | mulţi 212 | mâine 213 | mîine 214 | mă 215 | ne 216 | ni 217 | nici 218 | nimeni 219 | nimic 220 | niste 221 | nişte 222 | noastre 223 | noastră 224 | noi 225 | nostri 226 | nostru 227 | nou 228 | noua 229 | nouă 230 | noştri 231 | nu 232 | numai 233 | o 234 | or 235 | ori 236 | oricare 237 | orice 238 | oricine 239 | oricum 240 | oricând 241 | oricât 242 | oricînd 243 | oricît 244 | oriunde 245 | pai 246 | parca 247 | patra 248 | patru 249 | pe 250 | pentru 251 | peste 252 | pic 253 | pina 254 | poate 255 | pot 256 | prea 257 | prima 258 | primul 259 | prin 260 | printr- 261 | putini 262 | puţin 263 | puţina 264 | puţină 265 | până 266 | pînă 267 | sa 268 | sa-mi 269 | sa-ti 270 | sai 271 | sale 272 | sau 273 | se 274 | si 275 | sint 276 | sintem 277 | spate 278 | spre 279 | sub 280 | sunt 281 | suntem 282 | sunteţi 283 | sus 284 | să 285 | săi 286 | său 287 | t 288 | ta 289 | tale 290 | te 291 | ti 292 | tine 293 | toata 294 | toate 295 | toată 296 | tocmai 297 | tot 298 | toti 299 | totul 300 | totusi 301 | totuşi 302 | toţi 303 | trei 304 | treia 305 | treilea 306 | tu 307 | tuturor 308 | tăi 309 | tău 310 | u 311 | ul 312 | ului 313 | un 314 | una 315 | unde 316 | undeva 317 | unei 318 | uneia 319 | unele 320 | uneori 321 | unii 322 | unor 323 | unora 324 | unu 325 | unui 326 | unuia 327 | unul 328 | v 329 | va 330 | vi 331 | voastre 332 | voastră 333 | voi 334 | vom 335 | vor 336 | vostru 337 | vouă 338 | voştri 339 | vreo 340 | vreun 341 | vă 342 | zi 343 | zice 344 | îi 345 | îl 346 | îmi 347 | în 348 | îţi 349 | ăla 350 | ălea 351 | ăsta 352 | ăstea 353 | ăştia 354 | şi 355 | ţi 356 | ţie -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/russian: -------------------------------------------------------------------------------- 1 | и 2 | в 3 | во 4 | не 5 | что 6 | он 7 | на 8 | я 9 | с 10 | со 11 | как 12 | а 13 | то 14 | все 15 | она 16 | так 17 | его 18 | но 19 | да 20 | ты 21 | к 22 | у 23 | же 24 | вы 25 | за 26 | бы 27 | по 28 | только 29 | ее 30 | мне 31 | было 32 | вот 33 | от 34 | меня 35 | еще 36 | нет 37 | о 38 | из 39 | ему 40 | теперь 41 | когда 42 | даже 43 | ну 44 | вдруг 45 | ли 46 | если 47 | уже 48 | или 49 | ни 50 | быть 51 | был 52 | него 53 | до 54 | вас 55 | нибудь 56 | опять 57 | уж 58 | вам 59 | ведь 60 | там 61 | потом 62 | себя 63 | ничего 64 | ей 65 | может 66 | они 67 | тут 68 | где 69 | есть 70 | надо 71 | ней 72 | для 73 | мы 74 | тебя 75 | их 76 | чем 77 | была 78 | сам 79 | чтоб 80 | без 81 | будто 82 | чего 83 | раз 84 | тоже 85 | себе 86 | под 87 | будет 88 | ж 89 | тогда 90 | кто 91 | этот 92 | того 93 | потому 94 | этого 95 | какой 96 | совсем 97 | ним 98 | здесь 99 | этом 100 | один 101 | почти 102 | мой 103 | тем 104 | чтобы 105 | нее 106 | сейчас 107 | были 108 | куда 109 | зачем 110 | всех 111 | никогда 112 | можно 113 | при 114 | наконец 115 | два 116 | об 117 | другой 118 | хоть 119 | после 120 | над 121 | больше 122 | тот 123 | через 124 | эти 125 | нас 126 | про 127 | всего 128 | них 129 | какая 130 | много 131 | разве 132 | три 133 | эту 134 | моя 135 | впрочем 136 | хорошо 137 | свою 138 | этой 139 | перед 140 | иногда 141 | лучше 142 | чуть 143 | том 144 | нельзя 145 | такой 146 | им 147 | более 148 | всегда 149 | конечно 150 | всю 151 | между 152 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/spanish: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | que 4 | el 5 | en 6 | y 7 | a 8 | los 9 | del 10 | se 11 | las 12 | por 13 | un 14 | para 15 | con 16 | no 17 | una 18 | su 19 | al 20 | lo 21 | como 22 | más 23 | pero 24 | sus 25 | le 26 | ya 27 | o 28 | este 29 | sí 30 | porque 31 | esta 32 | entre 33 | cuando 34 | muy 35 | sin 36 | sobre 37 | también 38 | me 39 | hasta 40 | hay 41 | donde 42 | quien 43 | desde 44 | todo 45 | nos 46 | durante 47 | todos 48 | uno 49 | les 50 | ni 51 | contra 52 | otros 53 | ese 54 | eso 55 | ante 56 | ellos 57 | e 58 | esto 59 | mí 60 | antes 61 | algunos 62 | qué 63 | unos 64 | yo 65 | otro 66 | otras 67 | otra 68 | él 69 | tanto 70 | esa 71 | estos 72 | mucho 73 | quienes 74 | nada 75 | muchos 76 | cual 77 | poco 78 | ella 79 | estar 80 | estas 81 | algunas 82 | algo 83 | nosotros 84 | mi 85 | mis 86 | tú 87 | te 88 | ti 89 | tu 90 | tus 91 | ellas 92 | nosotras 93 | vosotros 94 | vosotras 95 | os 96 | mío 97 | mía 98 | míos 99 | mías 100 | tuyo 101 | tuya 102 | tuyos 103 | tuyas 104 | suyo 105 | suya 106 | suyos 107 | suyas 108 | nuestro 109 | nuestra 110 | nuestros 111 | nuestras 112 | vuestro 113 | vuestra 114 | vuestros 115 | vuestras 116 | esos 117 | esas 118 | estoy 119 | estás 120 | está 121 | estamos 122 | estáis 123 | están 124 | esté 125 | estés 126 | estemos 127 | estéis 128 | estén 129 | estaré 130 | estarás 131 | estará 132 | estaremos 133 | estaréis 134 | estarán 135 | estaría 136 | estarías 137 | estaríamos 138 | estaríais 139 | estarían 140 | estaba 141 | estabas 142 | estábamos 143 | estabais 144 | estaban 145 | estuve 146 | estuviste 147 | estuvo 148 | estuvimos 149 | estuvisteis 150 | estuvieron 151 | estuviera 152 | estuvieras 153 | estuviéramos 154 | estuvierais 155 | estuvieran 156 | estuviese 157 | estuvieses 158 | estuviésemos 159 | estuvieseis 160 | estuviesen 161 | estando 162 | estado 163 | estada 164 | estados 165 | estadas 166 | estad 167 | he 168 | has 169 | ha 170 | hemos 171 | habéis 172 | han 173 | haya 174 | hayas 175 | hayamos 176 | hayáis 177 | hayan 178 | habré 179 | habrás 180 | habrá 181 | habremos 182 | habréis 183 | habrán 184 | habría 185 | habrías 186 | habríamos 187 | habríais 188 | habrían 189 | había 190 | habías 191 | habíamos 192 | habíais 193 | habían 194 | hube 195 | hubiste 196 | hubo 197 | hubimos 198 | hubisteis 199 | hubieron 200 | hubiera 201 | hubieras 202 | hubiéramos 203 | hubierais 204 | hubieran 205 | hubiese 206 | hubieses 207 | hubiésemos 208 | hubieseis 209 | hubiesen 210 | habiendo 211 | habido 212 | habida 213 | habidos 214 | habidas 215 | soy 216 | eres 217 | es 218 | somos 219 | sois 220 | son 221 | sea 222 | seas 223 | seamos 224 | seáis 225 | sean 226 | seré 227 | serás 228 | será 229 | seremos 230 | seréis 231 | serán 232 | sería 233 | serías 234 | seríamos 235 | seríais 236 | serían 237 | era 238 | eras 239 | éramos 240 | erais 241 | eran 242 | fui 243 | fuiste 244 | fue 245 | fuimos 246 | fuisteis 247 | fueron 248 | fuera 249 | fueras 250 | fuéramos 251 | fuerais 252 | fueran 253 | fuese 254 | fueses 255 | fuésemos 256 | fueseis 257 | fuesen 258 | sintiendo 259 | sentido 260 | sentida 261 | sentidos 262 | sentidas 263 | siente 264 | sentid 265 | tengo 266 | tienes 267 | tiene 268 | tenemos 269 | tenéis 270 | tienen 271 | tenga 272 | tengas 273 | tengamos 274 | tengáis 275 | tengan 276 | tendré 277 | tendrás 278 | tendrá 279 | tendremos 280 | tendréis 281 | tendrán 282 | tendría 283 | tendrías 284 | tendríamos 285 | tendríais 286 | tendrían 287 | tenía 288 | tenías 289 | teníamos 290 | teníais 291 | tenían 292 | tuve 293 | tuviste 294 | tuvo 295 | tuvimos 296 | tuvisteis 297 | tuvieron 298 | tuviera 299 | tuvieras 300 | tuviéramos 301 | tuvierais 302 | tuvieran 303 | tuviese 304 | tuvieses 305 | tuviésemos 306 | tuvieseis 307 | tuviesen 308 | teniendo 309 | tenido 310 | tenida 311 | tenidos 312 | tenidas 313 | tened 314 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/swedish: -------------------------------------------------------------------------------- 1 | och 2 | det 3 | att 4 | i 5 | en 6 | jag 7 | hon 8 | som 9 | han 10 | på 11 | den 12 | med 13 | var 14 | sig 15 | för 16 | så 17 | till 18 | är 19 | men 20 | ett 21 | om 22 | hade 23 | de 24 | av 25 | icke 26 | mig 27 | du 28 | henne 29 | då 30 | sin 31 | nu 32 | har 33 | inte 34 | hans 35 | honom 36 | skulle 37 | hennes 38 | där 39 | min 40 | man 41 | ej 42 | vid 43 | kunde 44 | något 45 | från 46 | ut 47 | när 48 | efter 49 | upp 50 | vi 51 | dem 52 | vara 53 | vad 54 | över 55 | än 56 | dig 57 | kan 58 | sina 59 | här 60 | ha 61 | mot 62 | alla 63 | under 64 | någon 65 | eller 66 | allt 67 | mycket 68 | sedan 69 | ju 70 | denna 71 | själv 72 | detta 73 | åt 74 | utan 75 | varit 76 | hur 77 | ingen 78 | mitt 79 | ni 80 | bli 81 | blev 82 | oss 83 | din 84 | dessa 85 | några 86 | deras 87 | blir 88 | mina 89 | samma 90 | vilken 91 | er 92 | sådan 93 | vår 94 | blivit 95 | dess 96 | inom 97 | mellan 98 | sådant 99 | varför 100 | varje 101 | vilka 102 | ditt 103 | vem 104 | vilket 105 | sitta 106 | sådana 107 | vart 108 | dina 109 | vars 110 | vårt 111 | våra 112 | ert 113 | era 114 | vilkas 115 | -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/tajik: -------------------------------------------------------------------------------- 1 | аз 2 | дар 3 | ба 4 | бо 5 | барои 6 | бе 7 | то 8 | ҷуз 9 | пеши 10 | назди 11 | рӯйи 12 | болои 13 | паси 14 | ғайри 15 | ҳамон 16 | ҳамоно 17 | инҷониб 18 | замон 19 | замоно 20 | эътиборан 21 | пеш 22 | қабл 23 | дида 24 | сар карда 25 | агар 26 | агар ки 27 | валекин 28 | ки 29 | лекин 30 | аммо 31 | вале 32 | балки 33 | ва 34 | ҳарчанд 35 | чунки 36 | зеро 37 | зеро ки 38 | вақте ки 39 | то вақте ки 40 | барои он ки 41 | бо нияти он ки 42 | лекин ва ҳол он ки 43 | ё 44 | ё ин ки 45 | бе он ки 46 | дар ҳолате ки 47 | то даме ки 48 | баъд аз он ки 49 | даме ки 50 | ба тразе ки 51 | аз баҳри он ки 52 | гар 53 | ар 54 | ба шарте 55 | азбаски 56 | модоме ки 57 | агар чи 58 | гарчанде ки 59 | бо вуҷуди он ки 60 | гӯё 61 | аз-баски 62 | чун-ки 63 | агар-чанд 64 | агар-чи 65 | гар-чи 66 | то ки 67 | чунон ки 68 | то даме ки 69 | ҳар қадар ки 70 | магар 71 | оё 72 | наход 73 | ҳатто 74 | ҳам 75 | бале 76 | оре 77 | хуб 78 | хуш 79 | хайр 80 | не 81 | на 82 | мана 83 | э 84 | фақат 85 | танҳо 86 | кошки 87 | мабодо 88 | ҳтимол 89 | ана ҳамин 90 | наход ки 91 | ҳатто ки 92 | аз афташ 93 | майлаш куя 94 | ана 95 | ҳа 96 | канӣ 97 | гӯё ки 98 | ҳо ана 99 | на ин ки 100 | ваҳ 101 | ҳой 102 | и 103 | а 104 | о 105 | эҳ 106 | ҳе 107 | ҳу 108 | аҳа 109 | оҳе 110 | уҳа 111 | ҳм 112 | нм 113 | оббо 114 | ӯббо 115 | ҳой-ҳой 116 | вой-вой 117 | ту-ту 118 | ҳмм 119 | эҳа 120 | тавба 121 | ӯҳӯ 122 | аҷабо 123 | ало 124 | аё 125 | ой 126 | ӯим 127 | ором 128 | хом?ш 129 | ҳай-ҳай 130 | бай-бай 131 | аз 132 | он 133 | баъд 134 | азбаски 135 | ӯ 136 | ҳангоми 137 | чӣ 138 | кадом 139 | ин 140 | ҷо 141 | ҳам 142 | ё ки 143 | бояд 144 | аст 145 | чанд 146 | ҳар 147 | бар 148 | чаро ки 149 | агар 150 | то кӣ 151 | бинобар 152 | бинобар ин 153 | ҳаргиз 154 | асло 155 | нахот 156 | нахот ки 157 | кошкӣ 158 | шояд 159 | шояд ки 160 | охир 161 | аз рӯи 162 | аз рӯйи 163 | рӯ -------------------------------------------------------------------------------- /src/sources/nltk_data/corpora/stopwords/turkish: -------------------------------------------------------------------------------- 1 | acaba 2 | ama 3 | aslında 4 | az 5 | bazı 6 | belki 7 | biri 8 | birkaç 9 | birşey 10 | biz 11 | bu 12 | çok 13 | çünkü 14 | da 15 | daha 16 | de 17 | defa 18 | diye 19 | eğer 20 | en 21 | gibi 22 | hem 23 | hep 24 | hepsi 25 | her 26 | hiç 27 | için 28 | ile 29 | ise 30 | kez 31 | ki 32 | kim 33 | mı 34 | mu 35 | mü 36 | nasıl 37 | ne 38 | neden 39 | nerde 40 | nerede 41 | nereye 42 | niçin 43 | niye 44 | o 45 | sanki 46 | şey 47 | siz 48 | şu 49 | tüm 50 | ve 51 | veya 52 | ya 53 | yani 54 | -------------------------------------------------------------------------------- /src/sources/post_process.py: -------------------------------------------------------------------------------- 1 | import re, os 2 | import sqlparse 3 | import argparse 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--llm", default='codellama') 6 | parser.add_argument("--file", default='') 7 | parser.add_argument("--output", default="") 8 | 9 | args = parser.parse_args() 10 | 11 | llm = args.llm 12 | 13 | def process_duplication(sql): 14 | sql = sql.strip().split("#")[0] 15 | return sql 16 | 17 | def extract_sql(query, llm='codellama'): 18 | if llm == "sensechat": 19 | query = query.replace("### SQL:","").replace("###","").replace("#","") 20 | if '```SQL ' in query: 21 | try: 22 | query = re.findall(r"```SQL(.*?)```", query.replace('\n', ' '))[0] 23 | # return re.findall(r"```SQL(.*?)```", query.replace('\n', ' '))[0] 24 | except: 25 | query = re.findall(r"```SQL(.*?)", query.replace('\n', ' '))[0] 26 | # return re.findall(r"```SQL(.*?)", query.replace('\n', ' '))[0] 27 | if '```sql ' in query: 28 | 29 | # print(query) 30 | try: 31 | return re.findall(r"```sql(.*?)```", query.replace('\n', ' '))[0] 32 | except: 33 | return re.findall(r"```sql(.*?)", query.replace('\n', ' '))[0] 34 | else: 35 | return query.replace('\n', '').replace("`","") 36 | elif llm == "codellama" or llm == "sqlcoder": 37 | if ';' in query: 38 | res = re.findall(r'(.*?);', query.replace('\n', ' '))[0].strip() 39 | # print(res) 40 | res = res.replace('# ', '').replace('##', '') 41 | if res.lower().strip().startswith("SELECT".lower()): 42 | return res.replace('# ', '').replace('##', '') 43 | else: 44 | return "SELECT " + res 45 | 46 | else: 47 | res = query.replace('\n', '').split("###")[0].replace('# ', '').replace('##', '') 48 | if res.lower().strip().startswith("SELECT".lower()): 49 | return res.replace('# ', '').replace('##', '') 50 | else: 51 | return "SELECT " + res 52 | elif llm == "puyu": 53 | res = query.replace("#","") 54 | if res.lower().strip().startswith("SELECT".lower()): 55 | return res 56 | else: 57 | return "SELECT " + res 58 | elif llm == 'gpt': 59 | sql = " ".join(query.replace("\n", " ").split()) 60 | sql = process_duplication(sql) 61 | # python version should >= 3.8 62 | if sql.lower().strip().startswith("select"): 63 | return sql 64 | elif sql.startswith(" "): 65 | return "SELECT" + sql 66 | else: 67 | return "SELECT " + sql 68 | else: 69 | return query 70 | 71 | def extract_sql_from_text(text): 72 | sql_pattern = text.replace("\n", " ").replace("ി", " ").split('~~') 73 | return sql_pattern 74 | 75 | 76 | with open(args.file, 'r', encoding='utf-8') as file: 77 | content = file.readlines() 78 | 79 | mid = extract_sql_from_text("\n".join(content)) 80 | extracted_query = [sqlparse.format(extract_sql(q, llm), reindent=False) for q in extract_sql_from_text("\n".join(content))] 81 | if args.output: 82 | with open(args.output, 'w', encoding='utf-8') as file: 83 | file.write('\n'.join(extracted_query)) 84 | else: 85 | with open(args.file.replace(".txt", "_out.txt") , 'w', encoding='utf-8') as file: 86 | file.write('\n'.join(extracted_query)) 87 | -------------------------------------------------------------------------------- /src/sources/run_all.sh: -------------------------------------------------------------------------------- 1 | bash src/sources/run_one_model.sh gpt 2 | bash src/sources/run_cross_voting.sh -------------------------------------------------------------------------------- /src/sources/run_cross_voting.sh: -------------------------------------------------------------------------------- 1 | echo "## Start generating sqls by other llms ..." 2 | 3 | python src/sources/sql_gen/main.py --model sqlcoderapi --kshot 9 --pool 1 --out_file src/sources/raw.txt --select_type Euclidean_mask --dataset ppl_dev_add_sl.json --sl 4 | sleep 1 5 | python src/sources/post_process.py --file src/sources/raw.txt --llm sqlcoder 6 | 7 | python src/sources/sql_gen/main.py --model puyuapi --kshot 9 --pool 1 --out_file src/sources/raw.txt --select_type Euclidean_mask --dataset ppl_dev_add_sl.json --sl 8 | sleep 1 9 | python src/sources/post_process.py --file src/sources/raw.txt --llm puyu 10 | 11 | python src/sources/sql_gen/main.py --model codellamaapi --kshot 9 --pool 1 --out_file src/sources/raw.txt --select_type Euclidean_mask --dataset ppl_dev_add_sl.json --sl 12 | sleep 1 13 | python src/sources/post_process.py --file src/sources/raw.txt --llm codellama -------------------------------------------------------------------------------- /src/sources/run_one_model.sh: -------------------------------------------------------------------------------- 1 | echo "## Data processing ..." 2 | source /root/spider_env/bin/activate 3 | export NLTK_DATA=src/sources/nltk_data 4 | export CORENLP_HOME=/root/stanford-corenlp-full-2018-10-05 5 | export CACHE_DIR=/root 6 | START_TIME=`date +%s` 7 | python src/sources/data_preprocess.py 8 | END_TIME=`date +%s` 9 | EXECUTING_TIME=`expr $END_TIME - $START_TIME` 10 | echo "data preprocess time consume: $EXECUTING_TIME s" 11 | 12 | echo "## Start generation of prediction file: 'predicted_sql.txt' ..." 13 | START_TIME=`date +%s` 14 | python src/sources/sql_gen/main.py --model "$1api" --kshot 9 --pool 1 --out_file src/sources/raw.txt --select_type Euclidean_mask 15 | sleep 1 16 | echo "1st round done!" 17 | END_TIME=`date +%s` 18 | EXECUTING_TIME=`expr $END_TIME - $START_TIME` 19 | echo "1st round time consume: $EXECUTING_TIME s" 20 | 21 | python src/sources/post_process.py --file src/sources/raw.txt --llm $1 22 | mv src/sources/raw_out.txt src/sources/"intermediate_results_only_dont_use_$1.txt" 23 | python src/sources/schemalink.py --output ppl_dev_add_sl.json 24 | 25 | START_TIME=`date +%s` 26 | python src/sources/sql_gen/main.py --model "$1api" --kshot 9 --pool 1 --out_file src/sources/raw.txt --select_type Euclidean_mask --sl --dataset ppl_dev_add_sl.json 27 | sleep 1 28 | echo "2nd round done!" 29 | END_TIME=`date +%s` 30 | EXECUTING_TIME=`expr $END_TIME - $START_TIME` 31 | echo "2nd round time consume: $EXECUTING_TIME s" 32 | 33 | python src/sources/post_process.py --file src/sources/raw.txt --llm $1 34 | mv src/sources/raw_out.txt src/sources/"predicted_sql_$1.txt" 35 | echo "File 'predicted_sql.txt' generated." -------------------------------------------------------------------------------- /src/sources/schemalink.py: -------------------------------------------------------------------------------- 1 | import json, tqdm, sys, os, re 2 | from multiprocessing import Pool 3 | import time 4 | 5 | proj_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) 6 | sys.path.append(proj_dir) 7 | from llms import * 8 | from llms import Puyu as puyu 9 | import logging 10 | import tqdm, logging 11 | from sql_metadata import Parser 12 | import argparse 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--output", default='ppl_dev_add_sl.json') 15 | args = parser.parse_args() 16 | 17 | def extract_tab_from_sql(item, sample): 18 | try: 19 | linked_tables = Parser(item).tables 20 | except: 21 | ddl = sample['simplified_ddl'] 22 | ele_query = " ".join([i for i in re.split(r'[^\w\s*]', item)]).split() 23 | ele_query = [i.lower() for i in ele_query] 24 | split_ddl = ddl.split(";\n") 25 | linked_tables = [] 26 | all_tables = [] 27 | for it in split_ddl: 28 | all_tables.append(it[2:it.index('(')]) 29 | for one_table in all_tables: 30 | one_table = one_table.lower() 31 | if one_table in ele_query: 32 | linked_tables.append(one_table) 33 | sample['linked_tables_gpt'] = linked_tables 34 | 35 | 36 | if __name__ == '__main__': 37 | input_data = json.load(open(os.path.dirname(__file__) + "/ppl_dev.json", 'r')) 38 | file_path = os.path.dirname(__file__) 39 | with open(file_path + '/intermediate_results_only_dont_use_gpt.txt', 'r') as f: 40 | clm = f.readlines() 41 | gpt_tab = [] 42 | for ix, it in enumerate(clm): 43 | extract_tab_from_sql(it, input_data[ix]) 44 | json.dump(input_data, open(os.path.dirname(__file__) + f"/{args.output}", 'w'), indent=4) 45 | -------------------------------------------------------------------------------- /src/sources/sql_gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhshLii/PETSQL/3d73b36ab42525e593c4d70edb74987eac741744/src/sources/sql_gen/__init__.py -------------------------------------------------------------------------------- /src/sources/sql_gen/get_example_modules.py: -------------------------------------------------------------------------------- 1 | import json, os 2 | import jsonlines 3 | import random 4 | import numpy as np 5 | from sentence_transformers import SentenceTransformer 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | from sklearn.metrics.pairwise import euclidean_distances 8 | from sql_gen_utils import jaccard_similarity, mask_question_with_schema_linking, sql2skeleton 9 | 10 | 11 | class BasicExampleSelector(object): 12 | def __init__(self): 13 | 14 | with open("src/code_submit/dataset/ppl_train_other.json") as f: 15 | self.train_json = json.load(f) 16 | print(f"stored {len(self.train_json)} libray") 17 | self.train_questions = [sample['question'] for sample in self.train_json] 18 | 19 | with jsonlines.open('src/code_submit/dataset/train_schema-linking.jsonl', 'r') as jsonl_f: 20 | self.train_schema_jsonl = [obj for obj in jsonl_f] 21 | with jsonlines.open('test_schema-linking.jsonl', 'r') as jsonl_f: 22 | self.test_schema_jsonl = [obj for obj in jsonl_f] 23 | 24 | 25 | def get_examples(self, target, num_example, cross_domain=False): 26 | pass 27 | 28 | 29 | def get_schemas_and_preresult(self,): 30 | self.db_id_to_table_json = dict() 31 | # try: 32 | for table_json in json.load(open("data/spider/test_data/tables.json", "r")): 33 | self.db_id_to_table_json[table_json["db_id"]] = table_json 34 | # except: 35 | # data_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) 36 | # for table_json in json.load(open(data_path + "/data/tables.json", "r")): 37 | # self.db_id_to_table_json[table_json["db_id"]] = table_json 38 | dirs = os.path.dirname(os.path.dirname(__file__)) 39 | with open(dirs + "/raw_out.txt", 'r') as f: 40 | lines = f.readlines() 41 | self.preresult = [line.strip() for line in lines] 42 | 43 | def get_train_pre_skeleton(self): 44 | skeletons = [] 45 | queries= [data["gold_sql"] for data in self.train_json] 46 | schemas = [self.db_id_to_table_json[d["db"]] for d in self.train_json] 47 | for query,schema in zip(queries, schemas): 48 | skeletons.append(sql2skeleton(query, schema)) 49 | 50 | for id in range(min(len(self.train_json), len(skeletons))): 51 | self.train_json[id]["pre_skeleton"] = skeletons[id] 52 | 53 | 54 | def get_target_pre_skeleton(self, target): 55 | schema = self.db_id_to_table_json[target["db"]] 56 | query = self.preresult[target['id']] 57 | # query = target['gold_sql'] 58 | return sql2skeleton(query, schema) 59 | 60 | class RandomExampleSelector(BasicExampleSelector): 61 | def __init__(self, *args, **kwargs): 62 | super().__init__() 63 | random.seed(0) 64 | print(f"set seed for random select") 65 | 66 | def get_examples(self, target, num_example, cross_domain=False): 67 | train_json = self.train_json 68 | indexes = list(range(len(train_json))) 69 | selected_indexes = random.sample(indexes, num_example) 70 | return [train_json[index] for index in selected_indexes] 71 | 72 | 73 | class CosineSimilarExampleSelector(BasicExampleSelector): 74 | def __init__(self, *args, **kwargs): 75 | super().__init__() 76 | 77 | self.SELECT_MODEL = "./sentence_transformers" 78 | 79 | self.bert_model = SentenceTransformer(self.SELECT_MODEL, device="cpu") 80 | self.train_embeddings = self.bert_model.encode(self.train_questions, show_progress_bar=False) 81 | print(f"processed embedding for cosine select") 82 | 83 | def get_examples(self, target, num_example, cross_domain=False): 84 | target_embedding = self.bert_model.encode([target["question"]], show_progress_bar=False) 85 | 86 | similarities = np.squeeze(cosine_similarity(target_embedding, self.train_embeddings)).tolist() 87 | pairs = [(similarity, index) for similarity, index in zip(similarities, range(len(similarities)))] 88 | 89 | train_json = self.train_json 90 | pairs_sorted = sorted(pairs, key=lambda x: x[0], reverse=True) 91 | top_pairs = list() 92 | for s, index in pairs_sorted: 93 | if train_json[index]["question"] == target["question"]: 94 | continue 95 | top_pairs.append((index, s)) 96 | if len(top_pairs) >= num_example: 97 | break 98 | return [train_json[index] for (index, s) in top_pairs] 99 | 100 | class EuclideanDistanceQuestionMaskSelector(BasicExampleSelector): 101 | def __init__(self, *args, **kwargs): 102 | super().__init__() 103 | 104 | self.SELECT_MODEL = "./sentence_transformers" 105 | self.mask_token = "" # the "" is the mask token of all-mpnet-base-v2 106 | self.value_token = "" # the "" is the unknown token of all-mpnet-base-v2 107 | 108 | train_mask_questions = mask_question_with_schema_linking(self.train_schema_jsonl, mask_tag=self.mask_token, value_tag=self.value_token) 109 | self.bert_model = SentenceTransformer(self.SELECT_MODEL, device="cpu") 110 | self.train_embeddings = self.bert_model.encode(train_mask_questions, show_progress_bar=False) 111 | 112 | def get_examples(self, target, num_example, cross_domain=False): 113 | 114 | target_mask_question = mask_question_with_schema_linking([self.test_schema_jsonl[target['id']]], mask_tag=self.mask_token, value_tag=self.value_token) 115 | target_embedding = self.bert_model.encode(target_mask_question, show_progress_bar=False) 116 | 117 | # find the most similar question in train dataset 118 | distances = np.squeeze(euclidean_distances(target_embedding, self.train_embeddings)).tolist() 119 | pairs = [(distance, index) for distance, index in zip(distances, range(len(distances)))] 120 | 121 | train_json = self.train_json 122 | pairs_sorted = sorted(pairs, key=lambda x: x[0]) 123 | top_pairs = list() 124 | for d, index in pairs_sorted: 125 | similar_db_id = train_json[index]["db"] 126 | if cross_domain and similar_db_id != target["db"]: 127 | continue 128 | top_pairs.append((index, d)) 129 | if len(top_pairs) >= num_example: 130 | break 131 | 132 | return [train_json[index] for (index, d) in top_pairs] 133 | 134 | 135 | class EuclideanDistanceQuestionMaskPreSkeletonSimilarThresholdSelector(BasicExampleSelector): 136 | def __init__(self, *args, **kwargs): 137 | super().__init__() 138 | 139 | self.get_schemas_and_preresult() 140 | self.get_train_pre_skeleton() 141 | 142 | self.SELECT_MODEL = "./sentence_transformers" 143 | self.mask_token = "" # the "" is the mask token of all-mpnet-base-v2 144 | self.value_token = "" # the "" is the unknown token of all-mpnet-base-v2 145 | self.threshold = 0.85 146 | 147 | train_mask_questions = mask_question_with_schema_linking(self.train_schema_jsonl, mask_tag=self.mask_token, value_tag=self.value_token) 148 | self.bert_model = SentenceTransformer(self.SELECT_MODEL, device="cpu") 149 | self.train_embeddings = self.bert_model.encode(train_mask_questions, show_progress_bar=False) 150 | 151 | 152 | 153 | def get_examples(self, target, num_example, cross_domain=False): 154 | scope_factor = 100 155 | target_mask_question = mask_question_with_schema_linking([self.test_schema_jsonl[target['id']]], mask_tag=self.mask_token, value_tag=self.value_token) 156 | target_embedding = self.bert_model.encode(target_mask_question, show_progress_bar=False) 157 | 158 | target["pre_skeleton"] = self.get_target_pre_skeleton(target) 159 | # find the most similar question in train dataset 160 | distances = np.squeeze(euclidean_distances(target_embedding, self.train_embeddings)).tolist() 161 | pairs = [(distance, index) for distance, index in zip(distances, range(len(distances)))] 162 | 163 | train_json = self.train_json 164 | pairs_sorted = sorted(pairs, key=lambda x: x[0]) 165 | top_pairs = list() 166 | for d, index in pairs_sorted: 167 | similar_db_id = train_json[index]["db"] 168 | if cross_domain and similar_db_id != target["db"]: 169 | continue 170 | # Skeleton similarity 171 | if jaccard_similarity(train_json[index]["pre_skeleton"], target["pre_skeleton"]) < self.threshold: 172 | continue 173 | top_pairs.append((index, d)) 174 | if len(top_pairs) >= num_example*scope_factor: 175 | break 176 | 177 | if len(top_pairs) < num_example*scope_factor: 178 | for d, index in pairs_sorted: 179 | similar_db_id = train_json[index]["db"] 180 | if cross_domain and similar_db_id != target["db"]: 181 | continue 182 | # Skeleton similarity 183 | if jaccard_similarity(train_json[index]["pre_skeleton"], target["pre_skeleton"]) >= self.threshold: 184 | continue 185 | top_pairs.append((index, d)) 186 | if len(top_pairs) >= num_example*scope_factor: 187 | break 188 | return [train_json[index] for (index, d) in top_pairs[:num_example]] 189 | 190 | 191 | def get_examples_ins(select_type = "Random"): 192 | if select_type == "CosineSimilar": 193 | examples_libary = CosineSimilarExampleSelector() 194 | elif select_type == "Random": 195 | examples_libary = RandomExampleSelector() 196 | elif select_type == "Euclidean_mask": 197 | examples_libary = EuclideanDistanceQuestionMaskSelector() 198 | elif select_type == "Euclidean_mask_select": 199 | examples_libary = EuclideanDistanceQuestionMaskPreSkeletonSimilarThresholdSelector() 200 | return examples_libary 201 | 202 | 203 | if __name__=='__main__': 204 | with open("data/ppl_test_input_sql_gen.json") as f: 205 | input_data = json.load(f) 206 | target = input_data[0] 207 | examples_libary = get_examples_ins(target, "CosineSimilar", 5) 208 | examples_libary.get_examples(target, 5) 209 | import pdb; pdb.set_trace() 210 | -------------------------------------------------------------------------------- /src/sources/sql_gen/main.py: -------------------------------------------------------------------------------- 1 | import json, re 2 | import os, sys 3 | 4 | proj_dir = os.path.dirname(os.path.dirname(__file__)) 5 | # print(proj_dir) 6 | sys.path.append(proj_dir) 7 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 8 | from llms import codellama, Puyu, Llama2, SQLCoder, vicuna, GPT 9 | from tqdm import tqdm 10 | import argparse 11 | from get_example_modules import get_examples_ins 12 | from data_preprocess import gen_ppl_from_json 13 | 14 | import logging, sqlite3 15 | 16 | 17 | def get_example_prefix(): 18 | return "### Some example pairs of question and corresponding SQL query are provided based on similar problems:\n\n" 19 | 20 | 21 | def format_example(example: dict): 22 | template_qa = "### {}\n{}" 23 | return template_qa.format(example['question'], example['gold_sql']) 24 | 25 | 26 | def formatting_prompt(sample): 27 | question = sample['question'] 28 | ddls = sample['simplified_ddl'] 29 | db = sample['db'] 30 | # 动态加载前三行数据 31 | simplified_ddl_data = [] 32 | # 读取数据库 33 | mydb = sqlite3.connect( 34 | fr"data/spider/test_database/{db}/{db}.sqlite") # 链接数据库 35 | cur = mydb.cursor() 36 | # 表 37 | cur.execute("SELECT name FROM sqlite_master WHERE type='table'") 38 | Tables = cur.fetchall() # Tables 为元组列表 39 | for table in Tables: 40 | # 列 41 | cur.execute(f"select * from `{table[0]}`") 42 | col_name_list = [tuple[0] for tuple in cur.description] 43 | # print(col_name_list) 44 | db_data_all = [] 45 | # 获取前三行数据 46 | for i in range(3): 47 | db_data_all.append(cur.fetchone()) 48 | # ddls_data 49 | test = "" 50 | for idx, column_data in enumerate(col_name_list): 51 | # print(list(db_data_all[2])[idx]) 52 | try: 53 | test += f"{column_data}[{list(db_data_all[0])[idx]},{list(db_data_all[1])[idx]},{list(db_data_all[2])[idx]}]," 54 | except: 55 | test = test 56 | simplified_ddl_data.append(f"{table[0]}({test[:-1]})") 57 | ddls_data = "# " + ";\n# ".join(simplified_ddl_data) + ";\n" 58 | foreign_key = "" 59 | for foreign_key_data in sample["foreign_key"][0].split("\n"): 60 | foreign_key += f'# {foreign_key_data};\n' 61 | foreign_key = foreign_key[:-2] 62 | # evidence = "".join(sample['gt_evidence']) 63 | prompt = f'''### Answer the question by sqlite SQL query only and with no explanation. You must minimize SQL execution time while ensuring correctness.\n### Sqlite SQL tables, with their properties:\n#\n{ddls}#\n### Here are some data information about database references.\n#\n{ddls_data}#\n### Foreign key information of Sqlite SQL tables, used for table joins: \n#\n{foreign_key}\n#\n### Question: {question}\n### SQL: ''' 64 | 65 | return prompt 66 | 67 | 68 | def formatting_prompt_sl(sample): 69 | linked_tables = [i.lower() for i in sample['linked_tables_gpt']] 70 | tbs = [] 71 | for tb in sample['simplified_ddl'].split("\n"): 72 | t = tb.split("(")[0].strip("#").strip() 73 | if t.lower() in linked_tables: 74 | tbs.append(t) 75 | sc_tables = tbs 76 | ddl = sample['simplified_ddl'][:-2] 77 | split_ddl = ddl.split(";\n") 78 | fk_all = sample["foreign_key"][0].split("\n") 79 | ddl_sc = [] 80 | fk_sc = [] 81 | db = sample['db'] 82 | # 动态加载前三行数据 83 | simplified_ddl_data = [] 84 | # 读取数据库 85 | mydb = sqlite3.connect( 86 | fr"data/spider/test_database/{db}/{db}.sqlite") # 链接数据库 87 | cur = mydb.cursor() 88 | # 外键 89 | for fk_test in fk_all: 90 | num = 0 91 | for tab in sc_tables: 92 | if str(" " + tab + "(").lower() in " " + str(fk_test).lower(): 93 | num += 1 94 | if num == 2: 95 | fk_sc.append(fk_test) 96 | fk_sc = list(set(fk_sc)) 97 | for table in sc_tables: 98 | # ddl 99 | for ddl_test in split_ddl: 100 | if str(" " + table + "(").lower() in str(ddl_test).lower(): 101 | ddl_sc.append(ddl_test) 102 | # 前三行数据 103 | try: 104 | cur.execute(f"select * from `{table}`") 105 | col_name_list = [tuple[0] for tuple in cur.description] 106 | db_data_all = [] 107 | # 获取前三行数据 108 | for i in range(3): 109 | db_data_all.append(cur.fetchone()) 110 | # ddls_data 111 | test = "" 112 | for idx, column_data in enumerate(col_name_list): 113 | try: 114 | test += f"{column_data}[{list(db_data_all[0])[idx]},{list(db_data_all[1])[idx]},{list(db_data_all[2])[idx]}]," 115 | except: 116 | test = test 117 | simplified_ddl_data.append(f"{table}({test[:-1]})") 118 | except: 119 | print() 120 | # res_ddl = [] 121 | # tables = [] 122 | # for test in ddl_sc: 123 | # tables.append(test.split("(")[0].replace("# ","")) 124 | # for one_ddl in split_ddl: 125 | # hit = 0 126 | # for one_table in linked_tables: 127 | # if f" {one_table.lower()}(" in one_ddl.lower(): 128 | # hit = 1 129 | # if hit: 130 | # res_ddl.append(one_ddl) 131 | ddl = ";\n".join(ddl_sc) + '.' 132 | ddls_data = "# " + ";\n# ".join(simplified_ddl_data) + ";\n" 133 | foreign_key = "" 134 | if len(fk_sc) > 0: 135 | for foreign_key_data in fk_sc: 136 | foreign_key += f'# {foreign_key_data};\n' 137 | foreign_key = "\n### Foreign key information of Sqlite SQL tables, used for table joins: \n#\n" + foreign_key[:-2] 138 | else: 139 | foreign_key = "" 140 | 141 | # prompt=f'''### Answer the question by sqlite SQL query only and with no explanation\n### Sqlite SQL tables, with their properties:\n{ddl}\n### Question: {sample['question']}\n### SQL: ''' 142 | if foreign_key: 143 | prompt = f'''### Answer the question by sqlite SQL query only and with no explanation. You must minimize SQL execution time while ensuring correctness.\n### Sqlite SQL tables, with their properties:\n#\n{ddl}\n#\n### Here are some data information about database references.\n#\n{ddls_data}#{foreign_key}\n#\n### Question: {sample['question']}\n### SQL: ''' 144 | else: 145 | prompt = f'''### Answer the question by sqlite SQL query only and with no explanation. You must minimize SQL execution time while ensuring correctness.\n### Sqlite SQL tables, with their properties:\n#\n{ddl}\n#\n### Here are some data information about database references.\n#\n{ddls_data}#\n### Question: {sample['question']}\n### SQL: ''' 146 | return prompt 147 | 148 | 149 | def run_sql_generation(model, 150 | input_data, 151 | out_file, 152 | k_shot=0, 153 | select_type="Euclidean_mask", 154 | pool_num=1, 155 | sl=False): 156 | 157 | domain = False 158 | 159 | # load_libray 160 | if k_shot != 0: 161 | examples_libary = get_examples_ins(select_type) 162 | print(f"select type: {select_type}, k shot: {k_shot}") 163 | # read file 164 | 165 | if model == "codellamaapi": 166 | llmapi = codellama() 167 | elif model == "puyuapi": 168 | llmapi = Puyu() 169 | elif model == "llamaapi": 170 | llmapi = Llama2() 171 | elif model == "sqlcoderapi": 172 | llmapi = SQLCoder() 173 | elif model == "vicunaapi": 174 | llmapi = vicuna() 175 | elif model == "gptapi": 176 | llmapi = GPT() 177 | else: 178 | raise Exception("no llm selected!") 179 | 180 | all_prompts = [] 181 | # get all prompts for parallel 182 | print('Generating ...') 183 | for i, sample in enumerate(input_data): 184 | prompt_target = formatting_prompt(sample) if not sl else formatting_prompt_sl(sample) 185 | 186 | if k_shot != 0: 187 | examples = examples_libary.get_examples(sample, 188 | k_shot, 189 | cross_domain=domain) 190 | prompt_example = [format_example(exm) for exm in examples] 191 | prompt = get_example_prefix() + "\n\n".join(prompt_example + 192 | [prompt_target]) 193 | else: 194 | prompt = prompt_target 195 | logger.info(prompt) 196 | all_prompts.append(prompt) 197 | 198 | # for api 199 | if model == "gptapi": 200 | result = [] 201 | for it in all_prompts: 202 | result.append(llmapi(it)) 203 | else: 204 | from multiprocessing import Pool 205 | pool = Pool(pool_num) 206 | result = list(pool.map(llmapi, all_prompts)) 207 | 208 | result = '~~'.join(result[i].replace("\n", " ") 209 | for i in range(len(result))) 210 | with open(log_file_path, 'w', encoding='utf-8') as file: 211 | print(log_file_path) 212 | file.write("\n".join(all_prompts) + '\n') 213 | with open(out_file, 'w', encoding='utf-8') as file: 214 | file.write(str(result) + '\n') 215 | 216 | 217 | if __name__ == '__main__': 218 | 219 | # 创建 ArgumentParser 对象 220 | parser = argparse.ArgumentParser() 221 | 222 | # 添加命令行选项 223 | parser.add_argument("--model", type=str, default="puyuapi") 224 | parser.add_argument("--dataset", type=str, default="ppl_dev.json") 225 | parser.add_argument("--out_file", type=str, default="raw.txt") 226 | parser.add_argument("--kshot", type=int, default=3) 227 | parser.add_argument("--pool", type=int, default=1) 228 | parser.add_argument("--sl", action="store_true") 229 | parser.add_argument("--select_type", type=str, default="Euclidean_mask") 230 | # 解析命令行参数 231 | args = parser.parse_args() 232 | log_file_path = os.path.dirname( 233 | __file__) + f"/logs/{args.model}_{args.select_type}.log" 234 | logging.basicConfig(filename=log_file_path, 235 | level=logging.INFO, 236 | filemode='w') 237 | logger = logging.getLogger() 238 | if args.sl == False: 239 | input_data = gen_ppl_from_json(args.dataset, args.model[:-3]) 240 | else: 241 | input_data = json.load(open(args.dataset, 'r')) 242 | print("schema linking: ", args.sl) 243 | print(args.dataset) 244 | run_sql_generation(args.model, input_data, args.out_file, args.kshot, 245 | args.select_type, args.pool, args.sl) 246 | -------------------------------------------------------------------------------- /src/sources/utils/data_builder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from utils.utils import get_tables, sql2skeleton 5 | from utils.linking_utils.application import get_question_pattern_with_schema_linking 6 | 7 | 8 | class BasicDataset(object): 9 | def __init__(self, path_data, pre_test_result=None): 10 | self.path_data = os.path.join(path_data, self.name) 11 | self.path_db = os.path.join(self.path_data, "database") 12 | self.test_json = os.path.join(self.path_data, self.test_json) 13 | self.test_gold = os.path.join(self.path_data, self.test_gold) 14 | self.train_json = os.path.join(self.path_data, self.train_json) 15 | self.train_gold = os.path.join(self.path_data, self.train_gold) 16 | self.table_json = os.path.join(self.path_data, self.table_json) 17 | self.path_test_schema_linking = os.path.join(self.path_data, "enc/test_schema-linking.jsonl") 18 | self.path_train_schema_linking = os.path.join(self.path_data, "enc/train_schema-linking.jsonl") 19 | if self.mini_test_index_json: 20 | self.mini_test_index_json = os.path.join(self.path_data, self.mini_test_index_json) 21 | else: 22 | self.mini_test_index_json = None 23 | 24 | self.pre_test_result = pre_test_result 25 | 26 | # lazy load for tables 27 | self.databases = None 28 | 29 | # test a mini set 30 | def set_mini_test(self, mini_file): 31 | self.mini_test_index_json = os.path.join(self.path_data, mini_file) 32 | 33 | def get_databases(self): 34 | if self.databases is None: 35 | self.databases = dict() 36 | # for db_id in os.listdir(self.path_db): 37 | # self.databases[db_id] = self.get_tables(db_id) 38 | with open(self.table_json) as f: 39 | tables = json.load(f) 40 | for tj in tables: 41 | db_id = tj["db_id"] 42 | self.databases[db_id] = self.get_tables(db_id) 43 | return self.databases 44 | 45 | def get_tables(self, db_id): 46 | if db_id in self.databases: 47 | return self.databases[db_id] 48 | else: 49 | path_db = os.path.join(self.path_db, db_id, db_id + ".sqlite") 50 | tables = get_tables(path_db) 51 | self.databases[db_id] = tables 52 | return tables 53 | 54 | def get_path_sql(self, db_id): 55 | path_sql = os.path.join(self.path_db, db_id, "schema.sql") 56 | return path_sql 57 | 58 | def get_table_json(self): 59 | return json.load(open(self.table_json, "r")) 60 | 61 | def get_path_db(self, db_id): 62 | return os.path.join(self.path_db, db_id, f"{db_id}.sqlite") 63 | 64 | def get_train_questions(self): 65 | questions = json.load(open(self.train_json, "r")) 66 | return [_["question"] for _ in questions] 67 | 68 | def get_mini_index(self): 69 | if self.mini_test_index_json: 70 | return json.load(open(self.mini_test_index_json, "r")) 71 | else: 72 | return None 73 | 74 | def get_test_questions(self, mini_set=False): 75 | questions = json.load(open(self.test_json, "r")) 76 | if mini_set and self.mini_test_index_json: 77 | mini_test_index = self.get_mini_index() 78 | questions = [questions[i] for i in mini_test_index] 79 | return [_["question"] for _ in questions] 80 | 81 | # get query skeletons 82 | def get_pre_skeleton(self, queries=None, schemas=None, mini_set=False): 83 | if queries: 84 | skeletons = [] 85 | for query,schema in zip(queries, schemas): 86 | skeletons.append(sql2skeleton(query, schema)) 87 | if mini_set and self.mini_test_index_json: 88 | mini_index = self.get_mini_index() 89 | skeletons = [skeletons[i] for i in mini_index] 90 | return skeletons 91 | else: 92 | return False 93 | 94 | # get all train information 95 | def get_train_json(self): 96 | datas = json.load(open(self.train_json, "r")) 97 | linking_infos = self.get_train_schema_linking() 98 | db_id_to_table_json = dict() 99 | for table_json in self.get_table_json(): 100 | db_id_to_table_json[table_json["db_id"]] = table_json 101 | schemas = [db_id_to_table_json[d["db_id"]] for d in datas] 102 | queries = [data["query"] for data in datas] 103 | pre_queries = self.get_pre_skeleton(queries, schemas) 104 | return self.data_pre_process(datas, linking_infos, pre_queries) 105 | 106 | # get all test information 107 | def get_test_json(self, mini_set=False): 108 | tests = json.load(open(self.test_json, "r")) 109 | if mini_set and self.mini_test_index_json: 110 | mini_test_index = self.get_mini_index() 111 | tests = [tests[i] for i in mini_test_index] 112 | linking_infos = self.get_test_schema_linking(mini_set) 113 | db_id_to_table_json = dict() 114 | for table_json in self.get_table_json(): 115 | db_id_to_table_json[table_json["db_id"]] = table_json 116 | schemas = [db_id_to_table_json[d["db_id"]] for d in tests] 117 | if self.pre_test_result: 118 | with open(self.pre_test_result, 'r') as f: 119 | lines = f.readlines() 120 | queries = [line.strip() for line in lines] 121 | pre_queries = self.get_pre_skeleton(queries, schemas, mini_set) 122 | else: 123 | pre_queries = None 124 | return self.data_pre_process(tests, linking_infos, pre_queries) 125 | 126 | def get_test_schema_linking(self, mini_set=False): 127 | if not os.path.exists(self.path_test_schema_linking): 128 | return None 129 | linking_infos = [] 130 | with open(self.path_test_schema_linking, 'r') as f: 131 | for line in f.readlines(): 132 | if line.strip(): 133 | linking_infos.append(json.loads(line)) 134 | if mini_set and self.mini_test_index_json: 135 | mini_test_index = self.get_mini_index() 136 | linking_infos = [linking_infos[i] for i in mini_test_index] 137 | return linking_infos 138 | 139 | def get_train_schema_linking(self): 140 | if not os.path.exists(self.path_train_schema_linking): 141 | return None 142 | linking_infos = [] 143 | with open(self.path_train_schema_linking, 'r') as f: 144 | for line in f.readlines(): 145 | if line.strip(): 146 | linking_infos.append(json.loads(line)) 147 | return linking_infos 148 | 149 | def get_all_json(self): 150 | return self.get_train_json() + self.get_test_json() 151 | 152 | def get_train_answers(self): 153 | with open(self.train_gold, "r") as file: 154 | answers = file.readlines() 155 | return answers 156 | 157 | def get_test_answers(self, mini_set=False): 158 | with open(self.test_gold, "r") as file: 159 | answers = file.readlines() 160 | if mini_set and self.mini_test_index_json: 161 | mini_test_index = self.get_mini_index() 162 | answers = [answers[i] for i in mini_test_index] 163 | return answers 164 | 165 | def get_train_duplicated_index(self): 166 | train_data = self.get_train_json() 167 | example_dict = {} 168 | duplicated_index = [] 169 | for i in range(len(train_data)): 170 | db_id = train_data[i]["db_id"] 171 | question = train_data[i]["question"] 172 | if (db_id, question) in example_dict.keys(): 173 | duplicated_index.append(i) 174 | else: 175 | example_dict[(db_id, question)] = True 176 | return duplicated_index 177 | 178 | # get skeletons and schema_linking info 179 | def data_pre_process(self, datas, linking_infos=None, pre_queries=None): 180 | db_id_to_table_json = dict() 181 | for table_json in self.get_table_json(): 182 | db_id_to_table_json[table_json["db_id"]] = table_json 183 | for data in datas: 184 | db_id = data["db_id"] 185 | data["tables"] = self.get_tables(db_id) 186 | if data["query"].strip()[:6] != 'SELECT': 187 | data["query_skeleton"] = data["query"] 188 | else: 189 | data["query_skeleton"] = sql2skeleton(data["query"], db_id_to_table_json[db_id]) 190 | data["path_db"] = self.get_path_db(db_id) 191 | if linking_infos: 192 | db_id_to_table_json = dict() 193 | for table_json in self.get_table_json(): 194 | db_id_to_table_json[table_json["db_id"]] = table_json 195 | for id in range(min(len(datas), len(linking_infos))): 196 | datas[id]["sc_link"] = linking_infos[id]["sc_link"] 197 | datas[id]["cv_link"] = linking_infos[id]["cv_link"] 198 | datas[id]["question_for_copying"] = linking_infos[id]["question_for_copying"] 199 | datas[id]["column_to_table"] = linking_infos[id]["column_to_table"] 200 | db_id = datas[id]["db_id"] 201 | datas[id]["table_names_original"] = db_id_to_table_json[db_id]["table_names_original"] 202 | question_patterns = get_question_pattern_with_schema_linking(datas) 203 | for id in range(len(datas)): 204 | datas[id]["question_pattern"] = question_patterns[id] 205 | if pre_queries: 206 | for id in range(min(len(datas), len(pre_queries))): 207 | datas[id]["pre_skeleton"] = pre_queries[id] 208 | return datas 209 | 210 | 211 | class SpiderDataset(BasicDataset): 212 | name = "spider" 213 | test_json = "dev.json" 214 | test_gold = "dev_gold.sql" 215 | train_json = "train_spider_and_others.json" 216 | train_gold = "train_gold.sql" 217 | table_json = "tables.json" 218 | mini_test_index_json = "mini_dev_index.json" 219 | 220 | 221 | class RealisticDataset(BasicDataset): 222 | # only used for data path, shared with spider 223 | name = "spider_realistic" 224 | test_json = "spider-realistic.json" 225 | test_gold = "spider-realistic_gold.sql" 226 | train_json = "train_spider_and_others.json" 227 | train_gold = "train_gold.sql" 228 | table_json = "tables.json" 229 | mini_test_index_json = None 230 | 231 | class BirdDataset(BasicDataset): 232 | name = "bird" 233 | test_json = "dev.json" 234 | test_gold = "dev.sql" 235 | train_json = "train.json" 236 | train_gold = "train_gold.sql" 237 | table_json = "tables.json" 238 | mini_test_index_json = None 239 | 240 | 241 | def load_data(data_type, path_data, pre_test_result=None): 242 | if data_type.lower() == "spider": 243 | return SpiderDataset(path_data, pre_test_result) 244 | elif data_type.lower() == "realistic": 245 | return RealisticDataset(path_data, pre_test_result) 246 | elif data_type.lower() == "bird": 247 | return BirdDataset(path_data, pre_test_result) 248 | else: 249 | raise RuntimeError() 250 | -------------------------------------------------------------------------------- /src/sources/utils/datasets/spider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sqlite3 4 | from copy import copy 5 | from pathlib import Path 6 | from typing import List, Dict 7 | 8 | import attr 9 | import torch 10 | import networkx as nx 11 | from tqdm import tqdm 12 | 13 | 14 | def build_foreign_key_map(entry): 15 | cols_orig = entry["column_names_original"] 16 | tables_orig = entry["table_names_original"] 17 | 18 | # rebuild cols corresponding to idmap in Schema 19 | cols = [] 20 | for col_orig in cols_orig: 21 | if col_orig[0] >= 0: 22 | t = tables_orig[col_orig[0]] 23 | c = col_orig[1] 24 | cols.append("__" + t.lower() + "." + c.lower() + "__") 25 | else: 26 | cols.append("__all__") 27 | 28 | def keyset_in_list(k1, k2, k_list): 29 | for k_set in k_list: 30 | if k1 in k_set or k2 in k_set: 31 | return k_set 32 | new_k_set = set() 33 | k_list.append(new_k_set) 34 | return new_k_set 35 | 36 | foreign_key_list = [] 37 | foreign_keys = entry["foreign_keys"] 38 | for fkey in foreign_keys: 39 | key1, key2 = fkey 40 | key_set = keyset_in_list(key1, key2, foreign_key_list) 41 | key_set.add(key1) 42 | key_set.add(key2) 43 | 44 | foreign_key_map = {} 45 | for key_set in foreign_key_list: 46 | sorted_list = sorted(list(key_set)) 47 | midx = sorted_list[0] 48 | for idx in sorted_list: 49 | foreign_key_map[cols[idx]] = cols[midx] 50 | 51 | return foreign_key_map 52 | 53 | 54 | 55 | @attr.s 56 | class SpiderItem: 57 | text = attr.ib() 58 | code = attr.ib() 59 | schema = attr.ib() 60 | orig = attr.ib() 61 | orig_schema = attr.ib() 62 | 63 | 64 | @attr.s 65 | class Column: 66 | id = attr.ib() 67 | table = attr.ib() 68 | name = attr.ib() 69 | unsplit_name = attr.ib() 70 | orig_name = attr.ib() 71 | type = attr.ib() 72 | foreign_key_for = attr.ib(default=None) 73 | 74 | 75 | @attr.s 76 | class Table: 77 | id = attr.ib() 78 | name = attr.ib() 79 | unsplit_name = attr.ib() 80 | orig_name = attr.ib() 81 | columns = attr.ib(factory=list) 82 | primary_keys = attr.ib(factory=list) 83 | 84 | 85 | @attr.s 86 | class Schema: 87 | db_id = attr.ib() 88 | tables = attr.ib() 89 | columns = attr.ib() 90 | foreign_key_graph = attr.ib() 91 | orig = attr.ib() 92 | connection = attr.ib(default=None) 93 | 94 | 95 | def postprocess_original_name(s: str): 96 | return re.sub(r'([A-Z]+)', r' \1', s).replace('_', ' ').lower().strip() 97 | 98 | 99 | def load_tables(paths): 100 | schemas = {} 101 | eval_foreign_key_maps = {} 102 | 103 | for path in paths: 104 | schema_dicts = json.load(open(path)) 105 | for schema_dict in schema_dicts: 106 | tables = tuple( 107 | Table( 108 | id=i, 109 | name=name.split(), 110 | unsplit_name=name, 111 | orig_name=orig_name, 112 | ) 113 | for i, (name, orig_name) in enumerate(zip( 114 | schema_dict['table_names'], schema_dict['table_names_original'])) 115 | ) 116 | columns = tuple( 117 | Column( 118 | id=i, 119 | table=tables[table_id] if table_id >= 0 else None, 120 | name=col_name.split(), 121 | unsplit_name=col_name, 122 | orig_name=orig_col_name, 123 | type=col_type, 124 | ) 125 | for i, ((table_id, col_name), (_, orig_col_name), col_type) in enumerate(zip( 126 | schema_dict['column_names'], 127 | schema_dict['column_names_original'], 128 | schema_dict['column_types'])) 129 | ) 130 | 131 | # Link columns to tables 132 | for column in columns: 133 | if column.table: 134 | column.table.columns.append(column) 135 | 136 | for column_id in schema_dict['primary_keys']: 137 | # Register primary keys 138 | if isinstance(column_id, list): 139 | for each_id in column_id: 140 | column = columns[each_id] 141 | column.table.primary_keys.append(column) 142 | else: 143 | column = columns[column_id] 144 | column.table.primary_keys.append(column) 145 | 146 | foreign_key_graph = nx.DiGraph() 147 | for source_column_id, dest_column_id in schema_dict['foreign_keys']: 148 | # Register foreign keys 149 | source_column = columns[source_column_id] 150 | dest_column = columns[dest_column_id] 151 | source_column.foreign_key_for = dest_column 152 | foreign_key_graph.add_edge( 153 | source_column.table.id, 154 | dest_column.table.id, 155 | columns=(source_column_id, dest_column_id)) 156 | foreign_key_graph.add_edge( 157 | dest_column.table.id, 158 | source_column.table.id, 159 | columns=(dest_column_id, source_column_id)) 160 | 161 | db_id = schema_dict['db_id'] 162 | assert db_id not in schemas 163 | schemas[db_id] = Schema(db_id, tables, columns, foreign_key_graph, schema_dict) 164 | # eval_foreign_key_maps[db_id] = build_foreign_key_map(schema_dict) 165 | 166 | return schemas, eval_foreign_key_maps 167 | 168 | -------------------------------------------------------------------------------- /src/sources/utils/enums.py: -------------------------------------------------------------------------------- 1 | class REPR_TYPE: 2 | CODE_REPRESENTATION = "SQL" 3 | TEXT_REPRESENTATION = "TEXT" 4 | OPENAI_DEMOSTRATION = "NUMBERSIGN" 5 | BASIC = "BASELINE" 6 | ALPACA_SFT = "INSTRUCTION" 7 | OPENAI_DEMOSTRATION_WFK = "NUMBERSIGNWFK" 8 | BASIC_WOFK = "BASELINEWOFK" 9 | TEXT_REPRESENTATION_WFK = "TEXTWFK" 10 | ALPACA_SFT_WFK = "INSTRUCTIONWFK" 11 | OPENAI_DEMOSTRATION_WORULE = "NUMBERSIGNWORULE" 12 | CODE_REPRESENTATION_WRULE = "SQLWRULE" 13 | ALPACA_SFT_WRULE = "INSTRUCTIONWRULE" 14 | TEXT_REPRESENTATION_WRULE = "TEXTWRULE" 15 | CODE_REPRESENTATION_COT = "SQLCOT" 16 | TEXT_REPRESENTATION_COT = "TEXTCOT" 17 | OPENAI_DEMOSTRATION_COT = "NUMBERSIGNCOT" 18 | ALPACA_SFT_COT = "INSTRUCTIONCOT" 19 | CBR = "CBR" 20 | 21 | 22 | class EXAMPLE_TYPE: 23 | ONLY_SQL = "ONLYSQL" 24 | QA = "QA" 25 | COMPLETE = "COMPLETE" 26 | QAWRULE = "QAWRULE" 27 | OPENAI_DEMOSTRATION_QA = "NUMBERSIGNQA" 28 | BASIC_QA = "BASELINEQA" 29 | 30 | 31 | class SELECTOR_TYPE: 32 | COS_SIMILAR = "COSSIMILAR" 33 | RANDOM = "RANDOM" 34 | EUC_DISTANCE = "EUCDISTANCE" 35 | EUC_DISTANCE_THRESHOLD = "EUCDISTANCETHRESHOLD" 36 | EUC_DISTANCE_SKELETON_SIMILARITY_THRESHOLD = "EUCDISSKLSIMTHR" 37 | EUC_DISTANCE_QUESTION_MASK = "EUCDISQUESTIONMASK" 38 | EUC_DISTANCE_PRE_SKELETON_SIMILARITY_THRESHOLD = "EUCDISPRESKLSIMTHR" 39 | EUC_DISTANCE_PRE_SKELETON_SIMILARITY_PLUS = "EUCDISPRESKLSIMPLUS" 40 | EUC_DISTANCE_MASK_PRE_SKELETON_SIMILARITY_THRESHOLD = "EUCDISMASKPRESKLSIMTHR" 41 | EUC_DISTANCE_MASK_PRE_SKELETON_SIMILARITY_THRESHOLD_SHIFT = "EUCDISMASKPRESKLSIMTHRSHIFT" 42 | # TODO: from the same database 43 | 44 | 45 | class LLM: 46 | # openai LLMs 47 | TEXT_DAVINCI_003 = "text-davinci-003" 48 | CODE_DAVINCI_002 = "code-davinci-002" 49 | GPT_35_TURBO = "gpt-3.5-turbo" 50 | GPT_35_TURBO_0613 = "gpt-3.5-turbo-0613" 51 | GPT_35_TURBO_16K = "gpt-3.5-turbo-16k" 52 | GPT_35_TURBO_0301 = "gpt-3.5-turbo-0301" 53 | GPT_4 = "gpt-4" 54 | 55 | # LLMs that use openai completion api 56 | TASK_COMPLETIONS = [ 57 | TEXT_DAVINCI_003, 58 | CODE_DAVINCI_002 59 | ] 60 | 61 | # LLMs that use openai chat api 62 | TASK_CHAT = [ 63 | GPT_35_TURBO, 64 | GPT_35_TURBO_0613, 65 | GPT_35_TURBO_16K, 66 | GPT_35_TURBO_0301, 67 | GPT_4 68 | ] 69 | 70 | # LLMs that can run in batch 71 | BATCH_FORWARD = [ 72 | TEXT_DAVINCI_003, 73 | CODE_DAVINCI_002 74 | ] 75 | 76 | costs_per_thousand = { 77 | TEXT_DAVINCI_003: 0.0200, 78 | CODE_DAVINCI_002: 0.0200, 79 | GPT_35_TURBO: 0.0020, 80 | GPT_35_TURBO_0613: 0.0020, 81 | GPT_35_TURBO_16K: 0.003, 82 | GPT_35_TURBO_0301: 0.0020, 83 | GPT_4: 0.03 84 | } 85 | 86 | # local LLMs 87 | LLAMA_7B = "llama-7b" 88 | ALPACA_7B = "alpaca-7b" 89 | # TONG_YI_QIAN_WEN = "qwen-v1" 90 | -------------------------------------------------------------------------------- /src/sources/utils/linking_process.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import itertools 3 | import json 4 | import os 5 | 6 | import attr 7 | import numpy as np 8 | import torch 9 | 10 | from utils.linking_utils import abstract_preproc, corenlp, serialization 11 | from utils.linking_utils.spider_match_utils import ( 12 | compute_schema_linking, 13 | compute_cell_value_linking, 14 | match_shift 15 | ) 16 | 17 | @attr.s 18 | class PreprocessedSchema: 19 | column_names = attr.ib(factory=list) 20 | table_names = attr.ib(factory=list) 21 | table_bounds = attr.ib(factory=list) 22 | column_to_table = attr.ib(factory=dict) 23 | table_to_columns = attr.ib(factory=dict) 24 | foreign_keys = attr.ib(factory=dict) 25 | foreign_keys_tables = attr.ib(factory=lambda: collections.defaultdict(set)) 26 | primary_keys = attr.ib(factory=list) 27 | 28 | # only for bert version 29 | normalized_column_names = attr.ib(factory=list) 30 | normalized_table_names = attr.ib(factory=list) 31 | 32 | 33 | def preprocess_schema_uncached(schema, 34 | tokenize_func, 35 | include_table_name_in_column, 36 | fix_issue_16_primary_keys, 37 | bert=False): 38 | """If it's bert, we also cache the normalized version of 39 | question/column/table for schema linking""" 40 | r = PreprocessedSchema() 41 | 42 | if bert: assert not include_table_name_in_column 43 | 44 | last_table_id = None 45 | for i, column in enumerate(schema.columns): 46 | col_toks = tokenize_func( 47 | column.name, column.unsplit_name) 48 | 49 | # assert column.type in ["text", "number", "time", "boolean", "others"] 50 | type_tok = f'' 51 | if bert: 52 | # for bert, we take the representation of the first word 53 | column_name = col_toks + [type_tok] 54 | r.normalized_column_names.append(Bertokens(col_toks)) 55 | else: 56 | column_name = [type_tok] + col_toks 57 | 58 | if include_table_name_in_column: 59 | if column.table is None: 60 | table_name = [''] 61 | else: 62 | table_name = tokenize_func( 63 | column.table.name, column.table.unsplit_name) 64 | column_name += [''] + table_name 65 | r.column_names.append(column_name) 66 | 67 | table_id = None if column.table is None else column.table.id 68 | r.column_to_table[str(i)] = table_id 69 | if table_id is not None: 70 | columns = r.table_to_columns.setdefault(str(table_id), []) 71 | columns.append(i) 72 | if last_table_id != table_id: 73 | r.table_bounds.append(i) 74 | last_table_id = table_id 75 | 76 | if column.foreign_key_for is not None: 77 | r.foreign_keys[str(column.id)] = column.foreign_key_for.id 78 | r.foreign_keys_tables[str(column.table.id)].add(column.foreign_key_for.table.id) 79 | 80 | r.table_bounds.append(len(schema.columns)) 81 | assert len(r.table_bounds) == len(schema.tables) + 1 82 | 83 | for i, table in enumerate(schema.tables): 84 | table_toks = tokenize_func( 85 | table.name, table.unsplit_name) 86 | r.table_names.append(table_toks) 87 | if bert: 88 | r.normalized_table_names.append(Bertokens(table_toks)) 89 | last_table = schema.tables[-1] 90 | 91 | r.foreign_keys_tables = serialization.to_dict_with_sorted_values(r.foreign_keys_tables) 92 | r.primary_keys = [ 93 | column.id 94 | for table in schema.tables 95 | for column in table.primary_keys 96 | ] if fix_issue_16_primary_keys else [ 97 | column.id 98 | for column in last_table.primary_keys 99 | for table in schema.tables 100 | ] 101 | 102 | return r 103 | 104 | 105 | class SpiderEncoderV2Preproc(abstract_preproc.AbstractPreproc): 106 | 107 | def __init__( 108 | self, 109 | save_path, 110 | min_freq=3, 111 | max_count=5000, 112 | include_table_name_in_column=True, 113 | word_emb=None, 114 | # count_tokens_in_word_emb_for_vocab=False, 115 | fix_issue_16_primary_keys=False, 116 | compute_sc_link=False, 117 | compute_cv_link=False): 118 | if word_emb is None: 119 | self.word_emb = None 120 | else: 121 | self.word_emb = word_emb 122 | 123 | self.data_dir = os.path.join(save_path, 'enc') 124 | self.include_table_name_in_column = include_table_name_in_column 125 | # self.count_tokens_in_word_emb_for_vocab = count_tokens_in_word_emb_for_vocab 126 | self.fix_issue_16_primary_keys = fix_issue_16_primary_keys 127 | self.compute_sc_link = compute_sc_link 128 | self.compute_cv_link = compute_cv_link 129 | self.texts = collections.defaultdict(list) 130 | # self.db_path = db_path 131 | 132 | # self.vocab_builder = vocab.VocabBuilder(min_freq, max_count) 133 | # self.vocab_path = os.path.join(save_path, 'enc_vocab.json') 134 | # self.vocab_word_freq_path = os.path.join(save_path, 'enc_word_freq.json') 135 | # self.vocab = None 136 | # self.counted_db_ids = set() 137 | self.preprocessed_schemas = {} 138 | 139 | def validate_item(self, item, schema, section): 140 | return True, None 141 | 142 | def add_item(self, item, schema, section, validation_info): 143 | preprocessed = self.preprocess_item(item, schema, validation_info) 144 | self.texts[section].append(preprocessed) 145 | 146 | def clear_items(self): 147 | self.texts = collections.defaultdict(list) 148 | 149 | def preprocess_item(self, item, schema, validation_info): 150 | question, question_for_copying = self._tokenize_for_copying(item['question_toks'], item['question']) 151 | preproc_schema = self._preprocess_schema(schema) 152 | if self.compute_sc_link: 153 | assert preproc_schema.column_names[0][0].startswith(" 0: 54 | for i in range(len(question) - n + 1): 55 | n_gram_list = question[i:i + n] 56 | n_gram = " ".join(n_gram_list) 57 | if len(n_gram.strip()) == 0: 58 | continue 59 | # exact match case 60 | for col_id in col_id2list: 61 | if exact_match(n_gram_list, col_id2list[col_id]): 62 | for q_id in range(i, i + n): 63 | q_col_match[f"{q_id},{col_id}"] = COL_EXACT_MATCH_FLAG 64 | for tab_id in tab_id2list: 65 | if exact_match(n_gram_list, tab_id2list[tab_id]): 66 | for q_id in range(i, i + n): 67 | q_tab_match[f"{q_id},{tab_id}"] = TAB_EXACT_MATCH_FLAG 68 | 69 | # partial match case 70 | for col_id in col_id2list: 71 | if partial_match(n_gram_list, col_id2list[col_id]): 72 | for q_id in range(i, i + n): 73 | if f"{q_id},{col_id}" not in q_col_match: 74 | q_col_match[f"{q_id},{col_id}"] = COL_PARTIAL_MATCH_FLAG 75 | for tab_id in tab_id2list: 76 | if partial_match(n_gram_list, tab_id2list[tab_id]): 77 | for q_id in range(i, i + n): 78 | if f"{q_id},{tab_id}" not in q_tab_match: 79 | q_tab_match[f"{q_id},{tab_id}"] = TAB_PARTIAL_MATCH_FLAG 80 | n -= 1 81 | return {"q_col_match": q_col_match, "q_tab_match": q_tab_match} 82 | 83 | 84 | def compute_cell_value_linking(tokens, schema): 85 | def isnumber(word): 86 | try: 87 | float(word) 88 | return True 89 | except: 90 | return False 91 | 92 | def db_word_partial_match(word, column, table, db_conn): 93 | cursor = db_conn.cursor() 94 | 95 | p_str = f"select {column} from {table} where {column} like '{word} %' or {column} like '% {word}' or " \ 96 | f"{column} like '% {word} %' or {column} like '{word}'" 97 | try: 98 | cursor.execute(p_str) 99 | p_res = cursor.fetchall() 100 | if len(p_res) == 0: 101 | return False 102 | else: 103 | return p_res 104 | except Exception as e: 105 | return False 106 | 107 | def db_word_exact_match(word, column, table, db_conn): 108 | cursor = db_conn.cursor() 109 | 110 | p_str = f"select {column} from {table} where {column} like '{word}' or {column} like ' {word}' or " \ 111 | f"{column} like '{word} ' or {column} like ' {word} '" 112 | try: 113 | cursor.execute(p_str) 114 | p_res = cursor.fetchall() 115 | if len(p_res) == 0: 116 | return False 117 | else: 118 | return p_res 119 | except Exception as e: 120 | return False 121 | 122 | num_date_match = {} 123 | cell_match = {} 124 | 125 | for col_id, column in enumerate(schema.columns): 126 | if col_id == 0: 127 | assert column.orig_name == "*" 128 | continue 129 | match_q_ids = [] 130 | for q_id, word in enumerate(tokens): 131 | if len(word.strip()) == 0: 132 | continue 133 | if word in STOPWORDS or word in PUNKS: 134 | continue 135 | 136 | num_flag = isnumber(word) 137 | if num_flag: # TODO refine the date and time match 138 | if column.type in ["number", "time"]: 139 | num_date_match[f"{q_id},{col_id}"] = column.type.upper() 140 | else: 141 | ret = db_word_partial_match(word, column.orig_name, column.table.orig_name, schema.connection) 142 | if ret: 143 | # print(word, ret) 144 | match_q_ids.append(q_id) 145 | f = 0 146 | while f < len(match_q_ids): 147 | t = f + 1 148 | while t < len(match_q_ids) and match_q_ids[t] == match_q_ids[t - 1] + 1: 149 | t += 1 150 | q_f, q_t = match_q_ids[f], match_q_ids[t - 1] + 1 151 | words = [token for token in tokens[q_f: q_t]] 152 | ret = db_word_exact_match(' '.join(words), column.orig_name, column.table.orig_name, schema.connection) 153 | if ret: 154 | for q_id in range(q_f, q_t): 155 | cell_match[f"{q_id},{col_id}"] = CELL_EXACT_MATCH_FLAG 156 | else: 157 | for q_id in range(q_f, q_t): 158 | cell_match[f"{q_id},{col_id}"] = CELL_PARTIAL_MATCH_FLAG 159 | f = t 160 | 161 | cv_link = {"num_date_match": num_date_match, "cell_match": cell_match} 162 | return cv_link 163 | 164 | 165 | def match_shift(q_col_match, q_tab_match, cell_match): 166 | 167 | q_id_to_match = collections.defaultdict(list) 168 | for match_key in q_col_match.keys(): 169 | q_id = int(match_key.split(',')[0]) 170 | c_id = int(match_key.split(',')[1]) 171 | type = q_col_match[match_key] 172 | q_id_to_match[q_id].append((type, c_id)) 173 | for match_key in q_tab_match.keys(): 174 | q_id = int(match_key.split(',')[0]) 175 | t_id = int(match_key.split(',')[1]) 176 | type = q_tab_match[match_key] 177 | q_id_to_match[q_id].append((type, t_id)) 178 | relevant_q_ids = list(q_id_to_match.keys()) 179 | 180 | priority = [] 181 | for q_id in q_id_to_match.keys(): 182 | q_id_to_match[q_id] = list(set(q_id_to_match[q_id])) 183 | priority.append((len(q_id_to_match[q_id]), q_id)) 184 | priority.sort() 185 | matches = [] 186 | new_q_col_match, new_q_tab_match = dict(), dict() 187 | for _, q_id in priority: 188 | if not list(set(matches) & set(q_id_to_match[q_id])): 189 | exact_matches = [] 190 | for match in q_id_to_match[q_id]: 191 | if match[0] in [COL_EXACT_MATCH_FLAG, TAB_EXACT_MATCH_FLAG]: 192 | exact_matches.append(match) 193 | if exact_matches: 194 | res = exact_matches 195 | else: 196 | res = q_id_to_match[q_id] 197 | matches.extend(res) 198 | else: 199 | res = list(set(matches) & set(q_id_to_match[q_id])) 200 | for match in res: 201 | type, c_t_id = match 202 | if type in [COL_PARTIAL_MATCH_FLAG, COL_EXACT_MATCH_FLAG]: 203 | new_q_col_match[f'{q_id},{c_t_id}'] = type 204 | if type in [TAB_PARTIAL_MATCH_FLAG, TAB_EXACT_MATCH_FLAG]: 205 | new_q_tab_match[f'{q_id},{c_t_id}'] = type 206 | 207 | new_cell_match = dict() 208 | for match_key in cell_match.keys(): 209 | q_id = int(match_key.split(',')[0]) 210 | if q_id in relevant_q_ids: 211 | continue 212 | # if cell_match[match_key] == CELL_EXACT_MATCH_FLAG: 213 | new_cell_match[match_key] = cell_match[match_key] 214 | 215 | return new_q_col_match, new_q_tab_match, new_cell_match -------------------------------------------------------------------------------- /src/sources/utils/post_process.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import random 5 | import re 6 | import sqlite3 7 | import threading 8 | from collections import defaultdict 9 | from itertools import product 10 | from typing import Tuple, Any, List, Set 11 | import sqlparse 12 | import tqdm 13 | 14 | 15 | # process the case of duplicated output of ChatGPT and GPT4 for SQL Representation with QA or SQLONLY Organization 16 | def process_duplication(sql): 17 | sql = sql.strip().split("/*")[0] 18 | return sql 19 | 20 | threadLock = threading.Lock() 21 | TIMEOUT = 60 22 | EXEC_TMP_DIR = os.path.join(os.path.dirname(__file__), "tmp") 23 | 24 | 25 | def permute_tuple(element: Tuple, perm: Tuple) -> Tuple: 26 | assert len(element) == len(perm) 27 | return tuple([element[i] for i in perm]) 28 | 29 | 30 | def unorder_row(row: Tuple) -> Tuple: 31 | return tuple(sorted(row, key=lambda x: str(x) + str(type(x)))) 32 | 33 | 34 | # unorder each row in the table 35 | # [result_1 and result_2 has the same bag of unordered row] 36 | # is a necessary condition of 37 | # [result_1 and result_2 are equivalent in denotation] 38 | def quick_rej(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool: 39 | s1 = [unorder_row(row) for row in result1] 40 | s2 = [unorder_row(row) for row in result2] 41 | if order_matters: 42 | return s1 == s2 43 | else: 44 | return set(s1) == set(s2) 45 | 46 | 47 | # return whether two bag of relations are equivalent 48 | def multiset_eq(l1: List, l2: List) -> bool: 49 | if len(l1) != len(l2): 50 | return False 51 | d = defaultdict(int) 52 | for e in l1: 53 | d[e] = d[e] + 1 54 | for e in l2: 55 | d[e] = d[e] - 1 56 | if d[e] < 0: 57 | return False 58 | return True 59 | 60 | 61 | def get_constraint_permutation(tab1_sets_by_columns: List[Set], result2: List[Tuple]): 62 | num_cols = len(result2[0]) 63 | perm_constraints = [{i for i in range(num_cols)} for _ in range(num_cols)] 64 | if num_cols <= 3: 65 | return product(*perm_constraints) 66 | 67 | # we sample 20 rows and constrain the space of permutations 68 | for _ in range(20): 69 | random_tab2_row = random.choice(result2) 70 | 71 | for tab1_col in range(num_cols): 72 | for tab2_col in set(perm_constraints[tab1_col]): 73 | if random_tab2_row[tab2_col] not in tab1_sets_by_columns[tab1_col]: 74 | perm_constraints[tab1_col].remove(tab2_col) 75 | return product(*perm_constraints) 76 | 77 | 78 | # check whether two denotations are correct 79 | def result_eq(result1: List[Tuple], result2: List[Tuple], order_matters: bool) -> bool: 80 | if len(result1) == 0 and len(result2) == 0: 81 | return True 82 | 83 | # if length is not the same, then they are definitely different bag of rows 84 | if len(result1) != len(result2): 85 | return False 86 | 87 | num_cols = len(result1[0]) 88 | 89 | # if the results do not have the same number of columns, they are different 90 | if len(result2[0]) != num_cols: 91 | return False 92 | 93 | # unorder each row and compare whether the denotation is the same 94 | # this can already find most pair of denotations that are different 95 | if not quick_rej(result1, result2, order_matters): 96 | return False 97 | 98 | # the rest of the problem is in fact more complicated than one might think 99 | # we want to find a permutation of column order and a permutation of row order, 100 | # s.t. result_1 is the same as result_2 101 | # we return true if we can find such column & row permutations 102 | # and false if we cannot 103 | tab1_sets_by_columns = [{row[i] for row in result1} for i in range(num_cols)] 104 | 105 | # on a high level, we enumerate all possible column permutations that might make result_1 == result_2 106 | # we decrease the size of the column permutation space by the function get_constraint_permutation 107 | # if one of the permutation make result_1, result_2 equivalent, then they are equivalent 108 | for perm in get_constraint_permutation(tab1_sets_by_columns, result2): 109 | if len(perm) != len(set(perm)): 110 | continue 111 | if num_cols == 1: 112 | result2_perm = result2 113 | else: 114 | result2_perm = [permute_tuple(element, perm) for element in result2] 115 | if order_matters: 116 | if result1 == result2_perm: 117 | return True 118 | else: 119 | # in fact the first condition must hold if the second condition holds 120 | # but the first is way more efficient implementation-wise 121 | # and we use it to quickly reject impossible candidates 122 | if set(result1) == set(result2_perm) and multiset_eq(result1, result2_perm): 123 | return True 124 | return False 125 | 126 | 127 | def replace_cur_year(query: str) -> str: 128 | return re.sub( 129 | "YEAR\s*\(\s*CURDATE\s*\(\s*\)\s*\)\s*", "2020", query, flags=re.IGNORECASE 130 | ) 131 | 132 | 133 | # get the database cursor for a sqlite database path 134 | def get_cursor_from_path(sqlite_path: str): 135 | try: 136 | if not os.path.exists(sqlite_path): 137 | print("Openning a new connection %s" % sqlite_path) 138 | connection = sqlite3.connect(sqlite_path) 139 | except Exception as e: 140 | print(sqlite_path) 141 | raise e 142 | connection.text_factory = lambda b: b.decode(errors="ignore") 143 | cursor = connection.cursor() 144 | return cursor 145 | 146 | 147 | async def exec_on_db_(sqlite_path: str, query: str) -> Tuple[str, Any]: 148 | query = replace_cur_year(query) 149 | cursor = get_cursor_from_path(sqlite_path) 150 | try: 151 | cursor.execute(query) 152 | result = cursor.fetchall() 153 | cursor.close() 154 | cursor.connection.close() 155 | return "result", result 156 | except Exception as e: 157 | cursor.close() 158 | cursor.connection.close() 159 | return "exception", e 160 | 161 | 162 | async def exec_on_db( 163 | sqlite_path: str, query: str, process_id: str = "", timeout: int = TIMEOUT 164 | ) -> Tuple[str, Any]: 165 | try: 166 | return await asyncio.wait_for(exec_on_db_(sqlite_path, query), timeout) 167 | except asyncio.TimeoutError: 168 | return ('exception', TimeoutError) 169 | except Exception as e: 170 | return ("exception", e) 171 | 172 | 173 | # postprocess the model predictions to avoid execution errors 174 | # e.g. removing spaces between ">" and "=" 175 | def postprocess(query: str) -> str: 176 | query = query.replace("> =", ">=").replace("< =", "<=").replace("! =", "!=") 177 | return query 178 | 179 | def remove_distinct(s): 180 | toks = [t.value for t in list(sqlparse.parse(s)[0].flatten())] 181 | return "".join([t for t in toks if t.lower() != "distinct"]) 182 | 183 | def get_exec_output( 184 | db: str, 185 | sql: str, 186 | plug_value: bool = False, 187 | keep_distinct: bool = False, 188 | progress_bar_for_each_datapoint: bool = False, 189 | ): 190 | # post-process the prediction. 191 | # e.g. removing spaces between ">" and "=" 192 | sql = postprocess(sql) 193 | 194 | if not keep_distinct: 195 | try: 196 | # if sqlparse can't parse p_str, we should not even try to execute it 197 | sql = remove_distinct(sql) 198 | except Exception as e: 199 | return "exception", [] 200 | 201 | db_dir = os.path.dirname(db) 202 | db_paths = [os.path.join(db_dir, basename) for basename in os.listdir(db_dir) if ".sqlite" in basename] 203 | # print(db_paths) 204 | if progress_bar_for_each_datapoint: 205 | ranger = tqdm.tqdm(db_paths) 206 | else: 207 | ranger = db_paths 208 | for db_path in ranger: 209 | flag, sql_denotation = asyncio.run(exec_on_db(db_path, sql)) 210 | # print(sql_denotation) 211 | return flag, sql_denotation 212 | 213 | 214 | def get_sqls(results, select_number, db_dir): 215 | db_ids = [] 216 | all_p_sqls = [] 217 | for item in results: 218 | p_sqls = [] 219 | db_ids.append(item['db_id']) 220 | for i, x in enumerate(item['p_sqls']): 221 | p_sqls.append(x) 222 | if i+1 == select_number: 223 | break 224 | all_p_sqls.append(p_sqls) 225 | chosen_p_sqls = [] 226 | for i, db_id in enumerate(tqdm.tqdm(db_ids)): 227 | p_sqls = all_p_sqls[i] 228 | db_path = f"{db_dir}/{db_id}/{db_id}" 229 | cluster_sql_list = [] 230 | map_sql2denotation = {} 231 | for sql in p_sqls: 232 | flag, denotation = get_exec_output( 233 | db_path, 234 | sql, 235 | ) 236 | if flag == "exception": 237 | continue 238 | map_sql2denotation[sql] = denotation 239 | denotation_match = False 240 | 241 | for id, cluster in enumerate(cluster_sql_list): 242 | center_sql = cluster[0] 243 | if result_eq(map_sql2denotation[center_sql], denotation, False): 244 | cluster_sql_list[id].append(sql) 245 | denotation_match = True 246 | break 247 | if not denotation_match: 248 | cluster_sql_list.append([sql]) 249 | cluster_sql_list.sort(key=lambda x: len(x), reverse=True) 250 | if not cluster_sql_list: 251 | chosen_p_sqls.append(p_sqls[0]) 252 | else: 253 | chosen_p_sqls.append(cluster_sql_list[0][0]) 254 | 255 | print("save chosen sqls and results...") 256 | 257 | return chosen_p_sqls 258 | -------------------------------------------------------------------------------- /src/sources/utils/pretrained_embeddings.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import functools 3 | import os 4 | import time 5 | 6 | # import bpemb 7 | import corenlp 8 | # import torch 9 | import torchtext 10 | 11 | from utils.linking_utils import corenlp 12 | 13 | 14 | class Embedder(metaclass=abc.ABCMeta): 15 | 16 | @abc.abstractmethod 17 | def tokenize(self, sentence): 18 | '''Given a string, return a list of tokens suitable for lookup.''' 19 | pass 20 | 21 | @abc.abstractmethod 22 | def untokenize(self, tokens): 23 | '''Undo tokenize.''' 24 | pass 25 | 26 | @abc.abstractmethod 27 | def lookup(self, token): 28 | '''Given a token, return a vector embedding if token is in vocabulary. 29 | If token is not in the vocabulary, then return None.''' 30 | pass 31 | 32 | @abc.abstractmethod 33 | def contains(self, token): 34 | pass 35 | 36 | @abc.abstractmethod 37 | def to(self, device): 38 | '''Transfer the pretrained embeddings to the given device.''' 39 | pass 40 | 41 | 42 | class GloVe(Embedder): 43 | 44 | def __init__(self, kind, lemmatize=False): 45 | cache = os.path.join(os.environ.get('CACHE_DIR', "/root"), 'vector_cache') 46 | self.glove = torchtext.vocab.GloVe(name=kind, cache=cache) 47 | self.dim = self.glove.dim 48 | self.vectors = self.glove.vectors 49 | self.lemmatize = lemmatize 50 | self.corenlp_annotators = ['tokenize', 'ssplit'] 51 | if lemmatize: 52 | self.corenlp_annotators.append('lemma') 53 | 54 | @functools.lru_cache(maxsize=1024) 55 | def tokenize(self, text): 56 | ann = corenlp.annotate(text, self.corenlp_annotators) 57 | if self.lemmatize: 58 | return [tok.lemma.lower() for sent in ann.sentence for tok in sent.token] 59 | else: 60 | return [tok.word.lower() for sent in ann.sentence for tok in sent.token] 61 | 62 | @functools.lru_cache(maxsize=1024) 63 | def tokenize_for_copying(self, text): 64 | ann = corenlp.annotate(text, self.corenlp_annotators) 65 | text_for_copying = [tok.originalText.lower() for sent in ann.sentence for tok in sent.token] 66 | if self.lemmatize: 67 | text = [tok.lemma.lower() for sent in ann.sentence for tok in sent.token] 68 | else: 69 | text = [tok.word.lower() for sent in ann.sentence for tok in sent.token] 70 | return text, text_for_copying 71 | 72 | def untokenize(self, tokens): 73 | return ' '.join(tokens) 74 | 75 | def lookup(self, token): 76 | i = self.glove.stoi.get(token) 77 | if i is None: 78 | return None 79 | return self.vectors[i] 80 | 81 | def contains(self, token): 82 | return token in self.glove.stoi 83 | 84 | def to(self, device): 85 | self.vectors = self.vectors.to(device) --------------------------------------------------------------------------------