├── .gitignore ├── README.md ├── hugging_corenlp.py └── hugging_stanza.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore emacs files 2 | *~ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # huggingface-models 2 | Scripts for pushing models to huggingface repos 3 | -------------------------------------------------------------------------------- /hugging_corenlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script allows for pushing of the corenlp models to N different huggingface repos. 3 | 4 | Generously provided by Omar Sanseviero 5 | 6 | huggingface-cli login 7 | python3 hugging_corenlp.py --input_dir --branch 8 | """ 9 | 10 | import argparse 11 | import datetime 12 | import os 13 | import shutil 14 | 15 | from collections import namedtuple 16 | 17 | from huggingface_hub import HfApi, HfFolder, hf_hub_download 18 | 19 | def get_model_card(lang, model): 20 | now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 21 | model_card = """--- 22 | tags: 23 | - corenlp 24 | library_tag: corenlp 25 | language: {lang} 26 | license: gpl-2.0 27 | --- 28 | # Core NLP model for {model} 29 | CoreNLP is your one stop shop for natural language processing in Java! CoreNLP enables users to derive linguistic annotations for text, including token and sentence boundaries, parts of speech, named entities, numeric and time values, dependency and constituency parses, coreference, sentiment, quote attributions, and relations. 30 | Find more about it in [our website](https://stanfordnlp.github.io/CoreNLP) and our [GitHub repository](https://github.com/stanfordnlp/CoreNLP). 31 | 32 | This card and repo were automatically prepared with `hugging_corenlp.py` in the `stanfordnlp/huggingface-models` repo 33 | 34 | Last updated {now} 35 | """.format(lang=lang, model=model, now=now) 36 | return model_card 37 | 38 | # lang is an abbrev to use in the model card 39 | # local_name is a potential alternate name for the file 40 | # remote_name is the name to use when pushing remotely 41 | # repo_name is the repo name if corenlp-model is not suitable for some reason 42 | Model = namedtuple("Model", 'model_name, lang, local_name, remote_name, repo_name') 43 | 44 | MODELS = [ 45 | Model("CoreNLP", "en", "stanford-corenlp-latest.zip", "stanford-corenlp-latest.zip", "CoreNLP"), 46 | Model("arabic", "ar", "stanford-arabic-corenlp-models-current.jar", None, None), 47 | Model("chinese", "zh", "stanford-chinese-corenlp-models-current.jar", None, None), 48 | Model("english-default", "en", "stanford-corenlp-models-current.jar", None, None), 49 | Model("english-extra", "en", "stanford-english-corenlp-models-current.jar", None, None), 50 | Model("english-kbp", "en", "stanford-english-kbp-corenlp-models-current.jar", None, None), 51 | Model("french", "fr", "stanford-french-corenlp-models-current.jar", None, None), 52 | Model("german", "de", "stanford-german-corenlp-models-current.jar", None, None), 53 | Model("hungarian", "hu", "stanford-hungarian-corenlp-models-current.jar", None, None), 54 | Model("italian", "it", "stanford-italian-corenlp-models-current.jar", None, None), 55 | Model("spanish", "es", "stanford-spanish-corenlp-models-current.jar", None, None), 56 | ] 57 | 58 | def write_model_card(repo_local_path, lang, model): 59 | """ 60 | Write a README for the current model to the given path 61 | """ 62 | readme_path = os.path.join(repo_local_path, "README.md") 63 | with open(readme_path, "w") as f: 64 | f.write(get_model_card(lang, model)) 65 | 66 | def parse_args(): 67 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 68 | # "/home/john/extern_data/corenlp/" 69 | parser.add_argument('--input_dir', type=str, default="/u/nlp/data/StanfordCoreNLPModels", help='Directory for loading the CoreNLP models') 70 | # "/home/john/huggingface/hub" 71 | parser.add_argument('--output_dir', type=str, default="/u/nlp/software/hub", help='Directory with the repos') 72 | parser.add_argument('--version', type=str, default="4.5.8", help='Version of corenlp models to upload') 73 | parser.add_argument('--no_models', dest="models", action='store_false', default=True, help="Only push the package without updating the models. Useful for when a new version is released, with only code changes, and the 'latest' symlink wasn't properly updated") 74 | args = parser.parse_args() 75 | return args 76 | 77 | 78 | def maybe_add_lfs(api, repo_id, repo_local_path, extension): 79 | # read the existing .gitattributes file 80 | git_filename = os.path.join(repo_local_path, ".gitattributes") 81 | with open(git_filename) as fin: 82 | lines = fin.readlines() 83 | 84 | # if the extension isn't already there, add it and push the new version 85 | if not any(line.startswith(extension + " ") for line in lines): 86 | lines.append("%s filter=lfs diff=lfs merge=lfs -text\n" % extension) 87 | blob = "".join(lines).encode() 88 | api.upload_file(repo_id=repo_id, path_in_repo=".gitattributes", path_or_fileobj=blob) 89 | 90 | def push_to_hub(): 91 | args = parse_args() 92 | api = HfApi() 93 | 94 | input_dir = args.input_dir 95 | if args.models: 96 | stuff_to_push = MODELS 97 | else: 98 | stuff_to_push = [x for x in MODELS if x.model_name == 'CoreNLP'] 99 | 100 | for model in stuff_to_push: 101 | # Create the repository 102 | lang = model.lang 103 | model_name = model.model_name 104 | repo_name = model.repo_name if model.repo_name else "corenlp-%s" % model_name 105 | repo_id = "stanfordnlp/" + repo_name 106 | repo_url = api.create_repo( 107 | repo_id=repo_id, 108 | exist_ok=True, 109 | ) 110 | 111 | # check the lfs status of .zip and .jar 112 | # TODO: we can probably get rid of repo_local_path 113 | # - use a temporary file for .gitattributes 114 | # - use a bytes blob for the README 115 | # - use the jar / zip file for CoreNLP directly, wherever it is 116 | repo_local_path = os.path.join(args.output_dir, repo_name) 117 | hf_hub_download(repo_id, ".gitattributes", local_dir=repo_local_path, local_dir_use_symlinks=False) 118 | maybe_add_lfs(api, repo_id, repo_local_path, '*.jar') 119 | maybe_add_lfs(api, repo_id, repo_local_path, '*.zip') 120 | 121 | # Create a copy of the jar file in the repository 122 | dst = os.path.join(repo_local_path, model.remote_name) if model.remote_name else os.path.join(repo_local_path, f"stanford-corenlp-models-{model_name}.jar") 123 | src_candidates = [f"stanford-corenlp-models-{model_name}.jar", 124 | model.local_name, 125 | # stanford-corenlp-4.4.0-models-arabic.jar 126 | f"stanford-corenlp-{args.version}-models-{model_name}.jar"] 127 | for src in src_candidates: 128 | if input_dir: 129 | src = os.path.join(input_dir, src) 130 | if os.path.exists(src): 131 | break 132 | else: 133 | if input_dir: 134 | locations_searched = ", ".join(os.path.join(input_dir, src) for src in src_candidates) 135 | else: 136 | locations_searched = ", ".join(src_candidates) 137 | raise FileNotFoundError(f"Cannot find {model_name} model. Looked in {locations_searched}") 138 | print(f"Copying model from {src} to {dst}") 139 | shutil.copy(src, dst) 140 | 141 | # Create the model card 142 | write_model_card(repo_local_path, lang, model_name) 143 | 144 | # Upload model + model card 145 | # setting delete_patterns will clean up old model files as we go 146 | # note: the error of not having anything to push will hopefully 147 | # never happen since the README is updated to the millisecond 148 | print("Pushing files to the Hub from %s to %s" % (repo_local_path, repo_id)) 149 | api.upload_folder(repo_id=repo_id, folder_path=repo_local_path, commit_message=f"Add model {args.version}") 150 | 151 | # Check and delete tag if already exist 152 | new_tag_name = "v" + args.version 153 | refs = api.list_repo_refs(repo_id=repo_id) 154 | for tag in refs.tags: 155 | if tag.name == new_tag_name: 156 | api.delete_tag(repo_id=repo_id, tag=new_tag_name) 157 | break 158 | 159 | # Tag model version 160 | api.create_tag(repo_id=repo_id, tag=new_tag_name, tag_message=f"Adding new version of models {new_tag_name}") 161 | print(f"Added a tag for the new models: {new_tag_name}") 162 | 163 | print(f"View your model in {repo_url}") 164 | 165 | 166 | if __name__ == '__main__': 167 | push_to_hub() 168 | -------------------------------------------------------------------------------- /hugging_stanza.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script allows for pushing of the corenlp models to N different huggingface repos. 3 | 4 | Generously provided by Omar Sanseviero 5 | 6 | huggingface-cli login 7 | python hugging_stanza.py --input_dir --version 8 | """ 9 | 10 | import argparse 11 | import datetime 12 | import os 13 | import shutil 14 | from pathlib import Path 15 | 16 | from stanza.resources.common import list_available_languages 17 | from stanza.models.common.constant import lcode2lang, lang2lcode 18 | 19 | from huggingface_hub import HfApi 20 | 21 | def get_model_card(lang): 22 | now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] 23 | full_lang = lcode2lang.get(lang, None) 24 | short_lang = lang2lcode.get(lang, lang) 25 | short_lang = short_lang.split("-")[0] 26 | lang_text = f"{full_lang} ({lang})" if full_lang else lang 27 | model_card = """--- 28 | tags: 29 | - stanza 30 | - token-classification 31 | library_name: stanza 32 | language: {short_lang} 33 | license: apache-2.0 34 | --- 35 | # Stanza model for {lang_text} 36 | Stanza is a collection of accurate and efficient tools for the linguistic analysis of many human languages. Starting from raw text to syntactic analysis and entity recognition, Stanza brings state-of-the-art NLP models to languages of your choosing. 37 | Find more about it in [our website](https://stanfordnlp.github.io/stanza) and our [GitHub repository](https://github.com/stanfordnlp/stanza). 38 | 39 | This card and repo were automatically prepared with `hugging_stanza.py` in the `stanfordnlp/huggingface-models` repo 40 | 41 | Last updated {now} 42 | """.format(short_lang=short_lang, lang_text=lang_text, now=now) 43 | return model_card 44 | 45 | def parse_args(): 46 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 47 | parser.add_argument('--input_dir', type=str, default="/u/nlp/software/stanza/models/", help='Directory for loading the stanza models. Will first try input_dir + version, if that exists') 48 | parser.add_argument('--version', type=str, default="1.10.0", help='Version of stanza models to upload') 49 | parser.add_argument('lang', nargs='*', help='List of languages. Will default to all languages') 50 | args = parser.parse_args() 51 | if len(args.lang) == 0: 52 | # TODO: use version to get the available languages 53 | # TODO: skip languages where the version and the data didn't change 54 | args.lang = list_available_languages() 55 | return args 56 | 57 | def push_to_hub(): 58 | args = parse_args() 59 | input_dir = args.input_dir 60 | if os.path.exists(input_dir + args.version): 61 | input_dir = input_dir + args.version 62 | print("Found directory in %s - using that instead of %s" % (input_dir, args.input_dir)) 63 | 64 | new_tag_name = "v" + args.version 65 | 66 | api = HfApi() 67 | 68 | print("Processing languages: {}".format(args.lang)) 69 | for model in args.lang: 70 | print(f"Processing {model}") 71 | # Create the repository 72 | repo_name = "stanza-" + model 73 | repo_id = "stanfordnlp/" + repo_name 74 | repo_url = api.create_repo( 75 | repo_id=repo_id, 76 | exist_ok=True 77 | ) 78 | 79 | # Find src folder 80 | src = Path(input_dir) / model 81 | if not src.exists(): 82 | if not input_dir: 83 | raise FileNotFoundError(f"Could not find models under {src}. Perhaps you forgot to set --input_dir?") 84 | else: 85 | raise FileNotFoundError(f"Could not find models under {src}") 86 | 87 | # Update model card in it 88 | (src / "README.md").write_text(get_model_card(model)) 89 | 90 | # Upload model + model card 91 | # setting delete_patterns will clean up old model files as we go 92 | api.upload_folder(repo_id=repo_id, folder_path=src, commit_message=f"Add model {args.version}", delete_patterns="*.pt") 93 | 94 | # Check and delete tag if already exist 95 | refs = api.list_repo_refs(repo_id=repo_id) 96 | for tag in refs.tags: 97 | if tag.name == new_tag_name: 98 | api.delete_tag(repo_id=repo_id, tag=new_tag_name) 99 | break 100 | 101 | # Tag model version 102 | api.create_tag(repo_id=repo_id, tag=new_tag_name, tag_message=f"Adding new version of models {new_tag_name}") 103 | print(f"Added a tag for the new models: {new_tag_name}") 104 | print(f"View your model in:\n {repo_url}\n\n") 105 | 106 | if __name__ == '__main__': 107 | push_to_hub() 108 | --------------------------------------------------------------------------------