├── .gitignore
├── README.md
├── hugging_corenlp.py
└── hugging_stanza.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore emacs files
2 | *~
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # huggingface-models
2 | Scripts for pushing Stanford NLP models (CoreNLP and Stanza) to Hugging Face Hub repos
3 | 


--------------------------------------------------------------------------------
/hugging_corenlp.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script allows for pushing of the corenlp models to N different huggingface repos.
  3 | 
  4 | Generously provided by Omar Sanseviero
  5 | 
  6 | huggingface-cli login
  7 | python3 hugging_corenlp.py --input_dir <models_path>  --version <version>
  8 | """
  9 | 
 10 | import argparse
 11 | import datetime
 12 | import os
 13 | import shutil
 14 | 
 15 | from collections import namedtuple
 16 | 
 17 | from huggingface_hub import  HfApi, HfFolder, hf_hub_download
 18 | 
 19 | def get_model_card(lang, model):
 20 |     now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
 21 |     model_card = """---
 22 | tags:
 23 | - corenlp
 24 | library_tag: corenlp
 25 | language: {lang}
 26 | license: gpl-2.0
 27 | ---
 28 | # Core NLP model for {model}
 29 | CoreNLP is your one stop shop for natural language processing in Java! CoreNLP enables users to derive linguistic annotations for text, including token and sentence boundaries, parts of speech, named entities, numeric and time values, dependency and constituency parses, coreference, sentiment, quote attributions, and relations.
 30 | Find more about it in [our website](https://stanfordnlp.github.io/CoreNLP) and our [GitHub repository](https://github.com/stanfordnlp/CoreNLP).
 31 | 
 32 | This card and repo were automatically prepared with `hugging_corenlp.py` in the `stanfordnlp/huggingface-models` repo
 33 | 
 34 | Last updated {now}
 35 | """.format(lang=lang, model=model, now=now)
 36 |     return model_card
 37 | 
 38 | # lang is an abbrev to use in the model card
 39 | # local_name is a potential alternate name for the file
 40 | # remote_name is the name to use when pushing remotely
 41 | # repo_name is the repo name if corenlp-model is not suitable for some reason
 42 | Model = namedtuple("Model", 'model_name, lang, local_name, remote_name, repo_name')
 43 | 
 44 | MODELS = [
 45 |     Model("CoreNLP",          "en",   "stanford-corenlp-latest.zip",                     "stanford-corenlp-latest.zip", "CoreNLP"),
 46 |     Model("arabic",           "ar",   "stanford-arabic-corenlp-models-current.jar",      None,                          None),
 47 |     Model("chinese",          "zh",   "stanford-chinese-corenlp-models-current.jar",     None,                          None),
 48 |     Model("english-default",  "en",   "stanford-corenlp-models-current.jar",             None,                          None),
 49 |     Model("english-extra",    "en",   "stanford-english-corenlp-models-current.jar",     None,                          None),
 50 |     Model("english-kbp",      "en",   "stanford-english-kbp-corenlp-models-current.jar", None,                          None),
 51 |     Model("french",           "fr",   "stanford-french-corenlp-models-current.jar",      None,                          None),
 52 |     Model("german",           "de",   "stanford-german-corenlp-models-current.jar",      None,                          None),
 53 |     Model("hungarian",        "hu",   "stanford-hungarian-corenlp-models-current.jar",   None,                          None),
 54 |     Model("italian",          "it",   "stanford-italian-corenlp-models-current.jar",     None,                          None),
 55 |     Model("spanish",          "es",   "stanford-spanish-corenlp-models-current.jar",     None,                          None),
 56 | ]
 57 | 
 58 | def write_model_card(repo_local_path, lang, model):
 59 |     """
 60 |     Write a README for the current model to the given path
 61 |     """
 62 |     readme_path = os.path.join(repo_local_path, "README.md")
 63 |     with open(readme_path, "w") as f:
 64 |         f.write(get_model_card(lang, model))
 65 | 
 66 | def parse_args():
 67 |     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 68 |     # "/home/john/extern_data/corenlp/"
 69 |     parser.add_argument('--input_dir', type=str, default="/u/nlp/data/StanfordCoreNLPModels", help='Directory for loading the CoreNLP models')
 70 |     # "/home/john/huggingface/hub"
 71 |     parser.add_argument('--output_dir', type=str, default="/u/nlp/software/hub", help='Directory with the repos')
 72 |     parser.add_argument('--version', type=str, default="4.5.8", help='Version of corenlp models to upload')
 73 |     parser.add_argument('--no_models', dest="models", action='store_false', default=True, help="Only push the package without updating the models.  Useful for when a new version is released, with only code changes, and the 'latest' symlink wasn't properly updated")
 74 |     args = parser.parse_args()
 75 |     return args
 76 | 
 77 | 
 78 | def maybe_add_lfs(api, repo_id, repo_local_path, extension):
 79 |     # read the existing .gitattributes file
 80 |     git_filename = os.path.join(repo_local_path, ".gitattributes")
 81 |     with open(git_filename) as fin:
 82 |         lines = fin.readlines()
 83 | 
 84 |     # if the extension isn't already there, add it and push the new version
 85 |     if not any(line.startswith(extension + " ") for line in lines):
 86 |         lines.append("%s filter=lfs diff=lfs merge=lfs -text\n" % extension)
 87 |         blob = "".join(lines).encode()
 88 |         api.upload_file(repo_id=repo_id, path_in_repo=".gitattributes", path_or_fileobj=blob)
 89 | 
 90 | def push_to_hub():
 91 |     args = parse_args()
 92 |     api = HfApi()
 93 | 
 94 |     input_dir = args.input_dir
 95 |     if args.models:
 96 |         stuff_to_push = MODELS
 97 |     else:
 98 |         stuff_to_push = [x for x in MODELS if x.model_name == 'CoreNLP']
 99 | 
100 |     for model in stuff_to_push:
101 |         # Create the repository
102 |         lang = model.lang
103 |         model_name = model.model_name
104 |         repo_name = model.repo_name if model.repo_name else "corenlp-%s" % model_name
105 |         repo_id = "stanfordnlp/" + repo_name
106 |         repo_url = api.create_repo(
107 |             repo_id=repo_id,
108 |             exist_ok=True,
109 |         )
110 | 
111 |         # check the lfs status of .zip and .jar
112 |         # TODO: we can probably get rid of repo_local_path
113 |         # - use a temporary file for .gitattributes
114 |         # - use a bytes blob for the README
115 |         # - use the jar / zip file for CoreNLP directly, wherever it is
116 |         repo_local_path = os.path.join(args.output_dir, repo_name)
117 |         hf_hub_download(repo_id, ".gitattributes", local_dir=repo_local_path, local_dir_use_symlinks=False)
118 |         maybe_add_lfs(api, repo_id, repo_local_path, '*.jar')
119 |         maybe_add_lfs(api, repo_id, repo_local_path, '*.zip')
120 | 
121 |         # Create a copy of the jar file in the repository
122 |         dst = os.path.join(repo_local_path, model.remote_name) if model.remote_name else os.path.join(repo_local_path, f"stanford-corenlp-models-{model_name}.jar")
123 |         src_candidates = [f"stanford-corenlp-models-{model_name}.jar",
124 |                           model.local_name,
125 |                           # stanford-corenlp-4.4.0-models-arabic.jar
126 |                           f"stanford-corenlp-{args.version}-models-{model_name}.jar"]
127 |         for src in src_candidates:
128 |             if input_dir:
129 |                 src = os.path.join(input_dir, src)
130 |             if os.path.exists(src):
131 |                 break
132 |         else:
133 |             if input_dir:
134 |                 locations_searched = ", ".join(os.path.join(input_dir, src) for src in src_candidates)
135 |             else:
136 |                 locations_searched = ", ".join(src_candidates)
137 |             raise FileNotFoundError(f"Cannot find {model_name} model.  Looked in {locations_searched}")
138 |         print(f"Copying model from {src} to {dst}")
139 |         shutil.copy(src, dst)
140 | 
141 |         # Create the model card
142 |         write_model_card(repo_local_path, lang, model_name)
143 | 
144 |         # Upload model + model card
145 |         # setting delete_patterns will clean up old model files as we go
146 |         # note: the error of not having anything to push will hopefully
147 |         # never happen since the README is updated to the millisecond
148 |         print("Pushing files to the Hub from %s to %s" % (repo_local_path, repo_id))
149 |         api.upload_folder(repo_id=repo_id, folder_path=repo_local_path, commit_message=f"Add model {args.version}")
150 | 
151 |         # Check and delete tag if already exist
152 |         new_tag_name = "v" + args.version
153 |         refs = api.list_repo_refs(repo_id=repo_id)
154 |         for tag in refs.tags:
155 |             if tag.name == new_tag_name:
156 |                 api.delete_tag(repo_id=repo_id, tag=new_tag_name)
157 |                 break
158 | 
159 |         # Tag model version
160 |         api.create_tag(repo_id=repo_id, tag=new_tag_name, tag_message=f"Adding new version of models {new_tag_name}")
161 |         print(f"Added a tag for the new models: {new_tag_name}")
162 | 
163 |         print(f"View your model in {repo_url}")
164 | 
165 | 
# only push when run as a script, not when imported
if __name__ == "__main__":
    push_to_hub()
168 | 


--------------------------------------------------------------------------------
/hugging_stanza.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script allows for pushing of the stanza models to N different huggingface repos.
  3 | 
  4 | Generously provided by Omar Sanseviero
  5 | 
  6 | huggingface-cli login
  7 | python hugging_stanza.py --input_dir <models_path>  --version <version>
  8 | """
  9 | 
 10 | import argparse
 11 | import datetime
 12 | import os
 13 | import shutil
 14 | from pathlib import Path
 15 | 
 16 | from stanza.resources.common import list_available_languages
 17 | from stanza.models.common.constant import lcode2lang, lang2lcode
 18 | 
 19 | from huggingface_hub import HfApi
 20 | 
 21 | def get_model_card(lang):
 22 |     now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
 23 |     full_lang = lcode2lang.get(lang, None)
 24 |     short_lang = lang2lcode.get(lang, lang)
 25 |     short_lang = short_lang.split("-")[0]
 26 |     lang_text = f"{full_lang} ({lang})" if full_lang else lang
 27 |     model_card = """---
 28 | tags:
 29 | - stanza
 30 | - token-classification
 31 | library_name: stanza
 32 | language: {short_lang}
 33 | license: apache-2.0
 34 | ---
 35 | # Stanza model for {lang_text}
 36 | Stanza is a collection of accurate and efficient tools for the linguistic analysis of many human languages. Starting from raw text to syntactic analysis and entity recognition, Stanza brings state-of-the-art NLP models to languages of your choosing.
 37 | Find more about it in [our website](https://stanfordnlp.github.io/stanza) and our [GitHub repository](https://github.com/stanfordnlp/stanza).
 38 | 
 39 | This card and repo were automatically prepared with `hugging_stanza.py` in the `stanfordnlp/huggingface-models` repo
 40 | 
 41 | Last updated {now}
 42 | """.format(short_lang=short_lang, lang_text=lang_text, now=now)
 43 |     return model_card
 44 | 
 45 | def parse_args():
 46 |     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 47 |     parser.add_argument('--input_dir', type=str, default="/u/nlp/software/stanza/models/", help='Directory for loading the stanza models.  Will first try input_dir + version, if that exists')
 48 |     parser.add_argument('--version', type=str, default="1.10.0", help='Version of stanza models to upload')
 49 |     parser.add_argument('lang', nargs='*', help='List of languages.  Will default to all languages')
 50 |     args = parser.parse_args()
 51 |     if len(args.lang) == 0:
 52 |         # TODO: use version to get the available languages
 53 |         # TODO: skip languages where the version and the data didn't change
 54 |         args.lang = list_available_languages()
 55 |     return args
 56 | 
 57 | def push_to_hub():
 58 |     args = parse_args()
 59 |     input_dir = args.input_dir
 60 |     if os.path.exists(input_dir + args.version):
 61 |         input_dir = input_dir + args.version
 62 |         print("Found directory in %s - using that instead of %s" % (input_dir, args.input_dir))
 63 | 
 64 |     new_tag_name = "v" + args.version
 65 | 
 66 |     api = HfApi()
 67 | 
 68 |     print("Processing languages: {}".format(args.lang))
 69 |     for model in args.lang:
 70 |         print(f"Processing {model}")
 71 |         # Create the repository
 72 |         repo_name = "stanza-" + model
 73 |         repo_id = "stanfordnlp/" + repo_name
 74 |         repo_url = api.create_repo(
 75 |             repo_id=repo_id,
 76 |             exist_ok=True
 77 |         )
 78 | 
 79 |         # Find src folder
 80 |         src = Path(input_dir) / model
 81 |         if not src.exists():
 82 |             if not input_dir:
 83 |                 raise FileNotFoundError(f"Could not find models under {src}.  Perhaps you forgot to set --input_dir?")
 84 |             else:
 85 |                 raise FileNotFoundError(f"Could not find models under {src}")
 86 | 
 87 |         # Update model card in it
 88 |         (src / "README.md").write_text(get_model_card(model))
 89 | 
 90 |         # Upload model + model card
 91 |         # setting delete_patterns will clean up old model files as we go
 92 |         api.upload_folder(repo_id=repo_id, folder_path=src, commit_message=f"Add model {args.version}", delete_patterns="*.pt")
 93 | 
 94 |         # Check and delete tag if already exist
 95 |         refs = api.list_repo_refs(repo_id=repo_id)
 96 |         for tag in refs.tags:
 97 |             if tag.name == new_tag_name:
 98 |                 api.delete_tag(repo_id=repo_id, tag=new_tag_name)
 99 |                 break
100 | 
101 |         # Tag model version
102 |         api.create_tag(repo_id=repo_id, tag=new_tag_name, tag_message=f"Adding new version of models {new_tag_name}")
103 |         print(f"Added a tag for the new models: {new_tag_name}")
104 |         print(f"View your model in:\n  {repo_url}\n\n")
105 | 
# only push when run as a script, not when imported
if __name__ == "__main__":
    push_to_hub()
108 | 


--------------------------------------------------------------------------------