├── .gitignore ├── README.md ├── datasets_logo_name.jpg ├── datasets_logo_name.png ├── requirements.txt ├── run.py └── viewer.png /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .venv 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Datasets Viewer 2 | 3 | Viewer for the Hugging Face datasets library. 4 | 5 | 6 | 7 | 8 | ``` 9 | streamlit run run.py 10 | ``` 11 | 12 | or if you want to view local files 13 | 14 | ``` 15 | streamlit run run.py 16 | ``` 17 | -------------------------------------------------------------------------------- /datasets_logo_name.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/datasets_logo_name.jpg -------------------------------------------------------------------------------- /datasets_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/datasets_logo_name.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | streamlit 3 | pandas>=1.2.4,<1.3 4 | pyyaml 5 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | import datasets 5 | from dataclasses import asdict 6 | import yaml 7 | import textwrap 8 | import tornado 9 | import json 10 | import time 11 | import sys 12 | 13 | 14 | MAX_SIZE = 40000000000 15 | if len(sys.argv) > 1: 16 | path_to_datasets = sys.argv[1] 17 | else: 18 | path_to_datasets = None 19 | 20 | ## Hack to extend the width of the main pane. 21 | def _max_width_(): 22 | max_width_str = f"max-width: 1000px;" 23 | st.markdown( 24 | f""" 25 | 41 | """, 42 | unsafe_allow_html=True, 43 | ) 44 | 45 | 46 | _max_width_() 47 | 48 | 49 | def render_features(features): 50 | if isinstance(features, dict): 51 | return {k: render_features(v) for k, v in features.items()} 52 | if isinstance(features, datasets.features.ClassLabel): 53 | return features.names 54 | 55 | if isinstance(features, datasets.features.Value): 56 | return features.dtype 57 | 58 | if isinstance(features, datasets.features.Sequence): 59 | return {"[]": render_features(features.feature)} 60 | return features 61 | 62 | 63 | app_state = st.experimental_get_query_params() 64 | # print(app_state) 65 | start = True 66 | loaded = True 67 | INITIAL_SELECTION = "" 68 | # if app_state == "NOT_INITIALIZED": 69 | # latest_iteration = st.empty() 70 | # bar = st.progress(0) 71 | # start = False 72 | # for i in range(0, 101, 10): 73 | # # Update the progress bar with each iteration. 74 | # # latest_iteration.text(f'Iteration {i+1}') 75 | # bar.progress(i) 76 | # time.sleep(0.1) 77 | # if i == 100: 78 | # start = True 79 | # bar.empty() 80 | # loaded = True 81 | 82 | # app_state = st.experimental_get_query_params() 83 | # print("appstate is", app_state) 84 | app_state.setdefault("dataset", "glue") 85 | if len(app_state.get("dataset", [])) == 1: 86 | app_state["dataset"] = app_state["dataset"][0] 87 | INITIAL_SELECTION = app_state["dataset"] 88 | if len(app_state.get("config", [])) == 1: 89 | app_state["config"] = app_state["config"][0] 90 | print(INITIAL_SELECTION) 91 | 92 | if start: 93 | ## Logo and sidebar decoration. 94 | st.sidebar.markdown( 95 | """
96 | 97 | 98 |
""", 99 | unsafe_allow_html=True, 100 | ) 101 | st.sidebar.image("datasets_logo_name.png", width=300) 102 | st.sidebar.markdown( 103 | "

github/huggingface/datasets

", 104 | unsafe_allow_html=True, 105 | ) 106 | st.sidebar.markdown( 107 | """ 108 |
109 | Docs | 110 | Browse 111 | | Add Dataset 112 |
""", 113 | unsafe_allow_html=True, 114 | ) 115 | st.sidebar.subheader("") 116 | 117 | ## Interaction with the datasets libary. 118 | # @st.cache 119 | def get_confs(opt): 120 | "Get the list of confs for a dataset." 121 | if path_to_datasets is not None and opt is not None: 122 | path = path_to_datasets + opt 123 | else: 124 | path = opt 125 | 126 | module_path = datasets.load.prepare_module(path, dataset=True 127 | ) 128 | # Get dataset builder class from the processing script 129 | builder_cls = datasets.load.import_main_class(module_path[0], dataset=True) 130 | # Instantiate the dataset builder 131 | confs = builder_cls.BUILDER_CONFIGS 132 | if confs and len(confs) > 1: 133 | return confs 134 | else: 135 | return [] 136 | 137 | # @st.cache(allow_output_mutation=True) 138 | def get(opt, conf=None): 139 | "Get a dataset from name and conf" 140 | if path_to_datasets is not None: 141 | path = path_to_datasets + opt 142 | else: 143 | path = opt 144 | 145 | module_path = datasets.load.prepare_module(path, dataset=True) 146 | builder_cls = datasets.load.import_main_class(module_path[0], dataset=True) 147 | if conf: 148 | builder_instance = builder_cls(name=conf, cache_dir=path if path_to_datasets is not None else None) 149 | else: 150 | builder_instance = builder_cls(cache_dir=path if path_to_datasets is not None else None) 151 | fail = False 152 | if path_to_datasets is not None: 153 | dts = datasets.load_dataset(path, 154 | name=builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None, 155 | ) 156 | dataset = dts 157 | 158 | elif ( 159 | builder_instance.manual_download_instructions is None 160 | and builder_instance.info.size_in_bytes is not None 161 | and builder_instance.info.size_in_bytes < MAX_SIZE): 162 | builder_instance.download_and_prepare() 163 | dts = builder_instance.as_dataset() 164 | dataset = dts 165 | else: 166 | dataset = builder_instance 167 | fail = True 168 | return dataset, fail 169 | 170 | # Dataset select box. 171 | dataset_names = [] 172 | selection = None 173 | 174 | import glob 175 | if path_to_datasets is None: 176 | list_of_datasets = datasets.list_datasets(with_community_datasets=False) 177 | else: 178 | list_of_datasets = sorted(glob.glob(path_to_datasets + "*")) 179 | print(list_of_datasets) 180 | for i, dataset in enumerate(list_of_datasets): 181 | dataset = dataset.split("/")[-1] 182 | if INITIAL_SELECTION and dataset == INITIAL_SELECTION: 183 | selection = i 184 | dataset_names.append(dataset ) 185 | 186 | if selection is not None: 187 | option = st.sidebar.selectbox( 188 | "Dataset", dataset_names, index=selection, format_func=lambda a: a 189 | ) 190 | else: 191 | option = st.sidebar.selectbox("Dataset", dataset_names, format_func=lambda a: a) 192 | print(option) 193 | app_state["dataset"] = option 194 | st.experimental_set_query_params(**app_state) 195 | 196 | # Side bar Configurations. 197 | configs = get_confs(option) 198 | conf_avail = len(configs) > 0 199 | conf_option = None 200 | if conf_avail: 201 | start = 0 202 | for i, conf in enumerate(configs): 203 | if conf.name == app_state.get("config", None): 204 | start = i 205 | conf_option = st.sidebar.selectbox( 206 | "Subset", configs, index=start, format_func=lambda a: a.name 207 | ) 208 | app_state["config"] = conf_option.name 209 | 210 | else: 211 | if "config" in app_state: 212 | del app_state["config"] 213 | st.experimental_set_query_params(**app_state) 214 | 215 | dts, fail = get(str(option), str(conf_option.name) if conf_option else None) 216 | 217 | # Main panel setup. 218 | if fail: 219 | st.markdown( 220 | "Dataset is too large to browse or requires manual download. Check it out in the datasets library! \n\n Size: " 221 | + str(dts.info.size_in_bytes) 222 | + "\n\n Instructions: " 223 | + str(dts.manual_download_instructions) 224 | ) 225 | else: 226 | 227 | k = list(dts.keys()) 228 | index = 0 229 | if "train" in dts.keys(): 230 | index = k.index("train") 231 | split = st.sidebar.selectbox("Split", k, index=index) 232 | 233 | d = dts[split] 234 | 235 | keys = list(d[0].keys()) 236 | 237 | st.header( 238 | "Dataset: " 239 | + option 240 | + " " 241 | + (("/ " + conf_option.name) if conf_option else "") 242 | ) 243 | 244 | st.markdown( 245 | "*Homepage*: " 246 | + d.info.homepage 247 | + "\n\n*Dataset*: https://huggingface.co/datasets/%s" 248 | % (option) 249 | ) 250 | 251 | md = """ 252 | %s 253 | """ % ( 254 | d.info.description.replace("\\", "") if option else "" 255 | ) 256 | st.markdown(md) 257 | 258 | step = 50 259 | offset = st.sidebar.number_input( 260 | "Offset (Size: %d)" % len(d), 261 | min_value=0, 262 | max_value=int(len(d)) - step, 263 | value=0, 264 | step=step, 265 | ) 266 | 267 | citation = st.sidebar.checkbox("Show Citations", False) 268 | table = not st.sidebar.checkbox("Show List View", False) 269 | show_features = st.sidebar.checkbox("Show Features", True) 270 | md = """ 271 | ``` 272 | %s 273 | ``` 274 | """ % ( 275 | d.info.citation.replace("\\", "").replace("}", " }").replace("{", "{ "), 276 | ) 277 | if citation: 278 | st.markdown(md) 279 | # st.text("Features:") 280 | if show_features: 281 | on_keys = st.multiselect("Features", keys, keys) 282 | st.write(render_features(d.features)) 283 | else: 284 | on_keys = keys 285 | if not table: 286 | # Full view. 287 | for item in range(offset, offset + step): 288 | st.text(" ") 289 | st.text(" ---- #" + str(item)) 290 | st.text(" ") 291 | # Use st to write out. 292 | for k in on_keys: 293 | v = d[item][k] 294 | st.subheader(k) 295 | if isinstance(v, str): 296 | out = v 297 | st.text(textwrap.fill(out, width=120)) 298 | elif ( 299 | isinstance(v, bool) 300 | or isinstance(v, int) 301 | or isinstance(v, float) 302 | ): 303 | st.text(v) 304 | else: 305 | st.write(v) 306 | 307 | else: 308 | # Table view. Use Pandas. 309 | df = [] 310 | for item in range(offset, offset + step): 311 | df_item = {} 312 | df_item["_number"] = item 313 | for k in on_keys: 314 | v = d[item][k] 315 | if isinstance(v, str): 316 | out = v 317 | df_item[k] = textwrap.fill(out, width=50) 318 | elif ( 319 | isinstance(v, bool) 320 | or isinstance(v, int) 321 | or isinstance(v, float) 322 | ): 323 | df_item[k] = v 324 | else: 325 | out = json.dumps(v, indent=2, sort_keys=True) 326 | df_item[k] = out 327 | df.append(df_item) 328 | df2 = df 329 | df = pd.DataFrame(df).set_index("_number") 330 | 331 | def hover(hover_color="#ffff99"): 332 | return dict( 333 | selector="tr:hover", 334 | props=[("background-color", "%s" % hover_color)], 335 | ) 336 | 337 | styles = [ 338 | hover(), 339 | dict( 340 | selector="th", 341 | props=[("font-size", "150%"), ("text-align", "center")], 342 | ), 343 | dict(selector="caption", props=[("caption-side", "bottom")]), 344 | ] 345 | 346 | # Table view. Use pands styling. 347 | style = df.style.set_properties( 348 | **{"text-align": "left", "white-space": "pre"} 349 | ).set_table_styles([dict(selector="th", props=[("text-align", "left")])]) 350 | style = style.set_table_styles(styles) 351 | st.table(style) 352 | 353 | # Additional dataset installation and sidebar properties. 354 | md = """ 355 | ### Code 356 | 357 | ```python 358 | !pip install datasets 359 | from datasets import load_dataset 360 | dataset = load_dataset( 361 | '%s'%s) 362 | ``` 363 | 364 | """ % ( 365 | option, 366 | (", '" + conf_option.name + "'") if conf_option else "", 367 | ) 368 | st.sidebar.markdown(md) 369 | -------------------------------------------------------------------------------- /viewer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/viewer.png --------------------------------------------------------------------------------