├── .gitignore
├── README.md
├── datasets_logo_name.jpg
├── datasets_logo_name.png
├── requirements.txt
├── run.py
└── viewer.png
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .venv
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hugging Face Datasets Viewer
2 |
3 | Viewer for the Hugging Face datasets library.
4 |
5 |
6 |
7 |
8 | ```
9 | streamlit run run.py
10 | ```
11 |
12 | or if you want to view local files, pass the path to your local datasets directory as an argument
13 |
14 | ```
15 | streamlit run run.py <path-to-datasets-directory>/
16 | ```
17 |
--------------------------------------------------------------------------------
/datasets_logo_name.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/datasets_logo_name.jpg
--------------------------------------------------------------------------------
/datasets_logo_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/datasets_logo_name.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | streamlit
3 | pandas>=1.2.4,<1.3
4 | pyyaml
5 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import numpy as np
3 | import pandas as pd
4 | import datasets
5 | from dataclasses import asdict
6 | import yaml
7 | import textwrap
8 | import tornado
9 | import json
10 | import time
11 | import sys
12 |
13 |
14 | MAX_SIZE = 40000000000
15 | if len(sys.argv) > 1:
16 | path_to_datasets = sys.argv[1]
17 | else:
18 | path_to_datasets = None
19 |
20 | ## Hack to extend the width of the main pane.
21 | def _max_width_():
22 | max_width_str = f"max-width: 1000px;"
23 | st.markdown(
24 | f"""
25 |
41 | """,
42 | unsafe_allow_html=True,
43 | )
44 |
45 |
46 | _max_width_()
47 |
48 |
def render_features(features):
    """Recursively convert a `datasets` feature tree into plain values
    suitable for display.

    Dicts are walked key by key; ClassLabel becomes its list of names,
    Value becomes its dtype string, Sequence is wrapped under an "[]"
    key, and any unrecognized feature type is returned unchanged.
    """
    if isinstance(features, dict):
        rendered = {}
        for name, sub_feature in features.items():
            rendered[name] = render_features(sub_feature)
        return rendered
    if isinstance(features, datasets.features.ClassLabel):
        return features.names
    if isinstance(features, datasets.features.Value):
        return features.dtype
    if isinstance(features, datasets.features.Sequence):
        return {"[]": render_features(features.feature)}
    # Fall through: leave unknown feature types as-is.
    return features
62 |
## Restore selection state from the URL query string.
## (Removed: a large block of commented-out progress-bar code and a
## redundant INITIAL_SELECTION = "" initializer.)
app_state = st.experimental_get_query_params()
start = True
loaded = True

# Query-parameter values arrive as lists; unwrap single-element lists so
# the rest of the app can treat them as plain strings.
app_state.setdefault("dataset", "glue")
if len(app_state.get("dataset", [])) == 1:
    app_state["dataset"] = app_state["dataset"][0]
INITIAL_SELECTION = app_state["dataset"]
if len(app_state.get("config", [])) == 1:
    app_state["config"] = app_state["config"][0]
print(INITIAL_SELECTION)
if start:
    ## Logo and sidebar decoration.
    # NOTE(review): the markdown bodies below appear empty / plain-text in
    # this copy — presumably HTML was stripped by the dump; confirm against
    # the original repository before relying on the exact string content.
    st.sidebar.markdown(
        """


""",
        unsafe_allow_html=True,
    )
    st.sidebar.image("datasets_logo_name.png", width=300)
    st.sidebar.markdown(
        "",
        unsafe_allow_html=True,
    )
    # Navigation links (Docs / Browse / Add Dataset).
    st.sidebar.markdown(
        """

Docs |
Browse
| Add Dataset
""",
        unsafe_allow_html=True,
    )
    # Spacer below the links.
    st.sidebar.subheader("")
116 |
117 | ## Interaction with the datasets libary.
118 | # @st.cache
119 | def get_confs(opt):
120 | "Get the list of confs for a dataset."
121 | if path_to_datasets is not None and opt is not None:
122 | path = path_to_datasets + opt
123 | else:
124 | path = opt
125 |
126 | module_path = datasets.load.prepare_module(path, dataset=True
127 | )
128 | # Get dataset builder class from the processing script
129 | builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
130 | # Instantiate the dataset builder
131 | confs = builder_cls.BUILDER_CONFIGS
132 | if confs and len(confs) > 1:
133 | return confs
134 | else:
135 | return []
136 |
137 | # @st.cache(allow_output_mutation=True)
138 | def get(opt, conf=None):
139 | "Get a dataset from name and conf"
140 | if path_to_datasets is not None:
141 | path = path_to_datasets + opt
142 | else:
143 | path = opt
144 |
145 | module_path = datasets.load.prepare_module(path, dataset=True)
146 | builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
147 | if conf:
148 | builder_instance = builder_cls(name=conf, cache_dir=path if path_to_datasets is not None else None)
149 | else:
150 | builder_instance = builder_cls(cache_dir=path if path_to_datasets is not None else None)
151 | fail = False
152 | if path_to_datasets is not None:
153 | dts = datasets.load_dataset(path,
154 | name=builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None,
155 | )
156 | dataset = dts
157 |
158 | elif (
159 | builder_instance.manual_download_instructions is None
160 | and builder_instance.info.size_in_bytes is not None
161 | and builder_instance.info.size_in_bytes < MAX_SIZE):
162 | builder_instance.download_and_prepare()
163 | dts = builder_instance.as_dataset()
164 | dataset = dts
165 | else:
166 | dataset = builder_instance
167 | fail = True
168 | return dataset, fail
169 |
170 | # Dataset select box.
171 | dataset_names = []
172 | selection = None
173 |
174 | import glob
175 | if path_to_datasets is None:
176 | list_of_datasets = datasets.list_datasets(with_community_datasets=False)
177 | else:
178 | list_of_datasets = sorted(glob.glob(path_to_datasets + "*"))
179 | print(list_of_datasets)
180 | for i, dataset in enumerate(list_of_datasets):
181 | dataset = dataset.split("/")[-1]
182 | if INITIAL_SELECTION and dataset == INITIAL_SELECTION:
183 | selection = i
184 | dataset_names.append(dataset )
185 |
186 | if selection is not None:
187 | option = st.sidebar.selectbox(
188 | "Dataset", dataset_names, index=selection, format_func=lambda a: a
189 | )
190 | else:
191 | option = st.sidebar.selectbox("Dataset", dataset_names, format_func=lambda a: a)
192 | print(option)
193 | app_state["dataset"] = option
194 | st.experimental_set_query_params(**app_state)
195 |
196 | # Side bar Configurations.
197 | configs = get_confs(option)
198 | conf_avail = len(configs) > 0
199 | conf_option = None
200 | if conf_avail:
201 | start = 0
202 | for i, conf in enumerate(configs):
203 | if conf.name == app_state.get("config", None):
204 | start = i
205 | conf_option = st.sidebar.selectbox(
206 | "Subset", configs, index=start, format_func=lambda a: a.name
207 | )
208 | app_state["config"] = conf_option.name
209 |
210 | else:
211 | if "config" in app_state:
212 | del app_state["config"]
213 | st.experimental_set_query_params(**app_state)
214 |
215 | dts, fail = get(str(option), str(conf_option.name) if conf_option else None)
216 |
217 | # Main panel setup.
218 | if fail:
219 | st.markdown(
220 | "Dataset is too large to browse or requires manual download. Check it out in the datasets library! \n\n Size: "
221 | + str(dts.info.size_in_bytes)
222 | + "\n\n Instructions: "
223 | + str(dts.manual_download_instructions)
224 | )
225 | else:
226 |
227 | k = list(dts.keys())
228 | index = 0
229 | if "train" in dts.keys():
230 | index = k.index("train")
231 | split = st.sidebar.selectbox("Split", k, index=index)
232 |
233 | d = dts[split]
234 |
235 | keys = list(d[0].keys())
236 |
237 | st.header(
238 | "Dataset: "
239 | + option
240 | + " "
241 | + (("/ " + conf_option.name) if conf_option else "")
242 | )
243 |
244 | st.markdown(
245 | "*Homepage*: "
246 | + d.info.homepage
247 | + "\n\n*Dataset*: https://huggingface.co/datasets/%s"
248 | % (option)
249 | )
250 |
251 | md = """
252 | %s
253 | """ % (
254 | d.info.description.replace("\\", "") if option else ""
255 | )
256 | st.markdown(md)
257 |
258 | step = 50
259 | offset = st.sidebar.number_input(
260 | "Offset (Size: %d)" % len(d),
261 | min_value=0,
262 | max_value=int(len(d)) - step,
263 | value=0,
264 | step=step,
265 | )
266 |
267 | citation = st.sidebar.checkbox("Show Citations", False)
268 | table = not st.sidebar.checkbox("Show List View", False)
269 | show_features = st.sidebar.checkbox("Show Features", True)
270 | md = """
271 | ```
272 | %s
273 | ```
274 | """ % (
275 | d.info.citation.replace("\\", "").replace("}", " }").replace("{", "{ "),
276 | )
277 | if citation:
278 | st.markdown(md)
279 | # st.text("Features:")
280 | if show_features:
281 | on_keys = st.multiselect("Features", keys, keys)
282 | st.write(render_features(d.features))
283 | else:
284 | on_keys = keys
285 | if not table:
286 | # Full view.
287 | for item in range(offset, offset + step):
288 | st.text(" ")
289 | st.text(" ---- #" + str(item))
290 | st.text(" ")
291 | # Use st to write out.
292 | for k in on_keys:
293 | v = d[item][k]
294 | st.subheader(k)
295 | if isinstance(v, str):
296 | out = v
297 | st.text(textwrap.fill(out, width=120))
298 | elif (
299 | isinstance(v, bool)
300 | or isinstance(v, int)
301 | or isinstance(v, float)
302 | ):
303 | st.text(v)
304 | else:
305 | st.write(v)
306 |
307 | else:
308 | # Table view. Use Pandas.
309 | df = []
310 | for item in range(offset, offset + step):
311 | df_item = {}
312 | df_item["_number"] = item
313 | for k in on_keys:
314 | v = d[item][k]
315 | if isinstance(v, str):
316 | out = v
317 | df_item[k] = textwrap.fill(out, width=50)
318 | elif (
319 | isinstance(v, bool)
320 | or isinstance(v, int)
321 | or isinstance(v, float)
322 | ):
323 | df_item[k] = v
324 | else:
325 | out = json.dumps(v, indent=2, sort_keys=True)
326 | df_item[k] = out
327 | df.append(df_item)
328 | df2 = df
329 | df = pd.DataFrame(df).set_index("_number")
330 |
331 | def hover(hover_color="#ffff99"):
332 | return dict(
333 | selector="tr:hover",
334 | props=[("background-color", "%s" % hover_color)],
335 | )
336 |
337 | styles = [
338 | hover(),
339 | dict(
340 | selector="th",
341 | props=[("font-size", "150%"), ("text-align", "center")],
342 | ),
343 | dict(selector="caption", props=[("caption-side", "bottom")]),
344 | ]
345 |
346 | # Table view. Use pands styling.
347 | style = df.style.set_properties(
348 | **{"text-align": "left", "white-space": "pre"}
349 | ).set_table_styles([dict(selector="th", props=[("text-align", "left")])])
350 | style = style.set_table_styles(styles)
351 | st.table(style)
352 |
353 | # Additional dataset installation and sidebar properties.
354 | md = """
355 | ### Code
356 |
357 | ```python
358 | !pip install datasets
359 | from datasets import load_dataset
360 | dataset = load_dataset(
361 | '%s'%s)
362 | ```
363 |
364 | """ % (
365 | option,
366 | (", '" + conf_option.name + "'") if conf_option else "",
367 | )
368 | st.sidebar.markdown(md)
369 |
--------------------------------------------------------------------------------
/viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/datasets-viewer/8efad8eae313a891f713469983bf4c744786f26e/viewer.png
--------------------------------------------------------------------------------