# App.py — Streamlit front-end for semantic search over Shulchan Aruch (Orach Chaim).
#
# NOTE(review): the top of this file (the import block and the get_df()
# decorator/signature) was garbled in the dump reviewed here. The lines marked
# "reconstructed" below are inferred from usage in the visible code
# (st.*, pipeline(...), datasets.load_from_disk, LOGGER) — confirm against the
# original file.

import datasets                          # reconstructed: used by get_df()
import streamlit as st                   # reconstructed: used throughout as `st`
from streamlit.logger import get_logger  # reconstructed: LOGGER is used in run()
from transformers import pipeline        # reconstructed: used by get_model()

LOGGER = get_logger(__name__)


@st.cache_data  # reconstructed decorator — parallels @st.cache_resource on get_model
def get_df() -> object:
    """Load the Orach Chaim dataset from disk and return its train split.

    Returns a pandas DataFrame whose 'seif' column has had all spaces
    removed so the values compare equal to the classifier's labels.
    """
    ds = datasets.load_from_disk('sivan22/orach-chaim')
    df = ds['train'].to_pandas()

    def clean(s) -> str:
        # Classifier labels contain no spaces; the raw seif values do.
        return s.replace(" ", "")

    df['seif'] = df['seif'].apply(clean)
    return df


@st.cache_resource
def get_model() -> object:
    """Build (and cache across reruns) the text-classification pipeline."""
    model = "sivan22/halacha-siman-seif-classifier-new"
    # top_k=None makes the pipeline return scores for every label,
    # so run() can slice off the first num_of_results predictions.
    classifier = pipeline("text-classification", model=model, top_k=None)
    return classifier


def get_predicts(classifier, input) -> str:
    """Run *classifier* on the user's query and return its raw predictions.

    NOTE(review): `input` shadows the builtin; the name is kept unchanged
    for backward compatibility with keyword callers.
    """
    predicts = classifier(input)
    return predicts


def run():
    """Render the page: query box, ranked search results, feedback widget."""
    st.set_page_config(
        page_title=" חיפוש חכם בשולחן ערוך",
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.write("# (אורח חיים) חיפוש חכם בשולחן ערוך")

    classifier = get_model()
    df = get_df()

    user_input = st.text_input('כתוב כאן את שאלתך', placeholder='כמה נרות מדליקים בכל לילה מלילות החנוכה')
    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:', 1, 25, 5)

    # Search either on the button press or as soon as text is entered,
    # but never with an empty query.
    if (st.button('חפש') or user_input) and user_input != "":
        predictions = get_predicts(classifier, user_input)[0][:num_of_results]
        for prediction in predictions:
            # A label looks like "<siman> <seif>"; split it once, not twice.
            parts = prediction['label'].split(' ')
            siman, seif = parts[0], parts[1]
            rows = df[((df["bookname"] == " שלחן ערוך - אורח חיים ") | (df["bookname"] == " משנה ברורה")) &
                      (df["siman"] == siman) &
                      (df["seif"] == seif)]
            # Descending bookname order puts Shulchan Aruch before Mishnah Berurah.
            rows = rows.sort_values(["bookname"], ascending=False)
            st.write(('סימן ' + siman + ' סעיף ' + seif), rows[['text', 'bookname', 'sek', 'seif', 'siman', ]])

        feedback_picker = st.sidebar.selectbox(
            "עזור לי להשתפר! מהי התוצאה הנכונה ביותר לדעתך?",
            [str(i + 1) + ') ' + p['label'] for i, p in enumerate(predictions)]
        )
        if st.sidebar.button("אישור"):
            LOGGER.info("TEXT: " + user_input + "\t" + "LABEL: " + feedback_picker)
            st.sidebar.write("תודה על המשוב!")


if __name__ == "__main__":
    run()
19 | 20 | ## גישה מבוססת סיווג 21 | החיפוש מתבצע באמצעות מודל שאומן למשימה של סיווג טקסט, הקטגוריות לסיווג הן הסימן והסעיף בו נמצא הנידון שבחיפוש. 22 | # installation 23 | להתקנה: 24 | 25 | 26 | git clone https://github.com/Sivan22/halacha-classification.git 27 | cd halacha-classification 28 | pip install -r requirements.txt 29 | streamlit run App.py 30 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | streamlit 3 | torch 4 | transformers 5 | datasets 6 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | pip install -r requirements.txt 2 | streamlit run App.py 3 | -------------------------------------------------------------------------------- /sivan22/orach-chaim/dataset_dict.json: -------------------------------------------------------------------------------- 1 | {"splits": ["train"]} -------------------------------------------------------------------------------- /sivan22/orach-chaim/train/data-00000-of-00001.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sivan22/halacha-classification/17474483de27d85b7c84b0b6af8c49f51a89172a/sivan22/orach-chaim/train/data-00000-of-00001.arrow -------------------------------------------------------------------------------- /sivan22/orach-chaim/train/dataset_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "builder_name": "parquet", 3 | "citation": "", 4 | "config_name": "default", 5 | "dataset_name": "orach-chaim", 6 | "dataset_size": 24569065, 7 | "description": "", 8 | "download_checksums": { 9 | "hf://datasets/sivan22/orach-chaim@a5608f75949d3ed42b147d852c38a3068c4cbd15/data/train-00000-of-00001.parquet": { 10 | "num_bytes": 10932463, 11 | "checksum": null 12 | } 13 | }, 14 | "download_size": 10932463, 15 | "features": { 16 | "Unnamed: 0": { 17 | "dtype": "int64", 18 | "_type": "Value" 19 | }, 20 | "bookname": { 21 | "dtype": "string", 22 | "_type": "Value" 23 | }, 24 | "siman": { 25 | "dtype": "string", 26 | "_type": "Value" 27 | }, 28 | "sek": { 29 | "dtype": "string", 30 | 
"_type": "Value" 31 | }, 32 | "text": { 33 | "dtype": "string", 34 | "_type": "Value" 35 | }, 36 | "seif": { 37 | "dtype": "string", 38 | "_type": "Value" 39 | }, 40 | "topic": { 41 | "dtype": "string", 42 | "_type": "Value" 43 | } 44 | }, 45 | "homepage": "", 46 | "license": "", 47 | "size_in_bytes": 35501528, 48 | "splits": { 49 | "train": { 50 | "name": "train", 51 | "num_bytes": 24569065, 52 | "num_examples": 42608, 53 | "dataset_name": "orach-chaim" 54 | } 55 | }, 56 | "version": { 57 | "version_str": "0.0.0", 58 | "major": 0, 59 | "minor": 0, 60 | "patch": 0 61 | } 62 | } -------------------------------------------------------------------------------- /sivan22/orach-chaim/train/state.json: -------------------------------------------------------------------------------- 1 | { 2 | "_data_files": [ 3 | { 4 | "filename": "data-00000-of-00001.arrow" 5 | } 6 | ], 7 | "_fingerprint": "5a46be759aaf0a40", 8 | "_format_columns": null, 9 | "_format_kwargs": {}, 10 | "_format_type": null, 11 | "_output_all_columns": false, 12 | "_split": "train" 13 | } -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# utils.py — helper used by Streamlit demo pages to show their own source.

import inspect
import textwrap

import streamlit as st


def show_code(demo):
    """Optionally display *demo*'s source code below the demo itself.

    A sidebar checkbox (checked by default) toggles the display. When it is
    on, the demo function's source is fetched via ``inspect`` and rendered
    as a code block, with the ``def`` line dropped and the body dedented.
    """
    # Renamed local from `show_code` so the flag no longer shadows this function.
    wants_code = st.sidebar.checkbox("Show code", True)
    if wants_code:
        st.markdown("## Code")
        sourcelines, _ = inspect.getsourcelines(demo)
        # Skip the signature line and strip the common leading indentation.
        st.code(textwrap.dedent("".join(sourcelines[1:])))