├── .gitignore
├── README.md
├── chatbot
│   ├── ChatBot.ipynb
│   └── Synthesis_Embedding.zip
├── random_forest
│   ├── model.ipynb
│   └── processed_data.csv
└── text_mining
    ├── 228paper_info.csv
    ├── 228paper_parsed.csv
    ├── ChatGPT_Paper_Reading.ipynb
    └── pdf_pool.csv
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChatGPT_Chemistry_Assistant 2 | ChatGPT Chemistry Assistant 3 | 4 | Please check out https://pubs.acs.org/doi/10.1021/jacs.3c05819 for more details. 5 | 6 | Step-by-step illustrations of setting up Processes 1, 2, and 3 are provided in the Supporting Information of this article in cookbook style. 7 | 8 | If you find this work helpful to your research, kindly consider citing the following: 9 | 10 | Zheng, Z.; Zhang, O.; Borgs, C.; Chayes, J. T.; Yaghi, O. M., ChatGPT Chemistry Assistant for Text Mining and Prediction of MOF Synthesis. J. Am. Chem. Soc. 2023. (DOI: 10.1021/jacs.3c05819) 11 | 12 | Thank you! 13 | 14 | **Contents** 15 | 16 | · Text Mining: PDF Text Processing and Analysis with OpenAI's _gpt-3.5-turbo_ API or _gpt-4_ API 17 | 18 | · MOF Chatbot: a chatbot that answers questions based on post-text-mining data 19 | 20 | · Predictive Model: a random forest (RF) classifier trained on post-text-mining data 21 | 22 | 23 | 24 | **Features** 25 | 26 | _This text mining assistant includes the following main functions:_ 27 | 28 | · Extraction of text from PDF files and its division into smaller chunks. 29 | 30 | · Classification of text segments. 31 | 32 | · Processing and summarization of the extracted text data. 33 | 34 | · Conversion of summarized data into a tabular format. 35 | 36 | · Calculation of text embeddings using the OpenAI API. 37 | 38 | · Selection of top-similarity sections and their neighbors in the data. 39 | 40 | · Calculation of text token count using the tiktoken library (see the example below).
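A minimal sketch of the token counting used throughout this repo (assuming `tiktoken` is installed; the notebooks use the `cl100k_base` encoding):

```python
import tiktoken

def count_tokens(text):
    """Return the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

print(count_tokens("ZrCl4 (48 mg) was dissolved in DMF (8 mL)."))
```

Token counts are used to split PDF text into chunks that fit within the model's context window.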
41 | 42 | 43 | 44 | 45 | 46 | _This MOF Synthesis Assistant tool provides the following core functionalities:_ 47 | 48 | · Extraction of synthesis information and embeddings from a CSV file. 49 | 50 | · Calculation of similarity scores. 51 | 52 | · Sorting of text segments based on their similarity scores. 53 | 54 | · Selection of top similar synthesis conditions from the sorted data. 55 | 56 | · Processing of multiple user questions to maintain a conversational context. 57 | 58 | · Use of the OpenAI API to generate text embeddings for the user's questions based on the selected synthesis conditions. 59 | 60 | · Maintenance of a conversation history for more contextually accurate responses in a conversational interface. 61 | 62 | · A user-friendly conversational interface for asking questions related to MOF synthesis conditions. 63 | 64 | _This machine learning tool includes the following primary functions:_ 65 | 66 | · Data Preprocessing: Reads, processes, and drops unused data columns from a CSV file. 67 | 68 | · Feature Selection: Applies RFECV for robust feature selection. 69 | 70 | · Data Splitting: Splits data into training and testing sets of various sizes. 71 | 72 | · Hyperparameter Tuning: Performs tuning via RandomizedSearchCV for RandomForestClassifier. 73 | 74 | · Model Evaluation: Computes several performance metrics for each model configuration. 75 | 76 | · Optimal Model Selection: Selects the best-performing model based on balanced accuracy. 77 | 78 | · Random Splits: Supports multiple random states for data splitting. 79 | 80 | · Reporting: Records all performance metrics in an organized format for model comparison. 81 | 82 | 83 | **Dependencies** 84 | 85 | This project is built on Python and requires the following libraries: 86 | 87 | 88 | _openai_ 89 | 90 | _requests_ 91 | 92 | _PyPDF2_ 93 | 94 | _pandas_ 95 | 96 | _tiktoken_ 97 | 98 | 99 | _sklearn_ 100 | 101 | _numpy_ 102 | 103 | _mendeleev_ 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /chatbot/ChatBot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "58ec05e6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import openai\n", 12 | "from sklearn.metrics.pairwise import cosine_similarity\n", 13 | "\n", 14 | "def add_similarity(df, given_embedding):\n", 15 | " def calculate_similarity(embedding):\n", 16 | " # Check if embedding is a string and convert it to a list of floats if necessary\n", 17 | " if isinstance(embedding, str):\n", 18 | " embedding = [float(x) for x in embedding.strip('[]').split(',')]\n", 19 | " return cosine_similarity([embedding], [given_embedding])[0][0]\n", 20 | "\n", 21 | " df['similarity'] = df['embedding'].apply(calculate_similarity)\n", 22 | " return df\n", 23 | "\n",
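"# Retrieval step: add_similarity scores every stored synthesis entry against\n", "# the question embedding; top_similar_entries then picks the highest-scoring\n", "# entries to serve as context for the chat model.\n", "\n",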
24 | "def top_similar_entries(df, x=3):\n", 25 | " \"\"\"\n", 26 | " Return the top x entries in the \"Synthesis Information\" column based on the highest similarity values.\n", 27 | "\n", 28 | " :param df: The DataFrame containing the \"similarity\" and \"Synthesis Information\" columns.\n", 29 | " :param x: The number of top entries to return. Default is 3.\n", 30 | " :return: A string containing the top x entries in the \"Synthesis Information\" column, separated by new lines.\n", 31 | " \"\"\"\n", 32 | " # Sort the DataFrame based on the \"similarity\" column in descending order\n", 33 | " sorted_df = df.sort_values(by=\"similarity\", ascending=False)\n", 34 | "\n", 35 | " # Get the top x entries from the \"Synthesis Information\" column\n", 36 | " top_x_entries = sorted_df[\"Synthesis Information\"].head(x).tolist()\n", 37 | "\n", 38 | " # Add a separator line with the MOF name if x is equal to or larger than 2\n", 39 | " if x >= 2:\n", 40 | " for i, entry in enumerate(top_x_entries):\n", 41 | " mof_name = entry.split(\"\\n\")[0].replace(\"MOF Name: \", \"\")\n", 42 | " separator = f\"--- SECTION {i + 1}: {mof_name} ---\"\n", 43 | " top_x_entries[i] = separator + \"\\n\" + entry\n", 44 | "\n", 45 | " # Join the entries together with new lines\n", 46 | " joined_entries = \"\\n\".join(top_x_entries)\n", 47 | "\n", 48 | " return joined_entries\n", 49 | "\n", 50 | "\n", 51 | "def chatbot(question, past_user_messages=None, initial_context=None):\n", 52 | " if past_user_messages is None:\n", 53 | " past_user_messages = []\n", 54 | "\n", 55 | " past_user_messages.append(question)\n", 56 | "\n", 57 | " file_name = \"Synthesis_Embedding.csv\" # synthesis information database with embeddings\n", 58 | " df_with_emb = pd.read_csv(file_name)\n", 59 | "\n", 60 | " if initial_context is None:\n", 61 | " # Find the context based on the first question\n", 62 | " first_question = past_user_messages[0]\n", 63 | " question_return = openai.Embedding.create(model=\"text-embedding-ada-002\", input=first_question)\n", 64 | " question_emb = question_return['data'][0]['embedding']\n", 65 | "\n", 66 | " df_with_emb_sim = add_similarity(df_with_emb, question_emb)\n", 67 | " num_paper = 3\n", 68 | " top_n_synthesis_str = top_similar_entries(df_with_emb_sim, num_paper)\n", 69 | "\n", 70 | " print(\"I have found the synthesis conditions and paper information below based on your first question:\")\n", 71 | " print(\"\\n\" + top_n_synthesis_str)\n", 72 | " initial_context = top_n_synthesis_str\n", 73 | "\n", 74 | " message_history = [\n", 75 | " {\n", 76 | " \"role\": \"system\",\n", 77 | " \"content\": \"You are a chemistry assistant that specifically handles questions related to MOF synthesis conditions based on the papers you have reviewed. Answer the question using the provided context. If the question is not relevant to the context or the MOF is not mentioned in the context, respond with 'Based on the information available from the MOF paper I have read so far, I cannot provide a reliable answer to this question.
Please revise your question.'\\n\\nContext:\\n\" + initial_context\n", 78 | " },\n", 79 | " ]\n", 80 | "\n", 81 | " for user_question in past_user_messages:\n", 82 | " message_history.append({\"role\": \"user\", \"content\": user_question})\n", 83 | "\n", 84 | " response = openai.ChatCompletion.create(\n", 85 | " model='gpt-3.5-turbo',\n", 86 | " #temperature=0.8,\n", 87 | " #max_tokens=2000,\n", 88 | " messages=message_history\n", 89 | " )\n", 90 | "\n", 91 | " answer = response.choices[0].message[\"content\"]\n", 92 | " return answer, initial_context, past_user_messages\n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "openai.api_key = \"Add Your OpenAI API KEY Here.\"\n", 98 | "\n", 99 | "# Example usage:\n", 100 | "first_question = \"What is the linker used to synthesize MOF-520?\"\n", 101 | "answer, initial_context, past_user_messages = chatbot(first_question)\n", 102 | "print(answer)\n", 103 | "\n", 104 | "follow_up_question = \"Well, so how to make this MOF?\"\n", 105 | "answer, _, past_user_messages = chatbot(follow_up_question, past_user_messages, initial_context)\n", 106 | "print(answer)" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3 (ipykernel)", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.9.12" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 5 131 | } 132 | -------------------------------------------------------------------------------- /chatbot/Synthesis_Embedding.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zach-zhiling-zheng/ChatGPT_Chemistry_Assistant/c7916e6e2fb722af8c7bc580fbd06af27b3d7f44/chatbot/Synthesis_Embedding.zip -------------------------------------------------------------------------------- /random_forest/model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn.model_selection import train_test_split, RandomizedSearchCV\n", 10 | "from sklearn.feature_selection import RFECV\n", 11 | "from sklearn.ensemble import RandomForestClassifier\n", 12 | "from sklearn.metrics import (f1_score, recall_score, precision_score, \n", 13 | " roc_auc_score, balanced_accuracy_score)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "from copy import deepcopy" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 21, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# preprocess data and drop unused columns\n", 34 | "\n", 35 | "data = pd.read_csv(\"processed_data.csv\")\n", 36 | "t_ys = data[\"result\"].values\n", 37 | "Xs = data.drop([\"result\"], axis=1).fillna(0.)\n", 38 | "nan_mask = np.all(data == 0, axis=0)\n", 39 | "t_Xs = Xs.drop(list(data.columns[nan_mask]), axis=1)\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 8, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# held-out set\n", 49 | "Xs, hXs, ys, hys = 
train_test_split(t_Xs,t_ys, train_size=0.8, random_state=49, stratify=t_ys)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# recursive feature elimination\n", 59 | "\n", 60 | "clf = RandomForestClassifier(criterion=\"entropy\", random_state=49, class_weight=\"balanced\")\n", 61 | "rfecv = RFECV(\n", 62 | " estimator=clf,\n", 63 | " cv=5,\n", 64 | " scoring=\"accuracy\",\n", 65 | " min_features_to_select=20,\n", 66 | ")\n", 67 | "rfecv.fit(Xs.values, ys)\n", 68 | "\n", 69 | "print(f\"Optimal number of features: {rfecv.n_features_}\")" 70 | ] 71 | },
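{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sketch: list which feature columns RFECV kept; the tuning\n", "# loop in the next cell selects exactly these columns via rfecv.support_.\n", "selected_features = Xs.columns[rfecv.support_]\n", "print(f\"{rfecv.n_features_} features retained\")\n", "print(list(selected_features))" ] },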
72 | { 73 | "cell_type": "code", 74 | "execution_count": 20, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "train_size = np.arange(0.4, 1, 0.1)\n", 79 | "X_REF = Xs.values[:, rfecv.support_]\n", 80 | "params = {\"n_estimators\": np.arange(50, 250, 50),\n", 81 | " \"min_samples_leaf\": np.arange(5, 30)}\n", 82 | "REF_ac = {\"train\": [], \"test\": [], \"held out\": [], \"f1\": [], \"p\": [], \"r\": [], \"auc\": []}\n", 83 | "prev = 0\n", 84 | "prev_test = 0\n", 85 | "\n", 86 | "np.random.seed(49)\n", 87 | "split_states = np.random.randint(100, size=10)\n", 88 | "\n", 89 | "for s in train_size:\n", 90 | " train_ac = np.zeros(10)\n", 91 | " test_ac = np.zeros(10)\n", 92 | " held_ac = np.zeros(10)\n", 93 | " test_F1 = np.zeros(20)\n", 94 | " test_p = np.zeros(20)\n", 95 | " test_r = np.zeros(20)\n", 96 | " test_auc = np.zeros(20)\n", 97 | " X_train, X_test, y_train, y_test = train_test_split(X_REF, ys, train_size=s, random_state=49) \n", 98 | " for i, si in enumerate(split_states):\n", 99 | " # hyperparameter tuning \n", 100 | " rf = RandomForestClassifier(criterion=\"entropy\", random_state=49, class_weight=\"balanced\")\n", 101 | " clf = RandomizedSearchCV(rf, params, n_iter=10, random_state=si)\n", 102 | " search = clf.fit(X_train, y_train)\n", 103 | " \n", 104 | " #print(search.best_params_)\n", 105 | " best_rf = clf.best_estimator_\n", 106 | " \n", 107 | " test_pred = best_rf.predict(X_test)\n", 108 | " htest_pred = best_rf.predict(hXs.values[:, rfecv.support_])\n", 109 | " train_pred = best_rf.predict(X_train)\n", 110 | " test_ac[i] = balanced_accuracy_score(y_test, test_pred)\n", 111 | " held_ac[i] = balanced_accuracy_score(hys, htest_pred)\n", 112 | " if test_ac[i] > prev_test:\n", 113 | " prev_test = test_ac[i]\n", 114 | " best_test_model = deepcopy(best_rf)\n", 115 | " if held_ac[i] > prev:\n", 116 | " prev = held_ac[i]\n", 117 | " best_model = deepcopy(best_rf)\n", 118 | "\n", 119 | " #print(f\"seed {i}, balanced accuracy {test_ac[i]}, accuracy {best_rf.score(X_test, y_test)}\")\n", 120 | " test_F1[2*i] = f1_score(y_test, test_pred)\n", 121 | " test_p[2*i] = precision_score(y_test, test_pred)\n", 122 | " test_r[2*i] = recall_score(y_test, test_pred)\n", 123 | " test_auc[2*i] = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])\n", 124 | " train_ac[i] = balanced_accuracy_score(y_train, train_pred)\n", 125 | "\n", 126 | " test_F1[2*i+1] = f1_score(hys, htest_pred)\n", 127 | " test_p[2*i+1] = precision_score(hys, htest_pred)\n", 128 | " test_r[2*i+1] = recall_score(hys, htest_pred)\n", 129 | " test_auc[2*i+1] = roc_auc_score(hys, best_rf.predict_proba(hXs.values[:, rfecv.support_])[:, 1])\n", 130 | " \n", 131 | " REF_ac[\"train\"] += [train_ac]\n", 132 | " REF_ac[\"test\"] += [test_ac]\n", 133 | " REF_ac[\"held out\"] += [held_ac]\n", 134 | " REF_ac[\"f1\"] += [test_F1[0::2], test_F1[1::2]]\n", 135 | " REF_ac[\"p\"] += [test_p[0::2], test_p[1::2]]\n", 136 | " REF_ac[\"r\"] += [test_r[0::2], test_r[1::2]]\n", 137 | " REF_ac[\"auc\"] += [test_auc[0::2], test_auc[1::2]]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "chem142", 149 | "language": "python", 150 | "name": "chem142" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.8.15" 163 | }, 164 | "orig_nbformat": 4 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /text_mining/ChatGPT_Paper_Reading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b562b141", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import openai\n", 11 | "import requests\n", 12 | "import PyPDF2\n", 13 | "import re\n", 14 | "import os\n", 15 | "import csv\n", 16 | "import pandas as pd\n", 17 | "import tiktoken\n", 18 | "import time\n", 19 | "from io import StringIO\n", 20 | "from sklearn.metrics.pairwise import cosine_similarity\n", 21 | "from sklearn.manifold import TSNE\n", 22 | "import numpy as np\n", 23 | "import ast\n", 24 | "\n", 25 | "def count_tokens(text):\n", 26 | " \"\"\"Returns the number of tokens in a text string.\"\"\"\n", 27 | " encoding = tiktoken.get_encoding(\"cl100k_base\")\n", 28 | " num_tokens = len(encoding.encode(text))\n", 29 | " return num_tokens\n", 30 | "\n", 31 | "def get_txt_from_pdf(pdf_files,filter_ref = False, combine=False):\n", 32 | " \"\"\"Convert pdf files to dataframe\"\"\"\n", 33 | " # Create an empty list to store the data\n", 34 | " data = []\n", 35 | " # Iterate over the PDF\n", 36 | " for pdf in pdf_files:\n", 37 | " # Fetch the PDF content from the pdf\n", 38 | " with open(pdf, 'rb') as pdf_content:\n", 39 | " # Create a PDF reader object\n", 40 | " pdf_reader = PyPDF2.PdfReader(pdf_content)\n", 41 | " # Iterate over all the pages in the PDF\n", 42 | " for page_num in range(len(pdf_reader.pages)):\n", 43 | " page = pdf_reader.pages[page_num] # Extract the text from the current page\n", 44 | " page_text = page.extract_text()\n", 45 | " words = page_text.split() # Split the page text into individual words\n", 46 | " page_text_join = ' '.join(words) # Join the words back together with a single space between each word\n", 47 | "\n", 48 | " if filter_ref: #filter the reference at the end\n", 49 | " page_text_join = remove_ref(page_text_join)\n", 50 | "\n", 51 | " page_len = len(page_text_join)\n", 52 | " div_len = page_len // 4 # Divide the page into 4 parts\n", 53 | " page_parts = [page_text_join[i*div_len:(i+1)*div_len] for i in range(4)]\n", 54 | " \n", 55 | " min_tokens = 40\n", 56 | " for i, page_part in enumerate(page_parts):\n", 57 | " if count_tokens(page_part) > min_tokens:\n", 58 | " # Append the data to the list\n", 59 | " data.append({\n", 60 | " 'file name': pdf,\n", 61 | " 'page number': page_num + 1,\n", 62 | " 'page section': i+1,\n", 63 | " 'content': page_part,\n", 64 | " 'tokens': count_tokens(page_part)\n", 65 | " })\n", 66 | " # Create a DataFrame from the data\n", 67 | " df = pd.DataFrame(data)\n", 68 | "
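# Optionally merge the per-page section rows into one row per PDF (see combine_section)\n", "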
if combine:\n", 69 | " df = combine_section(df)\n", 70 | " return df\n", 71 | "\n", 72 | "\n", 73 | "def remove_ref(pdf_text):\n", 74 | " \"\"\"This function removes reference section from a given PDF text. It uses regular expressions to find the index of the words to be filtered out.\"\"\"\n", 75 | " # Regular expression pattern for the words to be filtered out\n", 76 | " pattern = r'(REFERENCES|Acknowledgment|ACKNOWLEDGMENT)'\n", 77 | " match = re.search(pattern, pdf_text)\n", 78 | "\n", 79 | " if match:\n", 80 | " # If a match is found, remove everything after the match\n", 81 | " start_index = match.start()\n", 82 | " clean_text = pdf_text[:start_index].strip()\n", 83 | " else:\n", 84 | " # Define a list of regular expression patterns for references\n", 85 | " reference_patterns = [\n", 86 | " '\\[[\\d\\w]{1,3}\\].+?[\\d]{3,5}\\.','\\[[\\d\\w]{1,3}\\].+?[\\d]{3,5};','\\([\\d\\w]{1,3}\\).+?[\\d]{3,5}\\.','\\[[\\d\\w]{1,3}\\].+?[\\d]{3,5},',\n", 87 | " '\\([\\d\\w]{1,3}\\).+?[\\d]{3,5},','\\[[\\d\\w]{1,3}\\].+?[\\d]{3,5}','[\\d\\w]{1,3}\\).+?[\\d]{3,5}\\.','[\\d\\w]{1,3}\\).+?[\\d]{3,5}',\n", 88 | " '\\([\\d\\w]{1,3}\\).+?[\\d]{3,5}','^[\\w\\d,\\.– ;)-]+$',\n", 89 | " ]\n", 90 | "\n", 91 | " # Find and remove matches with the first eight patterns\n", 92 | " for pattern in reference_patterns[:8]:\n", 93 | " matches = re.findall(pattern, pdf_text, flags=re.S)\n", 94 | " pdf_text = re.sub(pattern, '', pdf_text) if len(matches) > 500 and matches.count('.') < 2 and matches.count(',') < 2 and not matches[-1].isdigit() else pdf_text\n", 95 | "\n", 96 | " # Split the text into lines\n", 97 | " lines = pdf_text.split('\\n')\n", 98 | "\n", 99 | " # Strip each line and remove matches with the last two patterns\n", 100 | " for i, line in enumerate(lines):\n", 101 | " lines[i] = line.strip()\n", 102 | " for pattern in reference_patterns[7:]:\n", 103 | " matches = re.findall(pattern, lines[i])\n", 104 | " lines[i] = re.sub(pattern, '', lines[i]) if len(matches) > 500 and len(re.findall('\\d', matches)) < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20 else lines[i]\n", 105 | "\n", 106 | " # Join the lines back together, excluding any empty lines\n", 107 | " clean_text = '\\n'.join([line for line in lines if line])\n", 108 | "\n", 109 | " return clean_text\n", 110 | "\n", 111 | " \n", 112 | "def combine_section(df):\n", 113 | " \"\"\"Merge sections, page numbers, add up content, and tokens based on the pdf name.\"\"\"\n", 114 | " aggregated_df = df.groupby('file name').agg({\n", 115 | " 'content': aggregate_content,\n", 116 | " 'tokens': aggregate_tokens\n", 117 | " }).reset_index()\n", 118 | "\n", 119 | " return aggregated_df\n", 120 | "\n", 121 | "\n", 122 | "def aggregate_content(series):\n", 123 | " \"\"\"Join all elements in the series with a space separator. \"\"\"\n", 124 | " return ' '.join(series)\n", 125 | "\n", 126 | "\n", 127 | "def aggregate_tokens(series):\n", 128 | " \"\"\"Sum all elements in the series.\"\"\"\n", 129 | " return series.sum()\n", 130 | "\n", 131 | "\n", 132 | "def extract_title(file_name):\n", 133 | " \"\"\"Extract the main part of the file name. 
\"\"\"\n", 134 | " title = file_name.split('_')[0]\n", 135 | " return title.rstrip('.pdf')\n", 136 | "\n", 137 | "\n", 138 | "def combine_main_SI(df):\n", 139 | " \"\"\"Create a new column with the main part of the file name, group the DataFrame by the new column, \n", 140 | " and aggregate the content and tokens.\"\"\"\n", 141 | " df['main_part'] = df['file name'].apply(extract_title)\n", 142 | " merged_df = df.groupby('main_part').agg({\n", 143 | " 'content': ''.join,\n", 144 | " 'tokens': sum\n", 145 | " }).reset_index()\n", 146 | "\n", 147 | " return merged_df.rename(columns={'main_part': 'file name'})\n", 148 | "\n", 149 | "\n", 150 | "def df_to_csv(df, file_name):\n", 151 | " \"\"\"Write a DataFrame to a CSV file.\"\"\"\n", 152 | " df.to_csv(file_name, index=False, escapechar='\\\\')\n", 153 | "\n", 154 | "\n", 155 | "def csv_to_df(file_name):\n", 156 | " \"\"\"Read a CSV file into a DataFrame.\"\"\"\n", 157 | " return pd.read_csv(file_name)\n", 158 | "\n", 159 | "\n", 160 | "\n", 161 | "def tabulate_condition(df,column_name):\n", 162 | " \"\"\"This function converts the text from a ChatGPT conversation into a DataFrame.\n", 163 | " It also cleans the DataFrame by dropping additional headers and empty lines. \"\"\"\n", 164 | " \n", 165 | " table_text = df[column_name].str.cat(sep='\\n')\n", 166 | "\n", 167 | " # Remove leading and trailing whitespace\n", 168 | " table_text = table_text.strip()\n", 169 | " \n", 170 | " # Split the table into rows\n", 171 | " rows = table_text.split('\\n')\n", 172 | "\n", 173 | " # Extract the header row and the divider row\n", 174 | " header_row, divider_row, *data_rows = rows\n", 175 | "\n", 176 | " # Extract column names from the header row\n", 177 | "\n", 178 | " column_names = ['compound name', 'metal source', 'metal amount', 'linker', 'linker amount',\n", 179 | " 'modulator', 'modulator amount or volume', 'solvent', 'solvent volume', 'reaction temperature',\n", 180 | " 'reaction time']\n", 181 | "\n", 182 | " # Create a list of dictionaries to store the table data\n", 183 | " data = []\n", 184 | "\n", 185 | " # Process each data row\n", 186 | " for row in data_rows:\n", 187 | "\n", 188 | " # Split the row into columns\n", 189 | " columns = [col.strip() for col in row.split('|') if col.strip()]\n", 190 | " \n", 191 | " # Create a dictionary to store the row data\n", 192 | " row_data = {col_name: col_value for col_name, col_value in zip(column_names, columns)}\n", 193 | " \n", 194 | " # Append the dictionary to the data list\n", 195 | " data.append(row_data)\n", 196 | " \n", 197 | " df = pd.DataFrame(data)\n", 198 | "\n", 199 | " \n", 200 | " \"\"\"Make df clean by drop additional header and empty lines \"\"\"\n", 201 | " def contains_pattern(s, patterns):\n", 202 | " return any(re.search(p, s) for p in patterns)\n", 203 | "\n", 204 | " def drop_rows_with_patterns(df, column_name):\n", 205 | " #empty cells, N/A cells and header cells\n", 206 | " patterns = [r'^\\s*$', r'--',r'-\\s-', r'compound', r'Compound',r'Compound name', r'Compound Name',\n", 207 | " r'NaN',r'N/A',r'n/a',r'\\nN/A', r'note', r'Note']\n", 208 | " \n", 209 | " mask = df[column_name].apply(lambda x: not contains_pattern(str(x), patterns))\n", 210 | " filtered_df = df[mask]\n", 211 | " \n", 212 | " return filtered_df\n", 213 | " \n", 214 | " \n", 215 | " #drop the repeated header\n", 216 | " df = drop_rows_with_patterns(df, 'compound name')\n", 217 | " \n", 218 | " #drop the organic synthesis (where the metal source is N/a) \n", 219 | " filtered_df = 
drop_rows_with_patterns(drop_rows_with_patterns(drop_rows_with_patterns(df,'metal source'),'metal amount'),'linker amount') \n", 220 | "\n", 221 | " #drop the N/A rows\n", 222 | " filtered_df = filtered_df.dropna(subset=['metal source','metal amount', 'linker amount'])\n", 223 | "\n", 224 | " return filtered_df\n", 225 | "\n", 226 | "\n", 227 | "\n", 228 | "def split_content(input_string, tokens):\n", 229 | " \"\"\"Splits a string into chunks based on a maximum token count. \"\"\"\n", 230 | "\n", 231 | " MAX_TOKENS = tokens\n", 232 | " split_strings = []\n", 233 | " current_string = \"\"\n", 234 | " tokens_so_far = 0\n", 235 | "\n", 236 | " for word in input_string.split():\n", 237 | " # Check if adding the next word would exceed the max token limit\n", 238 | " if tokens_so_far + count_tokens(word) > MAX_TOKENS:\n", 239 | " # If we've reached the max tokens, look for the last dot or newline in the current string\n", 240 | " last_dot = current_string.rfind(\".\")\n", 241 | " last_newline = current_string.rfind(\"\\n\")\n", 242 | "\n", 243 | " # Find the index to cut the current string\n", 244 | " cut_index = max(last_dot, last_newline)\n", 245 | "\n", 246 | " # If there's no dot or newline, we'll just cut at the max tokens\n", 247 | " if cut_index == -1:\n", 248 | " cut_index = MAX_TOKENS\n", 249 | "\n", 250 | " # Add the substring to the result list and reset the current string and tokens_so_far\n", 251 | " split_strings.append(current_string[:cut_index + 1].strip())\n", 252 | " current_string = current_string[cut_index + 1:].strip()\n", 253 | " tokens_so_far = count_tokens(current_string)\n", 254 | "\n", 255 | " # Add the current word to the current string and update the token count\n", 256 | " current_string += \" \" + word\n", 257 | " tokens_so_far += count_tokens(word)\n", 258 | "\n", 259 | " # Add the remaining current string to the result list\n", 260 | " split_strings.append(current_string.strip())\n", 261 | "\n", 262 | " return split_strings\n", 263 | "\n", 264 | "\n", 265 | "def table_text_clean(text):\n", 266 | " \"\"\"Cleans the table string and splits it into lines.\"\"\"\n", 267 | "\n", 268 | " # Pattern to find table starts\n", 269 | " pattern = r\"\\|\\s*compound\\s*.*\"\n", 270 | "\n", 271 | " # Use re.finditer() to find all instances of the pattern in the string and their starting indexes\n", 272 | " matches = [match.start() for match in re.finditer(pattern, text, flags=re.IGNORECASE)]\n", 273 | "\n", 274 | " # Count the number of matches\n", 275 | " num_matches = len(matches)\n", 276 | "\n", 277 | " # Base table string\n", 278 | " table_string = \"\"\"| compound name | metal source | metal amount | linker | linker amount | modulator | modulator amount or volume | solvent | solvent volume | reaction temperature | reaction time |\\n|---------------|-------|--------------|--------|---------------|-----------|---------------------------|---------|----------------|---------------------|---------------|\\n\"\"\"\n", 279 | "\n", 280 | " if num_matches == 0: # No table in the answer\n", 281 | " print(\"No table found in the text: \" + text)\n", 282 | " splited_text = ''\n", 283 | "\n", 284 | " else: # Split the text based on header\n", 285 | " splited_text = ''\n", 286 | " for i in range(num_matches):\n", 287 | " # Get the relevant table slice\n", 288 | " splited = text[matches[i]:matches[i + 1]] if i != (num_matches - 1) else text[matches[i]:]\n", 289 | "\n", 290 | " # Remove the text after last '|'\n", 291 | " last_pipe_index = splited.rfind('|')\n", 292 | " splited = 
splited[:last_pipe_index + 1]\n", 293 | "\n", 294 | " # Remove the header and \\------\\\n", 295 | " pattern_dash = r\"-(\\s*)\\|\"\n", 296 | " match = max(re.finditer(pattern_dash, splited), default=None, key=lambda x: x.start())\n", 297 | "\n", 298 | " if not match:\n", 299 | " print(\"'-|' pattern not found.\")\n", 300 | " else:\n", 301 | " first_pipe_index = match.start()\n", 302 | " splited = '\\n' + splited[(first_pipe_index + len('-|\\n|') - 1):] # Start from \"\\\"\n", 303 | "\n", 304 | " splited_text += splited\n", 305 | "\n", 306 | " table_string = table_string + splited_text\n", 307 | " return table_string\n", 308 | "\n", 309 | "def add_similarity(df, given_embedding):\n", 310 | " \"\"\"Adds a 'similarity' column to a dataframe based on cosine similarity with a given embedding.\"\"\"\n", 311 | " def calculate_similarity(embedding):\n", 312 | " # Check if embedding is a string and convert it to a list of floats if necessary\n", 313 | " if isinstance(embedding, str):\n", 314 | " embedding = [float(x) for x in embedding.strip('[]').split(',')]\n", 315 | " return cosine_similarity([embedding], [given_embedding])[0][0]\n", 316 | "\n", 317 | " df['similarity'] = df['embedding'].apply(calculate_similarity)\n", 318 | " return df\n", 319 | "\n", 320 | "\n", 321 | "def select_top_neighbors(df):\n", 322 | " \"\"\"Retains top-10 similarity sections and their neighbors in the dataframe and drops the rest.\"\"\"\n", 323 | " # Sort dataframe by 'file name' and 'similarity' in descending order\n", 324 | " df.sort_values(['file name', 'similarity'], ascending=[True, False], inplace=True)\n", 325 | " \n", 326 | " # Group dataframe by 'file name' and select the top 10 rows based on similarity\n", 327 | " top_10 = df.groupby('file name').head(10)\n", 328 | " \n", 329 | " # Add neighboring rows (one above and one below) to the selection\n", 330 | " neighbors = [i for index in top_10.index for i in (index - 1, index + 1) if 0 <= i < df.shape[0]]\n", 331 | "\n", 332 | " # Create a new dataframe with only the selected rows\n", 333 | " selected_df = df.loc[top_10.index.union(neighbors)]\n", 334 | " return selected_df\n", 335 | "\n", 336 | "\n", 337 | "def add_emb(df):\n", 338 | " \"\"\"Adds an 'embedding' column to a dataframe using OpenAI API.\"\"\"\n", 339 | " openai.api_key = api_key\n", 340 | " if 'embedding' in df.columns:\n", 341 | " print('The dataframe already has embeddings. 
Please double check.')\n", 342 | " return df\n", 343 | "\n", 344 | " embed_msgs = []\n", 345 | " for _, row in df.iterrows():\n", 346 | " context = row['content']\n", 347 | " context_emb = openai.Embedding.create(model=\"text-embedding-ada-002\", input=context)\n", 348 | " embed_msgs.append(context_emb['data'][0]['embedding'])\n", 349 | "\n", 350 | " df = df.copy()\n", 351 | " df.loc[:, 'embedding'] = embed_msgs\n", 352 | " \n", 353 | " return df\n", 354 | "\n", 355 | " \n", 356 | "\n", 357 | "def model_1(df):\n", 358 | " \"\"\"Model 1 will turn text in dataframe to a summarized reaction condition table.The dataframe should have a column \"file name\" and a column \"exp content\".\"\"\"\n", 359 | " response_msgs = []\n", 360 | "\n", 361 | " for index, row in df.iterrows():\n", 362 | " column1_value = row[df.columns[0]]\n", 363 | " column2_value = row['content']\n", 364 | "\n", 365 | " max_tokens = 3000\n", 366 | " if count_tokens(column2_value) > max_tokens:\n", 367 | " context_list = split_content(column2_value, max_tokens)\n", 368 | " else:\n", 369 | " context_list = [column2_value]\n", 370 | "\n", 371 | " answers = '' # Collect answers from chatGPT\n", 372 | " for context in context_list:\n", 373 | " print(\"Start to analyze paper \" + str(column1_value) )\n", 374 | " user_heading = f\"This is an experimental section on MOF synthesis from paper {column1_value}\\n\\nContext:\\n{context}\"\n", 375 | " user_ending = \"\"\"Q: Can you summarize the following details in a table: \n", 376 | " compound name or chemical formula (if the name is not provided), metal source, metal amount, organic linker(s), \n", 377 | " linker amount, modulator, modulator amount or volume, solvent(s), solvent volume(s), reaction temperature, \n", 378 | " and reaction time? If any information is not provided or you are unsure, use \"N/A\". \n", 379 | " Please focus on extracting experimental conditions from only the MOF synthesis and ignore information related to organic linker synthesis, \n", 380 | " MOF postsynthetic modification, high throughput (HT) experiment details or catalysis reactions. \n", 381 | " If multiple conditions are provided for the same compound, use multiple rows to represent them. If multiple units or components are provided for the same factor (e.g. 
g and mol for the weight, multiple linker or metals, multiple temperature and reaction time, mixed solvents, etc), include them in the same cell and separate by comma.\n", 382 | " The table should have 11 columns, all in lowercase:\n", 383 | " | compound name | metal source | metal amount | linker | linker amount | modulator | modulator amount or volume | solvent | solvent volume | reaction temperature | reaction time |\n", 384 | "\n", 385 | " A:\"\"\" \n", 386 | "\n", 387 | " attempts = 3\n", 388 | " while attempts > 0:\n", 389 | " try:\n", 390 | " response = openai.ChatCompletion.create(\n", 391 | " model='gpt-3.5-turbo',\n", 392 | " messages=[{\n", 393 | " \"role\": \"system\",\n", 394 | " \"content\": \"\"\"Answer the question as truthfully as possible using the provided context,\n", 395 | " and if the answer is not contained within the text below, say \"N/A\" \"\"\"\n", 396 | " },\n", 397 | " {\"role\": \"user\", \"content\": user_heading + user_ending}]\n", 398 | " )\n", 399 | " answer_str = response.choices[0].message.content\n", 400 | " if not answer_str.lower().startswith(\"n/a\"):\n", 401 | " answers += '\\n' + answer_str\n", 402 | " break\n", 403 | " except Exception as e:\n", 404 | " attempts -= 1\n", 405 | " if attempts <= 0:\n", 406 | " print(f\"Error: Failed to process paper {column1_value}. Skipping. (model 1)\")\n", 407 | " break\n", 408 | " print(f\"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)\")\n", 409 | " time.sleep(60)\n", 410 | "\n", 411 | " response_msgs.append(answers)\n", 412 | " df = df.copy()\n", 413 | " df.loc[:, 'summarized'] = response_msgs\n", 414 | " return df\n", 415 | "\n", 416 | "\n", 417 | "def model_2(df):\n", 418 | " \"\"\"Model 2 has two parts. First, it asks ChatGPT to identify the experiment section,\n", 419 | " then it combines the results\"\"\"\n", 420 | " openai.api_key = api_key\n", 421 | "\n", 422 | " response_msgs = []\n", 423 | " \n", 424 | " prev_paper_name = None # Initialize the variable. For message printing purpose\n", 425 | " total_pages = df.groupby(df.columns[0])[df.columns[1]].max() # For message printing purpose\n", 426 | " \n", 427 | " for _, row in df.iterrows():\n", 428 | " paper_name = row[df.columns[0]]\n", 429 | " page_number = row[df.columns[1]]\n", 430 | " # Only print the message when the paper name changes\n", 431 | " if paper_name != prev_paper_name:\n", 432 | " print(f'Processing paper: {paper_name}. Total pages: {total_pages[paper_name]}')\n", 433 | " prev_paper_name = paper_name\n", 434 | "\n", 435 | " context = row['content']\n", 436 | "\n", 437 | " user_msg1 = \"\"\"\n", 438 | " Context:\n", 439 | " In a 4-mL scintillation vial, the linker H2PZVDC (91.0 mg, 0.5 mmol, 1 equiv.) was dissolved in N,N-dimethylformamide (DMF) (0.6 mL) upon sonication. An aqueous solution of AlCl3·6H2O (2.4 mL, 0.2 M, 1 equiv.) was added dropwise, and the resulting mixture was heated in a 120 °C oven for 24 hours.\n", 440 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 441 | " Answer: Yes.\n", 442 | "\n", 443 | " Context:\n", 444 | " These metal salt mixtures were combined with 18 mg of H4DOT and then dissolved in a mixture of DMF (6 mL), EtOH (0.36 mL), and water (0.36 mL). The reaction mixture was heated to specific temperatures (120°C or 85°C) for 24 h. 
\n", 445 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 446 | " Answer: Yes.\n", 447 | "\n", 448 | " Context:\n", 449 | " Synthesis of MOF-5.19 Zn(NO3)2?4H2O (31.824 mmol) and H2bdc (10.594 mmol) were dissolved in DEF (100 cm3). The solution was heated to 100 C.\n", 450 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 451 | " Answer: Yes.\n", 452 | "\n", 453 | " Context:\n", 454 | " [Zr6O4(OH)8(H2O)4(CTTA)8/3]·S (BUT-12·S). ZrCl4 (48 mg), H3CTTA (40 mg), and formic acid (8 mL) were ultrasonically dissolved in N,N′-dimethylformamide (DMF, 8 mL) in a 20 mL Pyrexvial.\n", 455 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 456 | " Answer: Yes.\n", 457 | "\n", 458 | " Context:\n", 459 | " A 0.150 M solution of imidazole in DMF and a 0.075M solution of Zn(NO3)2·4H2O in DMF were used as stock solutions, and heated in a 85 ºC isothermal oven for 3 days. \n", 460 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 461 | " Answer: Yes.\n", 462 | " \n", 463 | " Context:\n", 464 | " Synthesis and Characterization of MOFs, Abbreviations, and General Procedures. For easy reference, the formulas for MOF-69-80, explanation of guest abbreviations and organic carboxylates, andcrystal unit cell parameters are listed in Table 2. Unless otherwiseindicated, chemicals were purchased from the Aldrich Chemical Co.and used as received. HPDC and ATC organic linkers were synthesizedaccording to published procedures.13 HPDC was protected, dehydro\u0002genated, and then deprotected to yield PDC.14\n", 465 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 466 | " Answer: No.\n", 467 | "\n", 468 | " Context:\n", 469 | " The design and synthesis of metal-organic frameworks (MOFs) has yielded a large number of structures which have been shown to have useful gas and liquid adsorption properties.1In particular, porous structures constructed from discrete metal\u0002carboxylate clusters and organic links have been shown to beamenable to systematic variation in pore size and functionality.\n", 470 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 471 | " Answer: No.\n", 472 | " \n", 473 | " Context:\n", 474 | " Solvothermal reactions of Co(NO3)·6H2O, Hatz, and L1/L2 in a 2:2:1 molar ratio in DMF solvent at 180 °C for 24 h yielded\n", 475 | " two crystalline products, 1 and 2, respectively\n", 476 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 477 | " Answer: No.\n", 478 | " \n", 479 | " Context:\n", 480 | " A 22.9% weight loss was observed from 115 to 350 °C, which corresponds to the loss of one DEF molecule per formula unit (calcd: 23.5%).\n", 481 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 482 | " Answer: No.\n", 483 | " \n", 484 | " Context:\n", 485 | " \"\"\"\n", 486 | " \n", 487 | " user_msg2 = \"\"\"\n", 488 | " Question: Does the section contain a comprehensive MOF synthesis with explicit reactant quantities or solvent volumes?\n", 489 | " Answer:\n", 490 | " \"\"\"\n", 491 | "\n", 492 | " attempts = 
3\n", 493 | " while attempts > 0:\n", 494 | " try:\n", 495 | " response = openai.ChatCompletion.create(\n", 496 | " model='gpt-3.5-turbo',\n", 497 | " messages=[\n", 498 | " {\"role\": \"system\", \"content\": \"Determine if the section comes from an experimental section for MOF synthesis, which contains information on at least one of the following: reaction time, reaction temperature, metal source, organic linker, the amount, solvent and volume. Answer will be either Yes or No.\"},\n", 499 | " {\"role\": \"user\", \"content\": user_msg1 + context + user_msg2}\n", 500 | " ]\n", 501 | " )\n", 502 | " answers = response.choices[0].message.content\n", 503 | " break\n", 504 | "\n", 505 | " except Exception as e:\n", 506 | " attempts -= 1\n", 507 | " if attempts > 0:\n", 508 | " print(f\"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 2)\")\n", 509 | " time.sleep(60)\n", 510 | " else:\n", 511 | " print(f\"Error: Failed to process paper {paper_name}. Skipping. (model 2)\")\n", 512 | " answers = \"No\"\n", 513 | " break\n", 514 | "\n", 515 | " response_msgs.append(answers)\n", 516 | " df = df.copy()\n", 517 | " df.loc[:,'classification'] = response_msgs\n", 518 | "\n", 519 | "\n", 520 | " # The following section creates a new dataframe after applying some transformations to the old dataframe\n", 521 | " # Create a boolean mask for rows where 'results' starts with 'No'\n", 522 | " mask_no = df[\"classification\"].str.startswith(\"No\")\n", 523 | " # Create a boolean mask for rows where both the row above and below have 'No' in the 'results' column\n", 524 | " mask_surrounded_by_no = mask_no.shift(1, fill_value=False) & mask_no.shift(-1, fill_value=False)\n", 525 | " # Combine the two masks with an AND operation\n", 526 | " mask_to_remove = mask_no & mask_surrounded_by_no\n", 527 | " # Invert the mask and filter the DataFrame\n", 528 | " filtered_df = df[~mask_to_remove]\n", 529 | " #combined\n", 530 | " combined_df= combine_main_SI(combine_section(filtered_df ))\n", 531 | " #call model 1 to summarized results\n", 532 | " add_table_df = model_1(combined_df)\n", 533 | " return add_table_df \n", 534 | "\n", 535 | "\n", 536 | "def model_3(df, prompt_choice=\"synthesis\", classfication = True):\n", 537 | " \"\"\"Input a dataframe in broken separation, ~300 tokens, separated by pages and sections. This function will filter the unnecessary sections.\"\"\"\n", 538 | "\n", 539 | " # Set up your API key\n", 540 | " openai.api_key = api_key\n", 541 | "\n", 542 | " # Define the prompt\n", 543 | " prompts = {\n", 544 | " \"synthesis\": \"Provide a detailed description of the experimental section or synthesis method used in this research. This section should cover essential information such as the compound name (e.g., MOF-5, ZIF-1, Cu(Bpdc), compound 1, etc.), metal source (e.g., ZrCl4, CuCl2, AlCl3, zinc nitrate, iron acetate, etc.), organic linker (e.g., terephthalate acid, H2BDC, H2PZDC, H4Por, etc.), amount (e.g., 25mg, 1.02g, 100mmol, 0.2mol, etc.), solvent (e.g., N,N Dimethylformamide, DMF, DCM, DEF, NMP, water, EtOH, etc.), solvent volume (e.g., 12mL, 100mL, 1L, 0.1mL, etc.), reaction temperature (e.g., 120°C, 293K, 100C, room temperature, reflux, etc.), and reaction time (e.g., 120h, 1 day, 1d, 1h, 0.5h, 30min, a week, etc.).\",\n", 545 | " \"TGA\": \"\"\"Identify the section discussing thermogravimetric analysis (TGA) and thermal stability. 
This section typically includes information about weight-loss steps (e.g., 20%, 30%, 29.5%) and a decomposition temperature range (e.g., 450°C, 515°C) or a plateau.\"\"\",\n", 546 | " \"sorption\": \"Identify the section discussing nitrogen (N2) sorption, argon sorption, Brunauer-Emmett-Teller (BET) surface area, Langmuir surface area, and porosity. This section typically reports values such as 1000 m2/g, 100 cm3/g STP, and includes pore diameter or pore size expressed in units of Ångströms (Å)\"\n", 547 | " }\n", 548 | " \n", 549 | " # Other than \"synthesis\", \"TGA\", or \"sorption\", the prompt choice can be the name of the linker to be searched for.\n", 550 | " # If the choice is not one of the predefined ones (\"synthesis\", \"TGA\", \"sorption\"), it defaults to a generic prompt for the linker.\n", 551 | " prompt = prompts.get(prompt_choice, f\"Provide the full name of linker ({prompt_choice}) or denoted as {prompt_choice} in chemicals, abstract, introduction or experimental section.\")\n", 552 | " \n", 553 | " # Create an embedding for the chosen prompt using OpenAI's embedding model\n", 554 | " prompt_result = openai.Embedding.create(model=\"text-embedding-ada-002\", input=prompt)\n", 555 | " # Extract the embedding data from the result\n", 556 | " prompt_emb = prompt_result['data'][0]['embedding']\n", 557 | "\n", 558 | " # If the dataframe does not already have an 'embedding' column, add one. This is done by calling the add_emb function on the dataframe\n", 559 | " if 'embedding' not in df.columns:\n", 560 | " df_with_emb = add_emb(df)\n", 561 | " else:\n", 562 | " df_with_emb = df\n", 563 | "\n", 564 | " # Add a 'similarity' column to the dataframe by comparing the embeddings. This is done by calling the add_similarity function on the dataframe and the prompt embedding\n", 565 | " df_2 = add_similarity(df_with_emb, prompt_emb)\n", 566 | "\n", 567 | " # Filter the dataframe to only include rows with top similarity and their neighbors\n", 568 | " df_3 = select_top_neighbors(df_2)\n", 569 | "\n", 570 | " # If the classification parameter is True, pass the dataframe to model_2 for further processing\n", 571 | " if classfication:\n", 572 | " return model_2(df_3)\n", 573 | "\n", 574 | " # If the classification parameter is False, return the filtered dataframe as is\n", 575 | " return df_3\n", 576 | "\n", 577 | "\n", 578 | "\n", 579 | "def load_paper(filename):\n", 580 | " \"\"\"Create a dataframe\"\"\"\n", 581 | " if os.path.exists(filename):\n", 582 | " dataframe = pd.read_csv(filename)\n", 583 | " return dataframe\n", 584 | " else:\n", 585 | " #load pdf names\n", 586 | " \n", 587 | " with open('pdf_pool.csv', 'r') as file:\n", 588 | " reader = csv.reader(file)\n", 589 | " pdf_pool = [row[0] for row in reader]\n", 590 | " dataframe = get_txt_from_pdf(pdf_pool,combine = False, filter_ref = True)\n", 591 | " \n", 592 | " #store the dataframe\n", 593 | " df_to_csv(dataframe, filename)\n", 594 | " return dataframe\n", 595 | " \n", 596 | "def load_paper_emb(filename):\n", 597 | " \"\"\"Create a dataframe that includes embedding information\"\"\"\n", 598 | " if os.path.exists(filename):\n", 599 | " paper_df_emb = pd.read_csv(filename)\n", 600 | " paper_df_emb['embedding'] = paper_df_emb['embedding'].apply(ast.literal_eval)\n", 601 | " \n", 602 | " else: #load paper and create embedding\n", 603 | " paper_df_emb = add_emb(load_paper(\"228paper_parsed.csv\"))\n", 604 | " #store embedding to csv\n", 605 | " df_to_csv(paper_df_emb, filename)\n", 606 | " \n", 607 | " return paper_df_emb\n", 608 | "\n", 609 | "\n", 610 | "def
check_system(syn_df, paper_df, paper_df_emb):\n", 611 | " \"\"\"Check if the data is correctly loaded\"\"\"\n", 612 | " # check if openai.api_key is not placeholder\n", 613 | " if openai.api_key == \"Add Your OpenAI API KEY Here.\":\n", 614 | " print(\"Error: Please replace openai.api_key with your actual key.\")\n", 615 | " return False\n", 616 | "\n", 617 | " # check if 'content' column exists in syn_df\n", 618 | " if 'content' not in syn_df.columns:\n", 619 | " print(\"Error: 'content' column is missing in syn_df.\")\n", 620 | " return False\n", 621 | "\n", 622 | " # check if 'paper_df' has at least four columns\n", 623 | " expected_columns = ['file name', 'page number', 'page section', 'content']\n", 624 | " if not all(col in paper_df.columns for col in expected_columns):\n", 625 | " print(\"Error: 'paper_df' should have these columns: 'file name', 'page number', 'page section', 'content'.\")\n", 626 | " return False\n", 627 | "\n", 628 | " # check if 'embedding' column exists in paper_df_emb\n", 629 | " if 'embedding' not in paper_df_emb.columns:\n", 630 | " print(\"Error: 'embedding' column is missing in paper_df_emb.\")\n", 631 | " return False\n", 632 | "\n", 633 | " print(\"All checks passed.\")\n", 634 | " return True\n", 635 | "\n", 636 | "\n", 637 | "\n", 638 | "#Load all dataframes\n", 639 | "openai.api_key = \"Add Your OpenAI API KEY Here.\" #e.g. openai.api_key = \"abcdefg123abc\" \n", 640 | "syn_df = pd.read_csv(\"228paper_info.csv\")\n", 641 | "paper_df=load_paper(\"228paper_parsed.csv\")\n", 642 | "paper_df_emb = load_paper_emb(\"228paper_emb.csv\")\n", 643 | "check_system(syn_df, paper_df, paper_df_emb)\n", 644 | "\n", 645 | "#Run for Model 1\n", 646 | "model_1_table = tabulate_condition(model_1(syn_df),\"summarized\")\n", 647 | "\n", 648 | "#Run for Model 2\n", 649 | "model_2_table = tabulate_condition(model_2(paper_df),\"summarized\")\n", 650 | "\n", 651 | "#Run for Model 3\n", 652 | "model_3_table_2 = tabulate_condition( model_3(paper_df_emb),\"summarized\")" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "id": "968f0b6c", 659 | "metadata": {}, 660 | "outputs": [], 661 | "source": [] 662 | } 663 | ], 664 | "metadata": { 665 | "kernelspec": { 666 | "display_name": "Python 3 (ipykernel)", 667 | "language": "python", 668 | "name": "python3" 669 | }, 670 | "language_info": { 671 | "codemirror_mode": { 672 | "name": "ipython", 673 | "version": 3 674 | }, 675 | "file_extension": ".py", 676 | "mimetype": "text/x-python", 677 | "name": "python", 678 | "nbconvert_exporter": "python", 679 | "pygments_lexer": "ipython3", 680 | "version": "3.9.12" 681 | } 682 | }, 683 | "nbformat": 4, 684 | "nbformat_minor": 5 685 | } 686 | -------------------------------------------------------------------------------- /text_mining/pdf_pool.csv: -------------------------------------------------------------------------------- 1 | 1acetylene.pdf 2 | 1acetylene_SI.pdf 3 | 2RodPackings.pdf 4 | 2RodPackings_SI.pdf 5 | 3MIL101.pdf 6 | 3MIL101_SI.pdf 7 | 4PdMOF5.pdf 8 | 4PdMOF5_SI.pdf 9 | 5HighCapGas.pdf 10 | 5HighCapGas_SI.pdf 11 | 6ZIF100.pdf 12 | 6ZIF100_SI.pdf 13 | 7AlPor.pdf 14 | 7AlPor_SI.pdf 15 | 8PCN124.pdf 16 | 8PCN124_SI.pdf 17 | 9Uio66.pdf 18 | 9Uio66_SI.pdf 19 | 10Uio67bpy.pdf 20 | 10Uio67bpy_SI.pdf 21 | 11Zrbtba.pdf 22 | 11Zrbtba_SI.pdf 23 | 12NENU500.pdf 24 | 12NENU500_SI.pdf 25 | 13BUT12.pdf 26 | 13BUT12_SI.pdf 27 | 14CAU23.pdf 28 | 14CAU23_SI.pdf 29 | 15Sequence.pdf 30 | 15Sequence_SI.pdf 31 | 16CTNMOF.pdf 32 | 16CTNMOF_SI.pdf 33 | 17NewRod.pdf 
34 | 17NewRod_SI.pdf 35 | 18ZIF1001.pdf 36 | 18ZIF1001_SI.pdf 37 | 19LAX.pdf 38 | 19LAX_SI.pdf 39 | 20MOFglass.pdf 40 | 20MOFglass_SI.pdf 41 | 21ZIF.pdf 42 | 21ZIF_SI.pdf 43 | 22NU109.pdf 44 | 22NU109_SI.pdf 45 | 23CO2.pdf 46 | 23CO2_SI.pdf 47 | 24CAU1.pdf 48 | 24CAU1_SI.pdf 49 | 25CAU7.pdf 50 | 25CAU7_SI.pdf 51 | 26CAU8.pdf 52 | 26CAU8_SI.pdf 53 | 27CAU10.pdf 54 | 27CAU10_SI.pdf 55 | 28CAU11.pdf 56 | 28CAU11_SI.pdf 57 | 29CAU17.pdf 58 | 29CAU17_SI.pdf 59 | 30CAU18.pdf 60 | 30CAU18_SI.pdf 61 | 31CAU21.pdf 62 | 31CAU21_SI.pdf 63 | 32CAU24.pdf 64 | 32CAU24_SI.pdf 65 | 33CAU27.pdf 66 | 33CAU27_SI.pdf 67 | 34CAU28.pdf 68 | 34CAU28_SI.pdf 69 | 35CAU29.pdf 70 | 35CAU29_SI.pdf 71 | 36CAU30.pdf 72 | 36CAU30_SI.pdf 73 | 37CAU31.pdf 74 | 37CAU31_SI.pdf 75 | 38CAU35.pdf 76 | 38CAU35_SI.pdf 77 | 39CAU36.pdf 78 | 39CAU36_SI.pdf 79 | 40CAU41.pdf 80 | 40CAU41_SI.pdf 81 | 41CAU42.pdf 82 | 41CAU42_SI.pdf 83 | 42CAU43.pdf 84 | 42CAU43_SI.pdf 85 | 43CAU45.pdf 86 | 43CAU45_SI.pdf 87 | 44CAU49.pdf 88 | 44CAU49_SI.pdf 89 | 45CAU53.pdf 90 | 45CAU53_SI.pdf 91 | 46ZIF20.pdf 92 | 46ZIF20_SI.pdf 93 | 47ZIF70.pdf 94 | 47ZIF70_SI.pdf 95 | 48ZIF93.pdf 96 | 48ZIF93_SI.pdf 97 | 49ZIF78.pdf 98 | 49ZIF78_SI.pdf 99 | 50ZIF90.pdf 100 | 50ZIF90_SI.pdf 101 | 51ZIF202.pdf 102 | 51ZIF202_SI.pdf 103 | 52ZIF300.pdf 104 | 52ZIF300_SI.pdf 105 | 53ZIF303.pdf 106 | 53ZIF303_SI.pdf 107 | 54ZIF318.pdf 108 | 54ZIF318_SI.pdf 109 | 55ZIFneb_Main_and_SI.pdf 110 | 56CdZIF.pdf 111 | 56CdZIF_SI.pdf 112 | 57ZIF68.pdf 113 | 57ZIF68_SI.pdf 114 | 58TIF1.pdf 115 | 58TIF1_SI.pdf 116 | 59TIF2.pdf 117 | 59TIF2_SI.pdf 118 | 60ZTIF1.pdf 119 | 60ZTIF1_SI.pdf 120 | 61ZTIF3.pdf 121 | 61ZTIF3_SI.pdf 122 | 62ZTIF6.pdf 123 | 62ZTIF6_SI.pdf 124 | 63ZTIF8.pdf 125 | 63ZTIF8_SI.pdf 126 | 64ZTIF9.pdf 127 | 64ZTIF9_SI.pdf 128 | 65ZTIF11.pdf 129 | 65ZTIF11_SI.pdf 130 | 66ZTIF14.pdf 131 | 66ZTIF14_SI.pdf 132 | 67ZTIF17_Main_and_SI.pdf 133 | 68ZTIF18.pdf 134 | 68ZTIF18_SI.pdf 135 | 69ZTIF50.pdf 136 | 69ZTIF50_SI.pdf 137 | 70IRMOF2.pdf 138 | 70IRMOF2_SI.pdf 139 | 71IRMOF8.pdf 140 | 71IRMOF8_SI.pdf 141 | 72IRMOF18.pdf 142 | 72IRMOF18_SI.pdf 143 | 73NU1100.pdf 144 | 73NU1100_SI.pdf 145 | 74MIL100.pdf 146 | 74MIL100_SI.pdf 147 | 75AlGreen_Main_and_SI.pdf 148 | 76MOF177.pdf 149 | 76MOF177_SI.pdf 150 | 77MOF180.pdf 151 | 77MOF180_SI.pdf 152 | 78MOF143.pdf 153 | 78MOF143_SI.pdf 154 | 79MFM300.pdf 155 | 79MFM300_SI.pdf 156 | 80IRMOF74.pdf 157 | 80IRMOF74_SI.pdf 158 | 81MOF525.pdf 159 | 81MOF525_SI.pdf 160 | 82MOF801.pdf 161 | 82MOF801_SI.pdf 162 | 83NU1000.pdf 163 | 83NU1000_SI.pdf 164 | 84NU1101.pdf 165 | 84NU1101_SI.pdf 166 | 85nMOF.pdf 167 | 85nMOF_SI.pdf 168 | 86UiObpdc.pdf 169 | 86UiObpdc_SI.pdf 170 | 87PCN56.pdf 171 | 87PCN56_SI.pdf 172 | 88MIP200.pdf 173 | 88MIP200_SI.pdf 174 | 89MIP201.pdf 175 | 89MIP201_SI.pdf 176 | 90MIP202.pdf 177 | 90MIP202_SI.pdf 178 | 91MIP203.pdf 179 | 91MIP203_SI.pdf 180 | 92PCN94.pdf 181 | 92PCN94_SI.pdf 182 | 93PCN221.pdf 183 | 93PCN221_SI.pdf 184 | 94PCN222.pdf 185 | 94PCN222_SI.pdf 186 | 95PCN223.pdf 187 | 95PCN223_SI.pdf 188 | 96PCN224.pdf 189 | 96PCN224_SI.pdf 190 | 97PCN225.pdf 191 | 97PCN225_SI.pdf 192 | 98PCN228.pdf 193 | 98PCN228_SI.pdf 194 | 99PCN521.pdf 195 | 99PCN521_SI.pdf 196 | 100PCN700.pdf 197 | 100PCN700_SI.pdf 198 | 101PCN777.pdf 199 | 101PCN777_SI.pdf 200 | 102DUT51.pdf 201 | 102DUT51_SI.pdf 202 | 103DUT52.pdf 203 | 103DUT52_SI.pdf 204 | 104DUT67.pdf 205 | 104DUT67_SI.pdf 206 | 105BUT10.pdf 207 | 105BUT10_SI.pdf 208 | 106BUT30.pdf 209 | 106BUT30_SI.pdf 210 | 107MIL140.pdf 211 | 107MIL140_SI.pdf 212 | 108MIL163.pdf 213 | 
108MIL163_SI.pdf 214 | 109CPM99.pdf 215 | 109CPM99_SI.pdf 216 | 110UMCM309.pdf 217 | 110UMCM309_SI.pdf 218 | 111ZrBTB.pdf 219 | 111ZrBTB_SI.pdf 220 | 112ZrABDC.pdf 221 | 112ZrABDC_SI.pdf 222 | 113PIZOF.pdf 223 | 113PIZOF_SI.pdf 224 | 114BPV.pdf 225 | 114BPV_SI.pdf 226 | 115ZrBTBP.pdf 227 | 115ZrBTBP_SI.pdf 228 | 116UPG1.pdf 229 | 116UPG1_SI.pdf 230 | 117ZrADC.pdf 231 | 117ZrADC_SI.pdf 232 | 118ZrDTDC.pdf 233 | 118ZrDTDC_SI.pdf 234 | 119ZrTCPS.pdf 235 | 119ZrTCPS_SI.pdf 236 | 120ZrBTDC.pdf 237 | 120ZrBTDC_SI.pdf 238 | 121MMPF6.pdf 239 | 121MMPF6_SI.pdf 240 | 122ZrAP1.pdf 241 | 122ZrAP1_SI.pdf 242 | 123PeroxideCarrier.pdf 243 | 123PeroxideCarrier_SI.pdf 244 | 124NU600.pdf 245 | 124NU600_SI.pdf 246 | 125NU1900.pdf 247 | 125NU1900_SI.pdf 248 | 126NU1800.pdf 249 | 126NU1800_SI.pdf 250 | 127MFU4.pdf 251 | 127MFU4_SI.pdf 252 | 128NU500.pdf 253 | 128NU500_SI.pdf 254 | 129UHM.pdf 255 | 129UHM_SI.pdf 256 | 130ZnLi.pdf 257 | 130ZnLi_SI.pdf 258 | 131JUC118.pdf 259 | 131JUC118_SI.pdf 260 | 132MOM11.pdf 261 | 132MOM11_SI.pdf 262 | 133CoINA.pdf 263 | 133CoINA_SI.pdf 264 | 134CuBTC.pdf 265 | 134CuBTC_SI.pdf 266 | 135MIL53.pdf 267 | 135MIL53_SI.pdf 268 | 136ZnBTATDA.pdf 269 | 136ZnBTATDA_SI.pdf 270 | 137CdMOF.pdf 271 | 137CdMOF_SI.pdf 272 | 138NTUZ8.pdf 273 | 138NTUZ8_SI.pdf 274 | 139NTUZ30.pdf 275 | 139NTUZ30_SI.pdf 276 | 140CdMOF.pdf 277 | 140CdMOF_SI.pdf 278 | 141ZnMOF.pdf 279 | 141ZnMOF_SI.pdf 280 | 142CuMOF.pdf 281 | 142CuMOF_SI.pdf 282 | 143Fedobdc.pdf 283 | 143Fedobdc_SI.pdf 284 | 144BioMOF101.pdf 285 | 144BioMOF101_SI.pdf 286 | 145MTTFTB.pdf 287 | 145MTTFTB_SI.pdf 288 | 146YMOF.pdf 289 | 146YMOF_SI.pdf 290 | 147PCN526.pdf 291 | 147PCN526_SI.pdf 292 | 148JLU23.pdf 293 | 148JLU23_SI.pdf 294 | 149UTSA74.pdf 295 | 149UTSA74_SI.pdf 296 | 150NPF200.pdf 297 | 150NPF200_SI.pdf 298 | 151ZrMTBC.pdf 299 | 151ZrMTBC_SI.pdf 300 | 152SBMOF.pdf 301 | 152SBMOF_SI.pdf 302 | 153MAF42.pdf 303 | 153MAF42_SI.pdf 304 | 154TTF4.pdf 305 | 154TTF4_SI.pdf 306 | 155IISERP.pdf 307 | 155IISERP_SI.pdf 308 | 156CuMOF.pdf 309 | 156CuMOF_SI.pdf 310 | 157LaMOF.pdf 311 | 157LaMOF_SI.pdf 312 | 158CuMOF.pdf 313 | 158CuMOF_SI.pdf 314 | 159CdMOF.pdf 315 | 159CdMOF_SI.pdf 316 | 160CPM33.pdf 317 | 160CPM33_SI.pdf 318 | 161Cutba.pdf 319 | 161Cutba_SI.pdf 320 | 162MUF77.pdf 321 | 162MUF77_SI.pdf 322 | 163H6LMOF.pdf 323 | 163H6LMOF_SI.pdf 324 | 164Mabtc.pdf 325 | 164Mabtc_SI.pdf 326 | 165MTCPP.pdf 327 | 165MTCPP_SI.pdf 328 | 166Mbptc.pdf 329 | 166Mbptc_SI.pdf 330 | 167FJU14.pdf 331 | 167FJU14_SI.pdf 332 | 168ZnAzDC.pdf 333 | 168ZnAzDC_SI.pdf 334 | 169CdMOF.pdf 335 | 169CdMOF_SI.pdf 336 | 170SBMOF2.pdf 337 | 170SBMOF2_SI.pdf 338 | 171ZnMOF.pdf 339 | 171ZnMOF_SI.pdf 340 | 172CuCPT.pdf 341 | 172CuCPT_SI.pdf 342 | 173ZJNU43.pdf 343 | 173ZJNU43_SI.pdf 344 | 174ftwMOF.pdf 345 | 174ftwMOF_SI.pdf 346 | 175CdMOF.pdf 347 | 175CdMOF_SI.pdf 348 | 176LaMOF.pdf 349 | 176LaMOF_SI.pdf 350 | 177ZnFDA.pdf 351 | 177ZnFDA_SI.pdf 352 | 178ZnMOF.pdf 353 | 178ZnMOF_SI.pdf 354 | 179ZnMOF.pdf 355 | 179ZnMOF_SI.pdf 356 | 180CdMOF.pdf 357 | 180CdMOF_SI.pdf 358 | 181EMOF.pdf 359 | 181EMOF_SI.pdf 360 | 182UHM31.pdf 361 | 182UHM31_SI.pdf 362 | 183EuMOF.pdf 363 | 183EuMOF_SI.pdf 364 | 184ZnMOF.pdf 365 | 184ZnMOF_SI.pdf 366 | 185PCPBDAMOF.pdf 367 | 185PCPBDAMOF_SI.pdf 368 | 186ZnCPT.pdf 369 | 186ZnCPT_SI.pdf 370 | 187UBMOF.pdf 371 | 187UBMOF_SI.pdf 372 | 188TMU18.pdf 373 | 188TMU18_SI.pdf 374 | 189ZJNU46.pdf 375 | 189ZJNU46_SI.pdf 376 | 190CdMOF.pdf 377 | 190CdMOF_SI.pdf 378 | 191ADCMOF.pdf 379 | 191ADCMOF_SI.pdf 380 | 192PCN133.pdf 381 | 192PCN133_SI.pdf 382 | 
193AZIDO.pdf 383 | 193AZIDO_SI.pdf 384 | 194MOF235.pdf 385 | 194MOF235_SI.pdf 386 | 195JLU3.pdf 387 | 195JLU3_SI.pdf 388 | 196ZnSIP.pdf 389 | 196ZnSIP_SI.pdf 390 | 197NNU28.pdf 391 | 197NNU28_SI.pdf 392 | 198NNU32.pdf 393 | 198NNU32_SI.pdf 394 | 199USTC7.pdf 395 | 199USTC7_SI.pdf 396 | 200UPC5.pdf 397 | 200UPC5_SI.pdf 398 | 201ZnCrown.pdf 399 | 201ZnCrown_SI.pdf 400 | 202UMCM2.pdf 401 | 202UMCM2_SI.pdf 402 | 203MOF808.pdf 403 | 203MOF808_SI.pdf 404 | 204ZnMOF.pdf 405 | 204ZnMOF_SI.pdf 406 | 205DEF1.pdf 407 | 205DEF1_SI.pdf 408 | 206BOB.pdf 409 | 206BOB_SI.pdf 410 | 207ZnTBAPy.pdf 411 | 207ZnTBAPy_SI.pdf 412 | 208ZnMOF.pdf 413 | 208ZnMOF_SI.pdf 414 | 209Znbtcatz.pdf 415 | 209Znbtcatz_SI.pdf 416 | 210InMOF.pdf 417 | 210InMOF_SI.pdf 418 | 211RPM3.pdf 419 | 211RPM3_SI.pdf 420 | 212MOF419.pdf 421 | 212MOF419_SI.pdf 422 | 213MOF808.pdf 423 | 213MOF808_SI.pdf 424 | 214MOF950.pdf 425 | 214MOF950_SI.pdf 426 | 215MOF519.pdf 427 | 215MOF519_SI.pdf 428 | 216MOF645.pdf 429 | 216MOF645_SI.pdf 430 | 217SOCMOF.pdf 431 | 217SOCMOF_SI.pdf 432 | 218FeMOF.pdf 433 | 218FeMOF_SI.pdf 434 | 219MIL53.pdf 435 | 219MIL53_SI.pdf 436 | 220ZnMOF.pdf 437 | 220ZnMOF_SI.pdf 438 | 221VMOF.pdf 439 | 221VMOF_SI.pdf 440 | 222GaMOF.pdf 441 | 222GaMOF_SI.pdf 442 | 223ZnMOF.pdf 443 | 223ZnMOF_SI.pdf 444 | 224Helical.pdf 445 | 224Helical_SI.pdf 446 | 225JUC124.pdf 447 | 225JUC124_SI.pdf 448 | 226PPMOF.pdf 449 | 226PPMOF_SI.pdf 450 | 227CoMOF.pdf 451 | 227CoMOF_SI.pdf 452 | 228MnMOF.pdf 453 | 228MnMOF_SI.pdf 454 | --------------------------------------------------------------------------------