├── .DS_Store ├── .github └── ISSUE_TEMPLATE │ ├── epic-template.md │ └── story-template.md ├── .gitignore ├── README.md ├── UPDATES.md ├── chat-examples ├── alternative-approach.md ├── cognitive-verifier.md ├── context-manager.md ├── fact-check.md ├── flipped-interaction.md ├── output_automater.md ├── persona.md ├── question-refinement.md └── recipe.md ├── code ├── __init__.py ├── agent.py ├── chains.py ├── chat_agent.py ├── cmr_connector.py ├── contexts.py ├── data_downloader.py ├── gcmd_keyword_download.py ├── gcmd_keywords.json ├── gpt_embedder.py ├── parse_docs.py ├── prompting_tools.py ├── prompts.py ├── texts.txt └── utils.py ├── data ├── EJ_datasets.csv ├── EJ_validation_data.csv ├── dgf.json ├── ej_dataset.xlsx ├── human_evaluation_done.csv ├── images │ └── llm-use-in-smd.png ├── keywords.json ├── mdgf_document.txt ├── osdr-eval.zip └── policies.json ├── docs-presentations ├── .keep ├── README.md └── papers │ └── .keep ├── images ├── applications.png ├── astro_langchain.png ├── cmr_langchain.png ├── create-jupyterlab-env.png ├── credential.png ├── credentials-show.png ├── git-clone-1.png ├── jupyterlab-spaces.png ├── llm-use-in-smd.png ├── loggedin.png ├── login-1.png ├── login-2.png ├── sagemaker-studio.png ├── smd-hls-cloned-content.png ├── smd-hls-git-clone.png ├── smd-llm-cloned-content.png ├── smd-llm-git-clone.png ├── update-instance-type.png └── updated-instance-config.png ├── notebooks ├── Chat-with-mDGF.ipynb ├── EJ_classify.ipynb ├── EJ_extractions.ipynb ├── EJ_finetuning.ipynb ├── OSDR-evaluation.ipynb ├── langchain-react-astro.ipynb └── langchain-react-cmr.ipynb └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/.DS_Store -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/epic-template.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/.github/ISSUE_TEMPLATE/epic-template.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/story-template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/.github/ISSUE_TEMPLATE/story-template.md -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv 3 | __pycache__ 4 | cache 5 | venv/ 6 | tmp/ 7 | .python-version 8 | data/osdr-eval/ 9 | 10 | .DS_Store 11 | .devcontainer 12 | 13 | 14 | # dirs 15 | __pycache__/ 16 | UND/__pycache__/config.cpython-39.pyc 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | pip-wheel-metadata/ 41 | share/python-wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .nox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *.cover 67 | *.py,cover 68 | .hypothesis/ 69 | .pytest_cache/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | .python-version 103 | 104 | # pipenv 105 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 106 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 107 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 108 | # install all needed dependencies. 109 | #Pipfile.lock 110 | 111 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | venv-eval/ 127 | venv*/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | .vscode 150 | 151 | # Data Files 152 | scripts/*data 153 | scripts/*results 154 | scripts/*extractions 155 | -------------------------------------------------------------------------------- /UPDATES.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | ## Motivation and Context 7 | 8 | 9 | 10 | ## How has this been tested? 11 | 12 | 13 | 14 | 15 | ## Discussed with team? 16 | 17 | - [ ] Yes 18 | - [ ] No 19 | - [ ] Need further discussion 20 | 21 | ## Checklist: 22 | 23 | ### 24 | ### 25 | 26 | -------------------------------------------------------------------------------- /chat-examples/alternative-approach.md: -------------------------------------------------------------------------------- 1 | ## Appendix F: Alternative Approach Examples 2 | 3 | **Provide different approaches to studying the Earth from space, considering various methods, tools, or perspectives that could be applied. \[March 8, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Studying the Earth from space encompasses a wide range of approaches, each employing various methods, tools, and perspectives. 
Here are several approaches: 8 | 9 | - Remote Sensing: 10 | - Optical Imaging: Utilizing cameras and sensors to capture visible light, providing high-resolution images of Earth's surface features, such as land cover, vegetation, and urban development. 11 | - Infrared Imaging: Detecting heat signatures to analyze temperature variations across the Earth's surface, useful for monitoring wildfires, urban heat islands, and agricultural productivity. 12 | - Microwave Imaging: Penetrating through clouds and vegetation to measure soil moisture content, sea surface temperature, and ice coverage, aiding in climate studies and weather forecasting. 13 | - Satellite Radar: 14 | - Synthetic Aperture Radar (SAR): Sending radar pulses to the Earth's surface and measuring the reflected signals to create detailed images, useful for mapping terrain, monitoring changes in land use, and detecting surface deformation due to earthquakes or volcanic activity. 15 | - Global Positioning System (GPS): 16 | - Satellite-based navigation system providing precise positioning and timing information worldwide, facilitating various Earth observation applications such as monitoring crustal movements, sea level rise, and atmospheric water vapor content. 17 | - Earth-observing Satellites: 18 | - Dedicated satellites equipped with sensors and instruments tailored for monitoring specific Earth system components, including oceans, atmosphere, land, and ice. 19 | - These satellites enable continuous monitoring of environmental parameters such as sea surface temperature, ocean color, atmospheric composition, and land surface changes, contributing to climate research, resource management, and disaster monitoring. 
20 | - Geostationary Satellites: 21 | - Positioned in geostationary orbit above the equator, these satellites provide continuous observations of specific regions, offering real-time monitoring of weather patterns, atmospheric conditions, and environmental phenomena such as hurricanes, cyclones, and thunderstorms. 22 | - Space-based Lidar: 23 | - Lidar (Light Detection and Ranging) systems onboard satellites emit laser pulses and measure the reflected light to create highly detailed 3D maps of Earth's surface, vegetation structure, and atmospheric properties, aiding in biodiversity assessment, urban planning, and carbon cycle studies. 24 | - Hyperspectral Imaging: 25 | - Capturing images across numerous spectral bands, hyperspectral sensors provide detailed information about the chemical composition and physical properties of Earth's surface, facilitating mineral exploration, agriculture management, and environmental monitoring. 26 | - International Collaboration: 27 | - Collaborative efforts among space agencies and organizations worldwide, such as NASA, ESA, JAXA, and NOAA, enable the pooling of resources, expertise, and data sharing to address global challenges related to climate change, natural disasters, and sustainable development. 28 | 29 | By combining these approaches and leveraging advancements in technology, data analytics, and interdisciplinary research, scientists can gain comprehensive insights into Earth's dynamic processes and address critical environmental and societal issues. 30 | 31 | **Provide different approaches to detecting exoplanets, considering various data, methods, tools, or algorithms that could be applied. \[March 8, 2024\]** 32 | 33 | ChatGPT 34 | 35 | Detecting exoplanets involves various methods, each leveraging different data, techniques, and tools. Here are some approaches: 36 | 37 | - Transit Photometry: 38 | - Method: Observing the decrease in brightness of a star when a planet passes in front of it (transit). 
39 | - Data: Light curves obtained from telescopes measuring the brightness of stars over time. 40 | - Tools/Algorithms: Algorithms to analyze periodic dimming in light curves, such as box-fitting least squares, matched filtering, or machine learning classifiers. 41 | - Radial Velocity: 42 | - Method: Detecting the small wobbles of a star caused by the gravitational tug of an orbiting planet. 43 | - Data: Spectroscopic measurements of a star's radial velocity over time. 44 | - Tools/Algorithms: Doppler shift analysis, Fourier analysis, or Bayesian inference methods to extract periodic variations indicative of orbiting planets. 45 | - Direct Imaging: 46 | - Method: Capturing images of exoplanets by blocking out the light from the parent star. 47 | - Data: High-resolution images from telescopes, often using coronagraphs or starshade technology to block out starlight. 48 | - Tools/Algorithms: Image processing techniques to enhance contrast and remove residual starlight, such as angular differential imaging or post-processing algorithms. 49 | - Microlensing: 50 | - Method: Utilizing the gravitational lensing effect caused by a foreground star to magnify the light of a background star, revealing the presence of planets orbiting the foreground star. 51 | - Data: Light curves of microlensing events, which exhibit characteristic magnification patterns over time. 52 | - Tools/Algorithms: Modeling software to fit light curve data and infer the presence of planets, Bayesian analysis, or machine learning methods to classify microlensing events. 53 | - Astrometry: 54 | - Method: Measuring the precise position of a star over time and detecting the small periodic shifts caused by the gravitational pull of an orbiting planet. 55 | - Data: High-precision astrometric measurements from telescopes. 56 | - Tools/Algorithms: Astrometric data analysis software, such as fitting orbits to measured positions, Bayesian inference methods, or machine learning algorithms. 
57 | - Pulsar Timing: 58 | - Method: Monitoring the precise timing of radio pulses from pulsars and detecting deviations caused by the presence of planets. 59 | - Data: Time series data of pulsar pulse arrival times. 60 | - Tools/Algorithms: Timing analysis software to detect subtle deviations from predicted arrival times, Bayesian modeling, or machine learning algorithms for pattern recognition. 61 | - Gravitational Microlensing: 62 | - Method: Observing the magnification of background stars due to the gravitational lensing effect of a foreground star with planets. 63 | - Data: Light curves of gravitational microlensing events. 64 | - Tools/Algorithms: Modeling software to fit light curve data and infer the presence of planets, statistical analysis methods, or machine learning algorithms for event classification. 65 | 66 | Each of these methods has its strengths and limitations, and often a combination of techniques is used for more comprehensive exoplanet detection and characterization. 67 | 68 | **Provide different approaches to determining Earth's surface reflectance, considering various data, methods, tools, or algorithms that could be applied. \[March 8, 2024\]** 69 | 70 | ChatGPT 71 | 72 | Determining Earth's surface reflectance involves various approaches depending on the data availability, spatial resolution, and specific research objectives. Here are several methods and tools commonly used: 73 | 74 | - Remote Sensing Data Analysis: 75 | - Satellite Imagery: Utilize multispectral or hyperspectral satellite imagery, such as Landsat, Sentinel-2, or MODIS, which capture reflectance across different wavelengths. 76 | - Radiometric Correction: Apply atmospheric correction techniques to remove atmospheric effects, such as Rayleigh and Mie scattering, to obtain accurate surface reflectance. 
77 | - Band Ratio Techniques: Calculate spectral indices or band ratios, like Normalized Difference Vegetation Index (NDVI), to quantify surface reflectance related to specific features like vegetation. 78 | - Field Measurements and Calibration: 79 | - Ground-based Spectroradiometers: Use field spectroradiometers to directly measure surface reflectance at specific locations across various wavelengths. 80 | - Calibration Targets: Deploy calibrated reflectance panels or targets in the study area to calibrate remote sensing data and validate surface reflectance estimates. 81 | - Modeling and Simulation: 82 | - Radiative Transfer Models: Employ radiative transfer models, like MODTRAN or PROSAIL, to simulate the interaction between electromagnetic radiation and Earth's surface to predict surface reflectance. 83 | - Machine Learning Algorithms: Train machine learning models, such as random forests or neural networks, to estimate surface reflectance based on input features from satellite imagery and ancillary data. 84 | - Image Processing and Analysis: 85 | - Topographic Correction: Apply topographic correction algorithms to account for terrain-induced variations in reflectance, especially in rugged terrains. 86 | - Temporal Analysis: Analyze time-series satellite imagery to capture seasonal variations in surface reflectance, especially for dynamic landscapes like forests or agricultural areas. 87 | - Data Fusion and Integration: 88 | - Fusing Multiple Sensors: Integrate data from multiple sensors with complementary capabilities, such as optical and SAR (Synthetic Aperture Radar), to enhance surface reflectance estimation, especially in areas with frequent cloud cover. 89 | - Incorporating Ancillary Data: Include ancillary data like digital elevation models (DEMs), land cover maps, or meteorological data to improve the accuracy of surface reflectance estimation models. 
90 | - Validation and Quality Assessment: 91 | - Field Campaigns: Conduct field campaigns to collect in situ measurements of surface reflectance for validation and calibration purposes. 92 | - Cross-Validation: Employ statistical techniques like cross-validation to assess the accuracy and reliability of surface reflectance estimates derived from remote sensing data. 93 | 94 | Each approach has its strengths and limitations, and the choice depends on factors such as the study area characteristics, available resources, and the specific research objectives. Combining multiple approaches often yields more robust and accurate surface reflectance estimations. 95 | -------------------------------------------------------------------------------- /chat-examples/cognitive-verifier.md: -------------------------------------------------------------------------------- 1 | ## Appendix G: Cognitive Verifier Examples 2 | 3 | **For the question 'what happens during a total solar eclipse?', suggest a series of subquestions to help explore and understand the various aspects of this topic, leading to a more informed and comprehensive answer. \[March 8, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Sure, breaking down the question "what happens during a total solar eclipse?" into subquestions can help delve into the various aspects of this phenomenon: 8 | 9 | - What is a solar eclipse? 10 | - Definition and explanation of a solar eclipse. 11 | - Different types of solar eclipses (total, partial, annular). 12 | - How does a total solar eclipse occur? 13 | - Explanation of the alignment of the Sun, Moon, and Earth. 14 | - The concept of umbra and penumbra. 15 | - What are the stages of a total solar eclipse? 16 | - Initial contact (partial eclipse begins). 17 | - Totality (complete coverage of the Sun). 18 | - Final contact (partial eclipse ends). 19 | - What are the observable phenomena during a total solar eclipse? 
20 | - Totality phenomena like the solar corona, Baily's beads, and the diamond ring effect. 21 | - Changes in the environment such as temperature drop and altered animal behavior. 22 | - How often do total solar eclipses occur? 23 | - Frequency and occurrence of total solar eclipses around the world. 24 | - Factors influencing the frequency of total solar eclipses. 25 | - Where are the best places to observe a total solar eclipse? 26 | - Geographical locations favorable for witnessing a total solar eclipse. 27 | - Consideration of factors like weather conditions and accessibility. 28 | - What safety precautions should be taken during a total solar eclipse? 29 | - Risks associated with looking directly at the Sun during partial phases. 30 | - Safe viewing methods such as solar filters or pinhole projectors. 31 | - What scientific discoveries have been made during total solar eclipses? 32 | - Historical significance of eclipses in understanding solar physics. 33 | - Modern scientific experiments conducted during eclipses. 34 | - What cultural and historical significance do total solar eclipses hold? 35 | - Mythologies, folklore, and cultural interpretations of solar eclipses. 36 | - Historical events and their relation to solar eclipses. 37 | - How do total solar eclipses inspire and impact society today? 38 | - Public interest and excitement surrounding total solar eclipses. 39 | - Educational outreach and scientific engagement during eclipse events. 40 | 41 | You 42 | 43 | what are the different types of eclipses? 44 | 45 | ChatGPT 46 | 47 | There are three main types of eclipses: solar eclipses, lunar eclipses, and transits. 48 | 49 | - Solar Eclipses: 50 | - Solar eclipses occur when the Moon passes between the Earth and the Sun, blocking all or part of the Sun's light. 51 | - There are three types of solar eclipses: 52 | - Total Solar Eclipse: The Moon completely covers the Sun, creating a brief period of darkness (totality) in the areas where it is visible.
53 | - Partial Solar Eclipse: The Moon covers only part of the Sun's disk, resulting in a partial blocking of sunlight. 54 | - Annular Solar Eclipse: When the Moon is farther from Earth in its orbit and appears smaller in the sky, it doesn't fully cover the Sun, leaving a ring of sunlight visible around the Moon. 55 | - Lunar Eclipses: 56 | - Lunar eclipses occur when the Earth passes between the Sun and the Moon, causing the Earth's shadow to fall on the Moon. 57 | - There are three types of lunar eclipses: 58 | - Total Lunar Eclipse: The entire Moon passes through the Earth's umbral shadow, causing it to appear red or coppery during totality. 59 | - Partial Lunar Eclipse: Only a portion of the Moon passes through the Earth's umbral shadow, resulting in a partial darkening of the lunar surface. 60 | - Penumbral Lunar Eclipse: The Moon passes through the Earth's penumbral shadow, causing a subtle darkening of the lunar surface that is often difficult to observe. 61 | - Transits: 62 | - Transits occur when a celestial body passes directly between an observer and another celestial object. The two most common transits visible from Earth are Mercury and Venus transits. 63 | - Mercury Transit: Mercury passes between the Earth and the Sun, appearing as a small black dot moving across the Sun's disk. 64 | - Venus Transit: Venus passes between the Earth and the Sun, also appearing as a small black dot crossing the Sun's surface. 65 | 66 | Each type of eclipse or transit offers unique astronomical phenomena and opportunities for observation and scientific study. 67 | 68 | **For the question 'what are emerging research areas in astrophysics?', suggest a series of subquestions to help explore and understand the various aspects of this topic, leading to a more informed and comprehensive answer. \[March 8, 2024\]** 69 | 70 | ChatGPT 71 | 72 | Certainly! 
Here are some subquestions that can help explore and understand emerging research areas in astrophysics: 73 | 74 | - Cosmology and Dark Matter/Energy: 75 | - What recent discoveries or theories are challenging our understanding of the universe's composition and evolution? 76 | - How are scientists studying dark matter and dark energy, and what are the latest findings in these areas? 77 | - Are there any new observational techniques or instruments being developed to explore cosmological mysteries? 78 | - Exoplanetary Systems: 79 | - What recent discoveries have been made in the search for exoplanets? 80 | - How are scientists characterizing exoplanetary atmospheres and surfaces? 81 | - What are the implications of finding potentially habitable exoplanets? 82 | - Gravitational Waves: 83 | - What groundbreaking discoveries have been made in gravitational wave astronomy since their detection? 84 | - How are scientists using gravitational wave observations to study phenomena like black hole mergers and neutron star collisions? 85 | - What are the prospects for future detections and improvements in detection sensitivity? 86 | - Astrobiology: 87 | - What recent advancements have been made in the search for extraterrestrial life? 88 | - How are scientists exploring the potential habitability of other planets and moons in our solar system and beyond? 89 | - What are the emerging interdisciplinary approaches in astrobiology? 90 | - Multi-Messenger Astronomy: 91 | - How are scientists integrating data from different astronomical messengers such as electromagnetic radiation, neutrinos, and gravitational waves? 92 | - What recent discoveries have been made through multi-messenger astronomy? 93 | - What are the challenges and opportunities for further advancements in this field? 94 | - High-Energy Astrophysics: 95 | - What are the latest findings in the study of high-energy phenomena such as gamma-ray bursts, supernovae, and active galactic nuclei? 
96 | - How are researchers using observations from satellites and ground-based telescopes to explore these phenomena? 97 | - What new insights are emerging from theoretical models and simulations? 98 | - Astrophysical Big Data and Machine Learning: 99 | - How are advancements in big data analysis and machine learning techniques revolutionizing astrophysics research? 100 | - What are some recent applications of machine learning in data analysis, pattern recognition, and simulations? 101 | - What are the challenges and ethical considerations associated with the use of AI in astrophysics? 102 | 103 | These subquestions can provide a comprehensive framework for exploring and understanding the diverse and rapidly evolving landscape of astrophysics research. -------------------------------------------------------------------------------- /chat-examples/context-manager.md: -------------------------------------------------------------------------------- 1 | ## Appendix I: Context Manager Examples 2 | 3 | **When explaining how exoplanets are detected, only include information on the gravitational lensing method. Ignore the transit method. \[March 15, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Gravitational lensing is a method used to detect exoplanets indirectly by observing the distortion of light from a distant star caused by the gravitational field of a foreground exoplanet. Here's how it works: 8 | 9 | - Gravitational Lensing Principle: According to Einstein's general theory of relativity, mass bends spacetime, causing light rays to follow curved paths near massive objects. When a massive object, like an exoplanet, passes between Earth and a distant star, its gravitational field acts as a lens, bending and magnifying the light from the background star. 10 | - Microlensing Events: As the exoplanet and its parent star move relative to Earth, they create a transient gravitational lensing event. This event causes a brief increase in the brightness of the background star.
This increase typically lasts for a few days to weeks, depending on the alignment and the relative velocities of the objects involved. 11 | - Observation and Data Analysis: Astronomers monitor large numbers of stars in the night sky, looking for these temporary increases in brightness that signify a microlensing event. Specialized telescopes equipped with sensitive detectors are used for this purpose. The light curves generated during these events provide information about the mass, distance, and orbital characteristics of the exoplanet and its host star. 12 | - Characterization of Exoplanets: While gravitational lensing can detect the presence of exoplanets, it's challenging to directly determine properties such as the size, composition, and atmosphere of the detected planets using this method alone. Additional observations and analyses, often involving complementary techniques, are typically required to further characterize the detected exoplanets. 13 | - Advantages and Limitations: Gravitational lensing is particularly useful for detecting exoplanets that are relatively distant from Earth, as it does not rely on the alignment of the planet's orbit with our line of sight. However, it is less sensitive to small planets and requires precise observations and careful analysis to distinguish planetary signals from other sources of variability in stellar brightness. Additionally, gravitational lensing events are relatively rare and unpredictable, making it challenging to target specific systems for study. 14 | 15 | **When explaining how lightning works, only include information on lightning that results from volcanic eruptions. Ignore lightning that occurs during atmospheric weather events. \[March 15, 2024\]** 16 | 17 | ChatGPT 18 | 19 | Lightning resulting from volcanic eruptions, known as volcanic lightning, occurs due to the intense electrical charges generated during the eruption process. 
Here's a simplified explanation of how it works: 20 | 21 | - Ash and Particle Collision: During a volcanic eruption, ash, dust, and other particles are forcefully expelled into the atmosphere. As these particles collide with each other and with the surrounding air molecules, they create friction. This friction causes the particles to become electrically charged. 22 | - Charge Separation: The collision of particles in the volcanic plume causes a separation of electric charges. This process is similar to the way static electricity builds up when you rub certain materials together. Positive charges accumulate in some areas while negative charges gather in others. 23 | - Formation of Lightning: As the charge separation becomes significant, it creates an electric field within the volcanic plume. When the electric field becomes strong enough, it can overcome the insulating properties of the surrounding air, leading to the rapid discharge of electricity in the form of lightning. 24 | - Path of Least Resistance: Lightning typically follows the path of least resistance through the atmosphere. In the case of volcanic eruptions, the lightning may travel within the volcanic plume itself or extend beyond it, depending on the specific conditions and the presence of conducting materials such as ash, dust, and gases. 25 | - Variability and Characteristics: Volcanic lightning can vary in intensity and appearance. It may manifest as flickering flashes within the volcanic plume or as more traditional lightning bolts extending from the eruption cloud to the ground or between different parts of the plume. 26 | - Study and Monitoring: Scientists study volcanic lightning to better understand volcanic processes and to monitor eruptions. Monitoring volcanic lightning can provide insights into the intensity and behavior of volcanic eruptions, aiding in hazard assessment and mitigation efforts. 
27 | 28 | Overall, volcanic lightning results from the complex interplay of charged particles, electric fields, and atmospheric conditions during volcanic eruptions. It represents a fascinating yet hazardous aspect of volcanic activity. 29 | 30 | **When explaining the causes of climate change, only consider anthropogenic sources. Please ignore Milankovitch cycles. \[March 15, 2024\]** 31 | 32 | ChatGPT 33 | 34 | Certainly. When discussing the causes of climate change, anthropogenic (human-caused) factors play a significant role. Here are some key anthropogenic sources contributing to climate change: 35 | 36 | - Greenhouse Gas Emissions: The burning of fossil fuels for energy, transportation, and industrial processes releases greenhouse gases such as carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O) into the atmosphere. These gases trap heat, leading to the warming of the Earth's surface and contributing to climate change. 37 | - Deforestation: Forests act as carbon sinks, absorbing CO2 from the atmosphere. Deforestation, primarily for agriculture, urbanization, and logging, reduces the number of trees available to absorb CO2, leading to increased concentrations of greenhouse gases in the atmosphere. 38 | - Industrial Processes: Various industrial activities release greenhouse gases and other pollutants into the atmosphere. For example, cement production releases large amounts of CO2, while certain manufacturing processes release fluorinated gases, which have high global warming potentials. 39 | - Agriculture: Agricultural practices such as livestock farming and rice cultivation produce methane emissions. Additionally, the use of synthetic fertilizers releases nitrous oxide, another potent greenhouse gas. 40 | - Land Use Changes: Besides deforestation, changes in land use, such as urbanization and conversion of natural habitats into agricultural land, can alter regional climate patterns and contribute to climate change. 
41 | - Waste Management: Improper disposal of waste, particularly organic waste in landfills, generates methane as it decomposes. Methane emissions from landfills contribute to climate change. 42 | - Transportation: The combustion of fossil fuels in vehicles, airplanes, ships, and other transportation modes releases CO2 and other pollutants into the atmosphere, contributing to climate change. 43 | 44 | These anthropogenic sources of greenhouse gas emissions have substantially altered the Earth's climate system, leading to global warming, changes in precipitation patterns, rising sea levels, and other impacts associated with climate change. Addressing these sources through mitigation efforts is crucial in combating climate change and its adverse effects. 45 | 46 | **When studying the greenhouse gas effect, consider the greenhouse gas effect on Venus. Please ignore the greenhouse effect on Earth. \[March 15, 2024\]** 47 | 48 | ChatGPT 49 | 50 | Certainly, when studying the greenhouse effect on Venus, it's a fascinating case study because Venus is often referred to as Earth's "sister planet" due to similarities in size and composition. However, the greenhouse effect on Venus is extremely pronounced and serves as an extreme example of what can happen when this phenomenon runs rampant. 51 | 52 | Venus's atmosphere is predominantly composed of carbon dioxide (CO2) with trace amounts of other gases like nitrogen and sulfur dioxide. The dense atmosphere traps heat from the Sun through the greenhouse effect, resulting in surface temperatures hot enough to melt lead. 53 | 54 | Here are some key points regarding the greenhouse effect on Venus: 55 | 56 | - Atmospheric Composition: Venus's atmosphere is about 96.5% carbon dioxide, which is a potent greenhouse gas. This concentration is about 90 times greater than that of Earth's atmosphere. 
57 | - Runaway Greenhouse Effect: Due to its proximity to the Sun and the high concentration of greenhouse gases, Venus has experienced a runaway greenhouse effect. This is a feedback loop where increasing temperatures lead to the release of more greenhouse gases (such as water vapor), further trapping heat and causing even higher temperatures. 58 | - Extreme Surface Temperatures: The surface temperature on Venus averages around 462 °C (864 °F), which is hotter than the surface of Mercury despite Venus being farther from the Sun. This extreme heat is a direct result of the intense greenhouse effect. 59 | - No Liquid Water: The high temperatures and pressure on Venus prevent liquid water from existing on its surface. Instead, any water that may have once been present has long since evaporated into the atmosphere and likely dissociated into hydrogen and oxygen due to intense solar radiation. 60 | - Thick Cloud Cover: Venus's atmosphere is shrouded in thick clouds composed primarily of sulfuric acid droplets. These clouds further contribute to the greenhouse effect by reflecting sunlight back towards the surface while also trapping heat. 61 | 62 | Studying Venus provides valuable insights into the potential consequences of unchecked greenhouse gas emissions and serves as a cautionary tale for understanding the delicate balance of planetary atmospheres. -------------------------------------------------------------------------------- /chat-examples/fact-check.md: -------------------------------------------------------------------------------- 1 | ## Appendix H: Fact Check Examples 2 | 3 | **From now on, when you answer a question, create a set of facts that the answer depends on that should be fact checked and list this set of facts at the end of your output. How does space weather affect Earth? 
\[March 7, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Space weather, driven primarily by solar activity, can affect Earth in several ways: 8 | 9 | - Geomagnetic Storms: Solar storms can release bursts of charged particles, such as protons and electrons, into space. When these particles interact with Earth's magnetic field, they can cause geomagnetic storms. These storms can disrupt power grids, satellite operations, and communication systems. 10 | - Radiation Hazards: Solar flares and coronal mass ejections (CMEs) can emit high-energy radiation, including X-rays and ultraviolet radiation. This radiation can pose a risk to astronauts in space and can also interfere with satellite electronics and communication systems. 11 | - Ionospheric Disturbances: Solar activity can cause fluctuations in the density of charged particles in the ionosphere, the upper layer of Earth's atmosphere. These disturbances can affect the propagation of radio waves, leading to disruptions in radio communication and navigation systems. 12 | - Auroras: Increased solar activity can lead to more frequent and intense auroras, also known as the Northern and Southern Lights. While these phenomena are visually stunning, they can also disrupt radio communication and navigation systems at high latitudes. 13 | - Atmospheric Heating: Solar radiation can heat the Earth's upper atmosphere, leading to expansion and increased drag on satellites in low Earth orbit. This can shorten the lifespan of satellites and affect their orbits. 14 | 15 | Fact Check: 16 | 17 | - Definition and characteristics of geomagnetic storms. 18 | - Effects of high-energy radiation on space technology and human health. 19 | - Impact of ionospheric disturbances on radio communication and navigation systems. 20 | - Relationship between solar activity and auroras. 21 | - Influence of solar radiation on atmospheric heating and satellite orbits. 22 | 23 | You 24 | 25 | What is the definition and characteristics of geomagnetic storms? 
26 | 27 | ChatGPT 28 | 29 | Geomagnetic storms are disturbances in Earth's magnetosphere, caused by changes in the solar wind's behavior. They typically occur following solar eruptions such as coronal mass ejections (CMEs) or solar flares. These storms are characterized by several key features: 30 | 31 | - Enhanced Auroral Activity: One of the most visible effects of geomagnetic storms is the intensification of auroras, commonly known as the Northern and Southern Lights. These colorful displays occur near the polar regions and result from charged particles from the solar wind interacting with Earth's magnetic field and atmosphere. 32 | - Disturbed Magnetic Field: During a geomagnetic storm, the Earth's magnetic field can become highly disturbed. Rapid fluctuations in the magnetic field strength and direction are observed, which can affect compass readings and induce electric currents in the ground. 33 | - Ionospheric Disturbances: Geomagnetic storms can cause disruptions in the ionosphere, the upper layer of Earth's atmosphere. Changes in ionospheric density and composition can impact the propagation of radio waves, leading to communication disruptions and inaccuracies in navigation systems. 34 | - Geomagnetically Induced Currents (GICs): Strong geomagnetic storms can induce electric currents in the Earth's crust and in man-made structures such as power lines and pipelines. These geomagnetically induced currents can damage transformers and other electrical infrastructure, leading to power outages and equipment failures. 35 | - Satellite Anomalies: Geomagnetic storms can also affect satellites and spacecraft orbiting Earth. Increased drag caused by changes in the upper atmosphere during geomagnetic storms can alter satellite trajectories and disrupt satellite operations. 
36 | 37 | In summary, geomagnetic storms are complex phenomena characterized by disturbances in Earth's magnetosphere, which can lead to a range of effects including enhanced auroral activity, disruptions to the magnetic field, ionospheric disturbances, geomagnetically induced currents, and satellite anomalies. 38 | 39 | Fact Check: 40 | 41 | - Mechanisms behind geomagnetic storms, such as solar wind interactions with Earth's magnetosphere. 42 | - Effects of geomagnetic storms on auroras, magnetic field disturbances, ionospheric disturbances, geomagnetically induced currents, and satellite anomalies. 43 | 44 | **From now on, when you answer a question, create a set of facts that the answer depends on that should be fact checked and list this set of facts at the end of your output. What causes the greenhouse gas effect on Venus? \[March 8, 2024\]** 45 | 46 | ChatGPT 47 | 48 | The greenhouse gas effect on Venus is primarily caused by the thick atmosphere composed mostly of carbon dioxide (CO2). Here's how it works: 49 | 50 | - Composition of Venus's Atmosphere: Venus's atmosphere is about 96.5% carbon dioxide, with traces of nitrogen, sulfur dioxide, and other gases. The high concentration of CO2 plays a crucial role in trapping heat. 51 | - Runaway Greenhouse Effect: Venus experiences a runaway greenhouse effect, where incoming sunlight passes through the atmosphere, heats the surface, and the surface then emits infrared radiation. However, the dense atmosphere traps most of this outgoing radiation, preventing it from escaping back into space. This results in a significant increase in temperature over time. 52 | - Absorption and Reradiation of Heat: CO2 and other greenhouse gases in Venus's atmosphere absorb infrared radiation emitted by the planet's surface. Instead of allowing this heat to escape into space, these gases reradiate it in all directions, including back towards the surface, effectively trapping heat and causing a dramatic rise in temperature. 
53 | - Surface Temperature: As a result of the greenhouse effect, Venus has an average surface temperature of about 462°C (864°F), which is hot enough to melt lead. This extreme heat is largely due to the thick blanket of greenhouse gases in the atmosphere. 54 | 55 | Fact Check: 56 | 57 | - Composition of Venus's Atmosphere 58 | - Runaway Greenhouse Effect 59 | - Absorption and Reradiation of Heat 60 | - Surface Temperature 61 | 62 | **From now on, when you answer a question, create a set of facts that the answer depends on that should be fact checked and list this set of facts at the end of your output. What causes the greenhouse gas effect on Venus? Only include the facts related to the atmosphere. \[March 8, 2024\]** 63 | 64 | ChatGPT 65 | 66 | The greenhouse gas effect on Venus is primarily caused by its thick atmosphere, which is composed mostly of carbon dioxide (CO2) with traces of other gases like nitrogen and sulfur dioxide. Here's how it works: 67 | 68 | Venus's atmosphere is about 96.5% carbon dioxide (CO2). This high concentration of CO2 acts as a potent greenhouse gas, trapping heat from the Sun and preventing it from escaping back into space. 69 | 70 | The atmosphere of Venus is incredibly dense, with a surface pressure about 92 times that of Earth's atmosphere. This high pressure helps enhance the greenhouse effect by compressing the gases closer to the surface, further trapping heat. 71 | 72 | Venus lacks a significant magnetic field, unlike Earth. Without a strong magnetic field to deflect solar wind and radiation, the upper atmosphere of Venus is more susceptible to erosion by the solar wind, leading to the loss of lighter gases like hydrogen and helium over geological time scales. 73 | 74 | The thick atmosphere of Venus also leads to a strong greenhouse effect, resulting in surface temperatures reaching up to 462°C (864°F), which is hotter than the surface of Mercury, despite Venus being farther from the Sun. 
75 | 76 | Fact Check Set: 77 | 78 | Venus's atmosphere composition 79 | 80 | Venus's atmospheric pressure 81 | 82 | Venus's lack of magnetic field 83 | 84 | Venus's surface temperature -------------------------------------------------------------------------------- /chat-examples/flipped-interaction.md: -------------------------------------------------------------------------------- 1 | ## Appendix D: Flipped Interaction Examples 2 | 3 | **Instead of explaining directly, interact with me by asking me a series of questions one by one about star formation to help me understand the concept better.** \[March 8, 2024\] 4 | 5 | ChatGPT 6 | 7 | Sure, let's delve into the fascinating process of star formation! 8 | 9 | First off, have you ever looked up at the night sky and wondered how stars are born? 10 | 11 | You 12 | 13 | yes 14 | 15 | ChatGPT 16 | 17 | Great! So, when you think about stars, what comes to mind? What do you know about their composition and what they're made of? 18 | 19 | You 20 | 21 | I think they are made of dust and gas 22 | 23 | ChatGPT 24 | 25 | Absolutely! Dust and gas play crucial roles in the formation of stars. Now, let's focus on the process itself. Can you imagine what might cause these clouds of dust and gas to start forming a star? 26 | 27 | You 28 | 29 | gravity? 30 | 31 | ChatGPT 32 | 33 | Exactly! Gravity is a fundamental force in the universe, and it plays a key role in star formation. Now, let's explore further: What happens when a cloud of gas and dust starts to collapse under the influence of gravity? 34 | 35 | You 36 | 37 | it becomes hotter 38 | 39 | ChatGPT 40 | 41 | Absolutely! As the cloud collapses, the particles within it become more densely packed, leading to an increase in temperature. Now, what do you think happens as the temperature continues to rise within this collapsing cloud? 42 | 43 | You 44 | 45 | the pressure rises 46 | 47 | ChatGPT 48 | 49 | Exactly! 
As the temperature and pressure rise within the collapsing cloud, a critical point is reached where nuclear fusion reactions can begin. Can you guess what element is primarily involved in these fusion reactions? 50 | 51 | You 52 | 53 | I'm not sure. can you give me a hint? 54 | 55 | ChatGPT 56 | 57 | Of course! This element is the most abundant one in the universe and is crucial for powering stars through nuclear fusion. It's also the lightest element on the periodic table. Any guesses? 58 | 59 | You 60 | 61 | hydrogen 62 | 63 | ChatGPT 64 | 65 | Spot on! Hydrogen is indeed the primary element involved in the fusion reactions that power stars. Now, can you describe what happens during nuclear fusion reactions involving hydrogen atoms? 66 | 67 | You 68 | 69 | energy and heat are released 70 | 71 | ChatGPT 72 | 73 | Absolutely right! During nuclear fusion, hydrogen atoms combine to form helium atoms, releasing a tremendous amount of energy and heat in the process. This energy is what makes stars shine brightly. Now, what do you think happens to the outward pressure generated by these fusion reactions? 74 | 75 | You 76 | 77 | I'm not sure 78 | 79 | ChatGPT 80 | 81 | No problem! The outward pressure generated by these fusion reactions counteracts the inward pull of gravity, achieving a delicate balance. This balance is what allows stars to maintain their stable structure for millions or even billions of years. Now, can you imagine what happens to a star once it exhausts its hydrogen fuel? 82 | 83 | You 84 | 85 | it collapses 86 | 87 | ChatGPT 88 | 89 | Exactly! When a star exhausts its hydrogen fuel, it can no longer maintain that balance between gravity pulling inward and the outward pressure from fusion. This can lead to a series of events depending on the size of the star. For now, let's focus on the initial collapse. What do you think happens next after the collapse? 
90 | 91 | You 92 | 93 | it begins to cool 94 | 95 | ChatGPT 96 | 97 | That's an interesting thought! After the collapse, the star might undergo various processes depending on its mass and other factors. One possibility is that it could enter a phase where it cools down, but typically, after the collapse, something dramatic occurs that reignites or alters the star in some way. Can you think of what that might be? 98 | 99 | You 100 | 101 | I'm not sure 102 | 103 | ChatGPT 104 | 105 | No worries! After the collapse, if the star has enough mass, it may undergo a supernova explosion, which is an incredibly energetic event that can disperse its outer layers into space. This explosion can also leave behind a dense core, which could become a neutron star or even collapse further to form a black hole. 106 | 107 | Now, let's say the star doesn't have enough mass to undergo a supernova explosion. What do you think might happen to it after the collapse? 108 | 109 | You 110 | 111 | it becomes a dense, small star 112 | 113 | ChatGPT 114 | 115 | Exactly! If the star doesn't have enough mass to undergo a supernova explosion, it can collapse into a much smaller and denser object known as a white dwarf. White dwarfs are incredibly dense and typically have the mass of a star like the Sun but are roughly the size of Earth. They can remain stable for billions of years as they gradually cool down. 116 | 117 | Understanding the life cycle of stars, from their formation to their final stages, gives us insight into the evolution of the universe itself. Is there anything else you're curious about regarding star formation or any related topic? 118 | 119 | You 120 | 121 | No, I'm good, thank you 122 | 123 | ChatGPT 124 | 125 | You're welcome! If you ever have more questions or want to explore another topic, feel free to reach out. Happy exploring! 
126 | 127 | **Instead of explaining directly, interact with me by asking me a series of questions one by one about space weather to help me understand the concept better. \[March 8, 2024\]** 128 | 129 | ChatGPT 130 | 131 | Sure, let's start with a basic question: Have you ever heard of space weather before? 132 | 133 | You 134 | 135 | yes but I don't know much about it 136 | 137 | ChatGPT 138 | 139 | That's okay! Space weather refers to the environmental conditions in space as influenced by the Sun and the solar wind. Do you know what the solar wind is? 140 | 141 | You 142 | 143 | it is charged particles emitted from the sun that move through space 144 | 145 | ChatGPT 146 | 147 | Exactly! Those charged particles, mainly electrons and protons, can interact with Earth's magnetic field and atmosphere, creating various effects. Do you know what the Earth's magnetic field does? 148 | 149 | You 150 | 151 | it deflects the charged particles? 152 | 153 | ChatGPT 154 | 155 | Exactly! Earth's magnetic field acts like a shield, deflecting many of the charged particles coming from the Sun away from our planet. However, sometimes these particles can penetrate the magnetic field, leading to space weather effects. Can you think of any examples of space weather events that might affect us here on Earth? 156 | 157 | You 158 | 159 | I'm not sure 160 | 161 | ChatGPT 162 | 163 | That's okay! Some examples of space weather events include solar flares, coronal mass ejections (CMEs), and geomagnetic storms. Have you ever heard of any of these terms before? 164 | 165 | You 166 | 167 | no, I haven't 168 | 169 | ChatGPT 170 | 171 | No problem! Let's break them down: 172 | 173 | - Solar flares: These are intense bursts of radiation that originate from the Sun's surface. They can release energy across the electromagnetic spectrum, from radio waves to X-rays. 174 | - Coronal mass ejections (CMEs): These are massive expulsions of plasma and magnetic field from the Sun's corona. 
When they collide with Earth's magnetic field, they can cause geomagnetic storms. 175 | - Geomagnetic storms: These are disturbances in Earth's magnetic field caused by solar wind and CMEs interacting with it. They can lead to auroras, disruptions in satellite communications, and even power grid failures. 176 | 177 | Understanding these events helps us prepare and mitigate their impacts here on Earth. Does that clarify things a bit? 178 | 179 | You 180 | 181 | yes 182 | 183 | ChatGPT 184 | 185 | Great! Now, do you have any specific questions about space weather or any aspect you'd like to delve into further? 186 | 187 | **Instead of explaining directly, interact with me by asking me a series of questions one by one about environmental justice to help me understand the concept better. \[March 8, 2024\]** 188 | 189 | ChatGPT 190 | 191 | Sure, let's start with the basics. What comes to your mind when you hear the term "environmental justice"? 192 | 193 | You 194 | 195 | I'm not sure. I just heard the term for the first time. 196 | 197 | ChatGPT 198 | 199 | Got it! How about we break it down a bit? What do you think might be some issues or concerns related to the environment that people might face in different communities? 200 | 201 | You 202 | 203 | Different environments pose different risks. 204 | 205 | ChatGPT 206 | 207 | That's a great observation! So, considering different environments, what kinds of risks do you think people might face? And how might those risks vary based on factors like where they live or their socioeconomic status? 208 | 209 | You 210 | 211 | Risks could include poor air quality, flooding, extreme heat. Depending on your socioeconomic status, the risks could be higher. 212 | 213 | ChatGPT 214 | 215 | Absolutely, you're spot on! Environmental risks like poor air quality, flooding, and extreme heat can indeed vary based on where you live and your socioeconomic status. 
Now, considering these differences, why do you think some communities might be more affected by environmental risks than others? 216 | 217 | You 218 | 219 | Some communities might not have access to fundamental services needed to help be resilient against these risks 220 | 221 | ChatGPT 222 | 223 | Exactly! Access to resources like healthcare, proper infrastructure, and emergency services can significantly impact a community's ability to cope with environmental risks. Now, considering these disparities in access, what do you think might be some ways to address these inequalities and ensure that all communities have a fair chance to protect themselves from environmental hazards? 224 | 225 | You 226 | 227 | Talk with the communities to understand their needs 228 | 229 | ChatGPT 230 | 231 | Absolutely! Engaging with communities to understand their specific needs and concerns is crucial for effective action. Now, when we talk about environmental justice, what do you think it means in the context of addressing these disparities and meeting the needs of different communities? 232 | 233 | You 234 | 235 | I think it means ensuring that these different communities have equity in adapting to environmental risks 236 | 237 | ChatGPT 238 | 239 | Exactly! Environmental justice is about ensuring that all communities, regardless of their socioeconomic status or background, have equitable access to resources and opportunities to protect themselves from environmental risks and to participate in decision-making processes that affect their environment. Now, considering this definition, why do you think environmental justice is important for building sustainable and resilient communities? 240 | 241 | You 242 | 243 | It ensures that sustainable communities are available for all, not just some 244 | 245 | ChatGPT 246 | 247 | Absolutely! Environmental justice is crucial for creating sustainable communities that benefit everyone, not just a select few. 
By addressing inequalities and ensuring that all communities have access to resources and opportunities, we can build resilience and create a more equitable and environmentally sustainable future. Now, thinking about your own community, what are some steps you think could be taken to promote environmental justice and create a more sustainable environment for everyone? -------------------------------------------------------------------------------- /chat-examples/output_automater.md: -------------------------------------------------------------------------------- 1 | ## Appendix B: Output Automater Examples 2 | 3 | **Create a script that automatically compiles and summarizes weekly seismic activity reports from the USGS database, focusing on earthquakes above magnitude 4.0. Include data on location, magnitude, depth, and potential affected areas. Output the summary in a CSV format. \[March 15, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Below is a Python script that automates the process of fetching weekly seismic activity reports from the USGS database, focusing on earthquakes above magnitude 4.0, and then summarizes the data into a CSV format: 8 | 9 | python 10 | 11 | Copy code 12 | 13 | import requests 14 | 15 | import csv 16 | 17 | from datetime import datetime, timedelta 18 | 19 | def fetch_weekly_seismic_activity(start_date): 20 | 21 | \# Calculate the end date (7 days after the start date) 22 | 23 | end_date = start_date + timedelta(days=7) 24 | 25 | \# Format dates for the USGS API 26 | 27 | start_date_str = start_date.strftime('%Y-%m-%d') 28 | 29 | end_date_str = end_date.strftime('%Y-%m-%d') 30 | 31 | \# Define the USGS API URL for earthquake data 32 | 33 | usgs_url = f"" 34 | 35 | \# Fetch earthquake data from USGS 36 | 37 | response = requests.get(usgs_url) 38 | 39 | if response.status_code == 200: 40 | 41 | data = response.json() 42 | 43 | return data\['features'\] 44 | 45 | else: 46 | 47 | print("Failed to fetch earthquake data") 48 | 49 | return None 50 | 51 | 
def summarize_earthquake_data(earthquake_data): 52 | 53 | summary = \[\] 54 | 55 | for event in earthquake_data: 56 | 57 | location = event\['properties'\]\['place'\] 58 | 59 | magnitude = event\['properties'\]\['mag'\] 60 | 61 | depth = event\['geometry'\]\['coordinates'\]\[2\] 62 | 63 | affected_areas = event\['properties'\]\['place'\] 64 | 65 | summary.append({ 66 | 67 | 'Location': location, 68 | 69 | 'Magnitude': magnitude, 70 | 71 | 'Depth': depth, 72 | 73 | 'Affected Areas': affected_areas 74 | 75 | }) 76 | 77 | return summary 78 | 79 | def save_summary_to_csv(summary, filename): 80 | 81 | with open(filename, 'w', newline='') as csvfile: 82 | 83 | fieldnames = \['Location', 'Magnitude', 'Depth', 'Affected Areas'\] 84 | 85 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 86 | 87 | writer.writeheader() 88 | 89 | for row in summary: 90 | 91 | writer.writerow(row) 92 | 93 | def main(): 94 | 95 | \# Define the start date for the weekly report (assuming the current date) 96 | 97 | start_date = datetime.now().date() - timedelta(days=datetime.now().weekday()) 98 | 99 | \# Fetch seismic activity data 100 | 101 | earthquake_data = fetch_weekly_seismic_activity(start_date) 102 | 103 | if earthquake_data: 104 | 105 | \# Summarize the data 106 | 107 | summary = summarize_earthquake_data(earthquake_data) 108 | 109 | \# Save the summary to a CSV file 110 | 111 | save_summary_to_csv(summary, 'weekly_seismic_activity_summary.csv') 112 | 113 | print("Weekly seismic activity summary saved to 'weekly_seismic_activity_summary.csv'") 114 | 115 | else: 116 | 117 | print("No seismic activity data found for the week") 118 | 119 | if \_\_name\_\_ == "\_\_main\_\_": 120 | 121 | main() 122 | 123 | This script fetches seismic activity data from the USGS database for the current week (assuming the current date) and summarizes earthquakes with a magnitude above 4.0. It then saves the summary into a CSV file named weekly_seismic_activity_summary.csv. 
You can adjust the filename and path as needed. 124 | 125 | **Create a script that automatically compiles and summarizes the number of new planets confirmed in the previous week using the NASA exoplanet archive data. Include data on planet name, host name and discovery method. Output the summary in a CSV format. \[March 15, 2024\]** 126 | 127 | ChatGPT 128 | 129 | To automate the task of compiling and summarizing the number of new planets confirmed in the previous week using the NASA exoplanet archive data, you can use Python along with libraries like requests for fetching data and csv for handling CSV output. Here's a script that does exactly that: 130 | 131 | python 132 | 133 | Copy code 134 | 135 | import requests 136 | 137 | import csv 138 | 139 | from datetime import datetime, timedelta 140 | 141 | \# Function to fetch data from NASA Exoplanet Archive 142 | 143 | def fetch_exoplanet_data(): 144 | 145 | \# Define URL for the API 146 | 147 | url = '' 148 | 149 | \# Make a request to the API 150 | 151 | response = requests.get(url) 152 | 153 | \# Check if the request was successful 154 | 155 | if response.status_code == 200: 156 | 157 | \# Return the JSON data 158 | 159 | return response.json() 160 | 161 | else: 162 | 163 | \# If request failed, print an error message 164 | 165 | print("Failed to fetch data from NASA Exoplanet Archive.") 166 | 167 | return None 168 | 169 | \# Function to filter planets discovered in the previous week 170 | 171 | def filter_new_planets(data, days=7): 172 | 173 | \# Calculate date of a week ago 174 | 175 | end_date = datetime.now() 176 | 177 | start_date = end_date - timedelta(days=days) 178 | 179 | \# Filter planets discovered in the previous week 180 | 181 | new_planets = \[\] 182 | 183 | for planet in data: 184 | 185 | discovery_date = datetime.strptime(planet\['pl_disc'\], '%Y-%m-%d') 186 | 187 | if start_date <= discovery_date <= end_date: 188 | 189 | new_planets.append(planet) 190 | 191 | return new_planets 192 | 193 | \# 
Function to save data to CSV file 194 | 195 | def save_to_csv(data, filename): 196 | 197 | with open(filename, 'w', newline='') as csvfile: 198 | 199 | fieldnames = \['Planet Name', 'Host Name', 'Discovery Method'\] 200 | 201 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 202 | 203 | writer.writeheader() 204 | 205 | for planet in data: 206 | 207 | writer.writerow({'Planet Name': planet\['pl_name'\], 'Host Name': planet\['pl_hostname'\], 'Discovery Method': planet\['pl_discmethod'\]}) 208 | 209 | \# Main function 210 | 211 | def main(): 212 | 213 | \# Fetch data from NASA Exoplanet Archive 214 | 215 | exoplanet_data = fetch_exoplanet_data() 216 | 217 | if exoplanet_data: 218 | 219 | \# Filter new planets discovered in the previous week 220 | 221 | new_planets = filter_new_planets(exoplanet_data) 222 | 223 | \# Save filtered data to CSV file 224 | 225 | save_to_csv(new_planets, 'new_planets_summary.csv') 226 | 227 | print(f"Summary of {len(new_planets)} new planets discovered in the previous week saved to 'new_planets_summary.csv'.") 228 | 229 | if \_\_name\_\_ == "\_\_main\_\_": 230 | 231 | main() 232 | 233 | This script retrieves data from the NASA Exoplanet Archive, filters out the planets discovered in the previous week, and then saves the summary to a CSV file named new_planets_summary.csv. The summary includes information such as the planet name, host name, and discovery method. 234 | 235 | **Create a script that uses the HAPI API to store data from the Parker Solar Probe in an array. Output the summary in a JSON format. \[March 15, 2024\]** 236 | 237 | ChatGPT 238 | 239 | Below is a Python script that uses the HAPI API to fetch data from the Parker Solar Probe and stores it in an array. 
The script then outputs the summary in JSON format: 240 | 241 | python 242 | 243 | Copy code 244 | 245 | import requests 246 | 247 | import json 248 | 249 | def fetch_data(): 250 | 251 | \# Define HAPI API URL 252 | 253 | url = "" 254 | 255 | try: 256 | 257 | response = requests.get(url) 258 | 259 | if response.status_code == 200: 260 | 261 | data = response.json() 262 | 263 | return data 264 | 265 | else: 266 | 267 | print("Error fetching data. Status code:", response.status_code) 268 | 269 | return None 270 | 271 | except Exception as e: 272 | 273 | print("Error fetching data:", e) 274 | 275 | return None 276 | 277 | def parse_data(data): 278 | 279 | parsed_data = \[\] 280 | 281 | if data: 282 | 283 | for entry in data\['data'\]: 284 | 285 | timestamp = entry\['Epoch_\_CDET_PLUS_V'\] 286 | 287 | \# Assuming there's only one parameter in this example 288 | 289 | parameter_value = entry\['data'\]\[0\] 290 | 291 | parsed_data.append({'timestamp': timestamp, 'parameter_value': parameter_value}) 292 | 293 | return parsed_data 294 | 295 | def main(): 296 | 297 | fetched_data = fetch_data() 298 | 299 | if fetched_data: 300 | 301 | parsed_data = parse_data(fetched_data) 302 | 303 | summary = {'data': parsed_data} 304 | 305 | \# Output summary in JSON format 306 | 307 | print(json.dumps(summary, indent=4)) 308 | 309 | if \_\_name\_\_ == "\_\_main\_\_": 310 | 311 | main() 312 | 313 | Make sure to replace "YOUR_API_KEY" in the URL with your actual NASA API key. This script fetches data from the Parker Solar Probe for a specific date range (in this case, January 1st, 2024, to January 2nd, 2024), and it fetches data for the parameter Epoch_\_CDET_PLUS_V. You can modify the URL parameters according to your requirements. 
-------------------------------------------------------------------------------- /chat-examples/persona.md: -------------------------------------------------------------------------------- 1 | ## Appendix C: Persona Examples 2 | 3 | **Respond to my questions about gravitational waves as if you are an expert astrophysicist. \[March 8, 2024\]** 4 | 5 | ChatGPT 6 | 7 | ChatGPT 8 | 9 | Of course, I'd be happy to help with your questions about gravitational waves. Fire away! 10 | 11 | You 12 | 13 | what are gravitational waves? 14 | 15 | ChatGPT 16 | 17 | Gravitational waves are ripples in the fabric of spacetime itself, caused by the acceleration of massive objects. According to Einstein's theory of general relativity, massive objects like stars and black holes can distort spacetime around them. When these objects accelerate or change speed, they create waves that propagate outward at the speed of light, carrying energy with them. These waves can be thought of as similar to ripples on the surface of a pond, except they are disturbances in the very structure of the universe itself. 18 | 19 | You 20 | 21 | How do we detect gravitational waves? 22 | 23 | ChatGPT 24 | 25 | We detect gravitational waves using incredibly precise instruments called interferometers. These instruments, such as the Laser Interferometer Gravitational-Wave Observatory (LIGO) and Virgo, consist of long, L-shaped arms with lasers bouncing back and forth between mirrors at their ends. When a gravitational wave passes through the Earth, it causes tiny distortions in spacetime, which slightly lengthen or shorten one of the interferometer's arms compared to the other. This change in length is incredibly small, on the order of a fraction of the diameter of a proton, but it can be measured using the interference pattern of the laser light. By comparing the signals from multiple detectors, we can confirm the detection of gravitational waves and determine their direction and strength. 
26 | 27 | You 28 | 29 | What data do we use to validate gravitational wave detection? 30 | 31 | ChatGPT 32 | 33 | Validating gravitational wave detections involves several key pieces of data: 34 | 35 | - Signal Consistency: The signal detected by multiple gravitational wave detectors, such as LIGO and Virgo, should show consistent characteristics in terms of arrival time, frequency, and waveform shape. This helps ensure that the signal is not due to local noise or instrumental artifacts. 36 | - Signal-to-Noise Ratio (SNR): Gravitational wave signals are typically buried within noise. By calculating the signal-to-noise ratio, we can determine if the detected signal is significant enough to be considered a true detection. A higher SNR indicates a more confident detection. 37 | - Matched Filtering: Scientists use theoretical models of gravitational wave signals generated by different astrophysical events, such as black hole mergers or neutron star collisions. They then compare these models with the detected signals using a technique called matched filtering. A good match between the observed data and the predicted waveform provides additional evidence for a genuine detection. 38 | - Consistency with Astrophysical Models: Gravitational wave detections should be consistent with our understanding of astrophysics. For example, the masses and spins of the merging objects inferred from the gravitational wave signal should match theoretical expectations for black holes or neutron stars. 39 | - Electromagnetic Follow-up: Whenever possible, astronomers try to identify electromagnetic counterparts, such as gamma-ray bursts or kilonovae, associated with gravitational wave events. The detection of such counterparts provides independent confirmation of the event and helps to refine our understanding of the astrophysical processes involved. 
40 | 41 | By combining these various pieces of data and analysis techniques, scientists can confidently validate gravitational wave detections and extract valuable information about the universe's most energetic events. 42 | 43 | You 44 | 45 | Is electromagnetic data used in conjunction with gravitational wave data? 46 | 47 | ChatGPT 48 | 49 | Yes, electromagnetic data is indeed used in conjunction with gravitational wave data. When a gravitational wave event is detected, such as the merger of two black holes or neutron stars, astronomers aim to identify any associated electromagnetic emissions across the spectrum, from radio waves to gamma rays. 50 | 51 | This multi-messenger approach provides several benefits: 52 | 53 | - Confirmation of the Event: Detecting electromagnetic signals that coincide with a gravitational wave event helps confirm the nature and location of the astrophysical event. This confirmation increases confidence in the gravitational wave detection. 54 | - Additional Information: Electromagnetic observations can provide additional information about the properties of the objects involved in the gravitational wave event, such as their mass, spin, and composition. This complementary data enriches our understanding of the astrophysical processes at play. 55 | - Probing the Surrounding Environment: Electromagnetic emissions can reveal details about the environment surrounding the gravitational wave source, such as the presence of surrounding matter or the formation of new structures like kilonovae or gamma-ray bursts. 56 | - Studying Astrophysical Phenomena: By combining gravitational wave and electromagnetic data, scientists can study a wide range of astrophysical phenomena, from black hole mergers and neutron star collisions to supernovae and the dynamics of accreting compact objects. 
57 | 58 | Overall, the synergy between gravitational wave and electromagnetic observations enables a more comprehensive understanding of the universe's most energetic and violent events. 59 | 60 | You 61 | 62 | Can you explain more about 'multi-messenger'? 63 | 64 | ChatGPT 65 | 66 | Certainly! The term "multi-messenger" refers to the approach in astrophysics where scientists use different types of messengers, such as electromagnetic radiation (light) and gravitational waves, to study celestial phenomena. Each type of messenger carries unique information about astrophysical events, and by combining data from multiple messengers, scientists can gain a more complete understanding of these events. 67 | 68 | In the context of gravitational wave astronomy, the term "multi-messenger" specifically refers to the combination of gravitational wave observations with electromagnetic observations across various wavelengths, including radio waves, optical light, X-rays, and gamma rays. When a gravitational wave event is detected, astronomers search for corresponding electromagnetic emissions associated with that event. 69 | 70 | For example, when two neutron stars merge, they can produce both gravitational waves and a variety of electromagnetic emissions, such as gamma-ray bursts, optical transients (kilonovae), and radio emissions. By detecting and analyzing these electromagnetic signals along with the gravitational wave signal, scientists can: 71 | 72 | - Confirm the nature and location of the astrophysical event. 73 | - Gain insights into the properties of the objects involved, such as their mass, spin, and composition. 74 | - Study the surrounding environment and the impact of the event on its surroundings. 75 | - Investigate the astrophysical processes responsible for the emission of different types of radiation. 
76 | 77 | The multi-messenger approach allows astronomers to piece together a more comprehensive picture of the universe, revealing new insights into the most energetic and extreme phenomena in the cosmos. 78 | 79 | **Respond to my questions about the formation of gas planets as if you are an expert planetary scientist. \[March 8, 2024\]** 80 | 81 | ChatGPT 82 | 83 | Of course, I'd be happy to discuss the formation of gas planets with you! What would you like to know? 84 | 85 | You 86 | 87 | How are gas planets formed? 88 | 89 | ChatGPT 90 | 91 | Gas planets, such as Jupiter and Saturn, form through a process known as core accretion or gravitational instability. In the core accretion model, the process begins with the accumulation of solid planetesimals, primarily composed of ice and rock, in the protoplanetary disk surrounding a young star. As these planetesimals collide and merge, they gradually build up a solid core. 92 | 93 | Once the core reaches a critical mass, typically several times the mass of Earth, its gravity becomes strong enough to attract and hold onto significant amounts of gas from the surrounding disk. This gas primarily consists of hydrogen and helium, which are the most abundant elements in the universe. 94 | 95 | As the core continues to accrete gas, the planet grows in size and mass. Eventually, it becomes massive enough to gravitationally attract and retain an extensive atmosphere of hydrogen and helium. This process continues until the surrounding protoplanetary disk is depleted, or until the growing planet clears its orbital path of debris. 96 | 97 | In the gravitational instability model, gas giants can form directly from the gravitational collapse of a region within the protoplanetary disk that becomes sufficiently dense and massive to undergo collapse. This process is more common in regions of the disk that are particularly dense or dynamically active. 
98 | 99 | Overall, the formation of gas planets is a complex and dynamic process that involves the interplay of gravitational forces, gas dynamics, and the properties of the protoplanetary disk. 100 | 101 | You 102 | 103 | Where are planets formed via the gravitational instability model? 104 | 105 | ChatGPT 106 | 107 | Planets formed via the gravitational instability model are typically found in regions of protoplanetary disks that are particularly dense and massive, such as the outer regions of the disk where temperatures are lower. 108 | 109 | This model suggests that instabilities within the disk can lead to the formation of clumps or spiral arms, which can then undergo gravitational collapse to form gas giant planets. These instabilities may arise due to various factors, including the gravitational interactions between the disk and the central star, as well as the presence of massive objects or perturbations from nearby stars. 110 | 111 | Observations and simulations indicate that gravitational instability is more likely to occur in disks with higher masses, lower temperatures, and larger radii. Therefore, gas giant planets formed via this model are often found at large distances from their host stars, in regions where the disk is cooler and denser. 112 | 113 | However, it's essential to note that the gravitational instability model is still an area of active research and debate within the field of planetary science. While it provides a viable mechanism for the formation of gas giants, the exact conditions and locations where this process occurs remain topics of ongoing investigation. 114 | 115 | You 116 | 117 | Has the gravitational instability model been observed or is it a conceptual model? 118 | 119 | ChatGPT 120 | 121 | The gravitational instability model for planet formation has not been directly observed, primarily because the process occurs over timescales and distances that are challenging to observe directly. 
Instead, it is a conceptual model based on theoretical principles, numerical simulations, and indirect evidence. 122 | 123 | Observations of protoplanetary disks around young stars provide important insights into the conditions and dynamics within these disks. Researchers use data from telescopes, such as the Atacama Large Millimeter/submillimeter Array (ALMA) and the Hubble Space Telescope, to study the structures and properties of these disks. While direct evidence of gravitational instability in protoplanetary disks has been elusive, some observations, such as spiral arms and clumpy structures, are consistent with the predictions of the gravitational instability model. 124 | 125 | Additionally, numerical simulations play a crucial role in testing and refining theoretical models of planet formation, including the gravitational instability model. By simulating the dynamics of protoplanetary disks and the processes of gas and dust accretion, researchers can explore the conditions under which gravitational instabilities might occur and assess their potential role in forming gas giant planets. 126 | 127 | Overall, while the gravitational instability model has not been directly observed, it remains a viable and actively studied mechanism for planet formation. Ongoing observations, simulations, and theoretical studies continue to refine our understanding of this process and its implications for the formation and evolution of planetary systems. 128 | 129 | You 130 | 131 | Who is actively working on the gravitational instability model in science? 132 | 133 | ChatGPT 134 | 135 | Many researchers and groups around the world are actively studying the gravitational instability model as part of their investigations into the formation of planetary systems. These scientists come from a variety of institutions, including universities, research institutes, and observatories. 
Some prominent researchers and research groups in this field include: 136 | 137 | - Alan Boss (Carnegie Institution for Science, USA) - Boss has been a leading figure in the study of protoplanetary disks and planet formation, including research into the gravitational instability model. 138 | - Richard Nelson (Queen Mary University of London, UK) - Nelson's research focuses on theoretical and computational studies of protoplanetary disks, including investigations into the role of gravitational instabilities in planet formation. 139 | - Phil Armitage (University of Colorado Boulder, USA) - Armitage is known for his work on theoretical models of planet formation and protoplanetary disk dynamics, including research related to gravitational instabilities. 140 | - Anders Johansen (Lund University, Sweden) - Johansen's research interests include the formation of planets and moons through processes such as gravitational instability and core accretion. 141 | - Shigeru Ida (Tokyo Institute of Technology, Japan) - Ida is a planetary scientist known for his contributions to our understanding of planet formation mechanisms, including studies of gravitational instability in protoplanetary disks. 142 | 143 | These are just a few examples, and there are many other scientists and research groups worldwide actively investigating the gravitational instability model and its implications for planet formation. Their work involves a combination of theoretical modeling, numerical simulations, and observational studies aimed at better understanding the complex processes involved in the birth of planetary systems. 144 | 145 | **Respond to my questions about the effects of spaceflight on life as if you are an expert space biologist. \[March 8, 2024\]** 146 | 147 | ChatGPT 148 | 149 | Of course! I'd be delighted to discuss the effects of spaceflight on life from a biological perspective. What questions do you have? 
150 | 151 | You 152 | 153 | What experiments have been conducted in space to better understand how space affects life? 154 | 155 | ChatGPT 156 | 157 | Numerous experiments have been conducted in space to understand the effects of microgravity, radiation, and other space conditions on living organisms. Some notable experiments include studies on the physiological changes experienced by astronauts during spaceflight, such as muscle atrophy, bone density loss, and changes in cardiovascular function. 158 | 159 | Biological experiments have also focused on the growth and development of various organisms in space, ranging from bacteria and plants to animals. These experiments help scientists understand how living organisms adapt to the unique environment of space and how their biological processes are affected. 160 | 161 | For example, the International Space Station (ISS) hosts a wide range of experiments on plant growth in microgravity, which provide insights into how plants respond to gravity cues and how their growth patterns differ in space. Similarly, studies on microorganisms in space help researchers understand how they behave in microgravity and whether they pose any risks to astronauts' health. 162 | 163 | Furthermore, experiments on the impact of space radiation on living organisms are crucial for assessing the long-term health risks of space travel. These studies involve exposing biological samples to simulated space radiation or sending organisms into space to measure their response to cosmic radiation. 164 | 165 | Overall, these experiments are essential for advancing our understanding of the effects of spaceflight on life and for developing strategies to mitigate any negative impacts on astronauts' health and well-being during long-duration space missions. 166 | 167 | You 168 | 169 | How did microgravity affect plant growth on the ISS? 
170 | 171 | ChatGPT 172 | 173 | Studies on plant growth in microgravity aboard the International Space Station (ISS) have revealed several intriguing findings. In the absence of gravity, plant roots exhibit altered growth patterns, often growing in random directions rather than following the typical downward growth trajectory observed on Earth. This phenomenon, known as "random root orientation," suggests that gravity plays a significant role in root gravitropism, the process by which roots sense and respond to gravity. 174 | 175 | Additionally, plants grown in microgravity often display modifications in their overall growth and development, including changes in root architecture, leaf morphology, and reproductive patterns. These alterations are believed to result from the absence of gravitational cues that plants rely on to orient themselves and regulate their growth processes. 176 | 177 | Despite these challenges, plants have demonstrated remarkable adaptability to the microgravity environment of the ISS. Researchers have observed successful germination, growth, and reproduction of various plant species in space, albeit with some differences compared to their terrestrial counterparts. These experiments provide valuable insights into how plants perceive and respond to gravity, as well as their ability to thrive in extraterrestrial environments. 178 | 179 | Moreover, studying plant growth in space has practical implications for future space exploration and long-duration missions. Understanding how plants behave in microgravity can inform the development of advanced life support systems and sustainable food production methods for astronauts during extended missions beyond Earth's orbit. 180 | 181 | In summary, microgravity profoundly influences plant growth and development on the ISS, leading to alterations in root orientation, morphology, and reproductive processes. 
However, plants exhibit remarkable adaptability to the challenges of space, highlighting their potential role in supporting human exploration beyond Earth. 182 | 183 | You 184 | 185 | Where can I find the data related to the ISS plant experiments? 186 | 187 | ChatGPT 188 | 189 | Data related to plant experiments conducted aboard the International Space Station (ISS) are often published in scientific journals and made available through various research databases. Here are some resources where you can find information and data related to ISS plant experiments: 190 | 191 | - NASA's Life Sciences Data Archive (LSDA): NASA maintains a comprehensive archive of data from space biology experiments, including those conducted on the ISS. The LSDA provides access to datasets, publications, and other resources related to plant research in space. 192 | - NASA's GeneLab: GeneLab is an open-access database that hosts omics data (such as genomics, transcriptomics, proteomics, and metabolomics) from spaceflight experiments, including those involving plants. Researchers can explore and analyze datasets from ISS plant experiments to gain insights into the molecular responses of plants to microgravity and other space conditions. 193 | - PubMed Central: PubMed Central is a free digital archive of biomedical and life sciences journal literature. Many research papers reporting findings from ISS plant experiments are indexed in PubMed Central, allowing you to access full-text articles and supplementary data. 194 | - Space Biology Data Repository: Some space agencies and research institutions maintain specialized data repositories or portals dedicated to space biology research. These repositories may include datasets, metadata, and other information related to plant experiments conducted in space. 
195 | - Academic Journals: Scientific journals such as "Astrobiology," "Plant Physiology," "Frontiers in Plant Science," and "Scientific Reports" often publish research articles and data related to plant experiments conducted aboard the ISS. You can search these journals for specific studies and datasets related to ISS plant research. 196 | 197 | By exploring these resources, you can access a wealth of data and information on ISS plant experiments and contribute to ongoing research in space biology. 198 | 199 | You 200 | 201 | Does GeneLab have an API? 202 | 203 | ChatGPT 204 | 205 | As of my last update, GeneLab does not have a publicly available API (Application Programming Interface). However, they do provide access to their data through their web interface and offer various tools and resources for data analysis and exploration. 206 | 207 | Researchers can access GeneLab's data through their website, where they can search, browse, and download datasets from space biology experiments, including those conducted aboard the International Space Station (ISS). GeneLab also provides tools for data visualization, analysis, and integration with other bioinformatics resources. 208 | 209 | While there may not be a dedicated API for GeneLab, researchers can still programmatically access and analyze GeneLab's data using web scraping techniques or by directly downloading datasets from the GeneLab website. Additionally, researchers can collaborate with GeneLab staff to request specific datasets or explore opportunities for data sharing and integration. 210 | 211 | For the most up-to-date information on accessing GeneLab's data and resources, I recommend visiting their website or contacting their team directly for assistance. 
-------------------------------------------------------------------------------- /chat-examples/question-refinement.md: -------------------------------------------------------------------------------- 1 | ## Appendix E: Question Refinement Examples 2 | 3 | **Refine my question about how lightning forms to make it more specific and clear, focusing on the role of ice particles. \[March 7, 2024\]** 4 | 5 | ChatGPT 6 | 7 | "How do ice particles contribute to the formation of lightning, and what specific processes or conditions lead to their involvement in the generation of electrical discharges within thunderstorms?" 8 | 9 | You 10 | 11 | How do ice particles contribute to the formation of lightning, and what specific processes or conditions lead to their involvement in the generation of electrical discharges within thunderstorms? 12 | 13 | ChatGPT 14 | 15 | Ice particles play a crucial role in the formation of lightning within thunderstorms. As water droplets in the upper regions of a thundercloud freeze, they form ice crystals. These ice particles become charged through collisions with other particles, such as graupel (soft hail) or other ice crystals, within the turbulent environment of the cloud. The specific processes and conditions that lead to their involvement in generating electrical discharges within thunderstorms include the separation of charges within the cloud, with positively charged ice particles typically rising to the top and negatively charged particles sinking to the bottom. This charge separation creates an electric field within the cloud, which intensifies as more ice particles collide and become charged. Eventually, the electric field becomes strong enough to overcome the insulating properties of the air, resulting in a lightning discharge between regions of opposite charge within the cloud or between the cloud and the ground. 
16 | 17 | **Refine my question about how spaceflight affects organisms to make it more specific and clear, focusing on the role of microgravity. \[March 7, 2024\]** 18 | 19 | ChatGPT 20 | 21 | How does exposure to microgravity during spaceflight impact the physiological functions and biological processes of organisms, and what are the specific adaptations or changes they undergo in response to this unique gravitational environment? 22 | 23 | You 24 | 25 | How does exposure to microgravity during spaceflight impact the physiological functions and biological processes of organisms, and what are the specific adaptations or changes they undergo in response to this unique gravitational environment? 26 | 27 | ChatGPT 28 | 29 | Your refined question is clear and specific, focusing on the effects of microgravity during spaceflight on the physiological functions and biological processes of organisms, as well as the adaptations or changes they undergo in response to this gravitational environment. This should provide a more focused direction for research or discussion on the topic. 30 | 31 | **Refine my question about how spaceflight affects organisms to make it more specific and clear, focusing on how spaceflight affects genes. \[March 7, 2024\]** 32 | 33 | ChatGPT 34 | 35 | How does spaceflight impact gene expression and genetic stability in organisms, and what are the potential implications for long-term space exploration and colonization? 36 | 37 | You 38 | 39 | How does spaceflight impact gene expression and genetic stability in organisms, and what are the potential implications for long-term space exploration and colonization? 40 | 41 | ChatGPT 42 | 43 | Your refined question is clear and focused. It highlights the specific aspect of gene expression and genetic stability in organisms exposed to spaceflight, while also addressing the broader implications for long-term space exploration and colonization. 
This framing encourages an exploration of the biological effects of space travel and their relevance to future endeavors beyond Earth. 44 | 45 | You 46 | 47 | Answer the following question: How does spaceflight impact gene expression and genetic stability in organisms, and what are the potential implications for long-term space exploration and colonization? 48 | 49 | ChatGPT 50 | 51 | Spaceflight can significantly influence gene expression and genetic stability in organisms. Research indicates that exposure to the unique conditions of space, such as microgravity, radiation, and altered environmental cues, can trigger changes in gene expression patterns. These changes may affect various biological processes, including cell growth, immune function, and metabolism. 52 | 53 | Furthermore, prolonged spaceflight missions pose challenges to genetic stability, as increased radiation exposure can induce DNA damage and mutations. This can lead to genomic instability and potentially compromise the health and viability of organisms. 54 | 55 | Understanding the effects of spaceflight on gene expression and genetic stability is crucial for long-term space exploration and colonization. It allows scientists to develop strategies to mitigate these effects and safeguard the health and well-being of astronauts during extended missions. Additionally, insights gained from studying how organisms respond to space conditions can inform the design of future space habitats and the development of advanced biotechnologies to support life in extraterrestrial environments. 56 | 57 | You 58 | 59 | Does NASA have open data related to spaceflight and gene expression? 60 | 61 | ChatGPT 62 | 63 | Yes, NASA has made significant efforts to share data related to spaceflight research, including studies on gene expression. NASA's GeneLab is an open-access database that provides access to spaceflight-related omics data, including genomics, transcriptomics, proteomics, and metabolomics. 
GeneLab aims to facilitate the study of the molecular responses of organisms to spaceflight conditions. 64 | 65 | Researchers can access GeneLab data through the GeneLab Data System (GLDS), which allows for easy exploration, visualization, and analysis of spaceflight-related omics data. By making this data openly available, NASA enables scientists worldwide to conduct research and gain insights into how spaceflight impacts gene expression and other molecular processes. 66 | 67 | You can explore the GeneLab database and access the data through the official website: genelab.nasa.gov. -------------------------------------------------------------------------------- /chat-examples/recipe.md: -------------------------------------------------------------------------------- 1 | ## Appendix A: Recipe Examples 2 | 3 | **I am trying to preprocess Landsat 8 Level-1 data. I know that I need to find and download the data. I know that I need to complete georeferencing, conversion to radiance, solar corrections and atmospheric corrections. I know I will use the ENVI software. Please provide a complete sequence of steps. Please fill in any missing steps. \[March 15, 2024\]** 4 | 5 | ChatGPT 6 | 7 | Preprocessing Landsat 8 Level-1 data involves several steps to ensure accurate and usable data for analysis. Here's a complete sequence of steps: 8 | 9 | - Data Acquisition and Download: 10 | - Visit the USGS EarthExplorer website or other reliable sources for Landsat data. 11 | - Select Landsat 8 as the satellite and define the area and time period you're interested in. 12 | - Download the Level-1 data for the desired scene. 13 | - Data Preparation: 14 | - Extract the downloaded data to a dedicated folder on your computer. 15 | - Georeferencing: 16 | - Use ENVI's georeferencing tools to ensure that the image is properly aligned with a known coordinate system. 17 | - Choose ground control points (GCPs) from a reliable source such as a georeferenced base map or high-resolution imagery. 
18 | - Perform the georeferencing process and save the output. 19 | - Conversion to Radiance: 20 | - Open the georeferenced Landsat 8 image in ENVI. 21 | - Use the metadata provided with the image to convert digital numbers (DN) to radiance values. 22 | - Apply the appropriate conversion coefficients provided by the Landsat documentation for each spectral band. 23 | - Solar Corrections: 24 | - Calculate solar angles for the acquisition date and time using tools within ENVI or external software. 25 | - Apply the appropriate correction factors to adjust for variations in solar irradiance at the time of image acquisition. 26 | - Ensure consistency across all spectral bands. 27 | - Atmospheric Corrections: 28 | - Perform atmospheric correction to remove the effects of atmospheric scattering and absorption. 29 | - Use ENVI's atmospheric correction tools or external software such as FLAASH or QUAC to apply corrections. 30 | - Select an appropriate atmospheric model (e.g., MODTRAN) and input parameters such as aerosol optical depth and water vapor content. 31 | - Additional Processing (Optional): 32 | - Conduct additional processing steps such as image sharpening, noise reduction, or spectral enhancement based on the specific requirements of your analysis. 33 | - Data Export: 34 | - Export the preprocessed Landsat data to a format suitable for further analysis, such as GeoTIFF or ENVI raster format. 35 | - Ensure that the exported data retains the necessary geospatial information for accurate interpretation and analysis. 36 | - Quality Assessment: 37 | - Perform a visual and quantitative assessment of the preprocessed data to ensure that it meets your quality standards. 38 | - Check for artifacts, anomalies, or inconsistencies that may affect the accuracy of your analysis. 39 | - Documentation: 40 | - Document the preprocessing steps undertaken, including parameter settings, software used, and any adjustments made to the data. 
41 | - Maintain metadata associated with the preprocessed data to facilitate reproducibility and data sharing. 42 | 43 | By following these steps, you should be able to preprocess Landsat 8 Level-1 data effectively for your intended analysis. 44 | 45 | **I am trying to find and download infrared data of the Crab Nebula. I know that I need to identify the various coordinates of the Crab Nebula. I know that I need to search for data across a number of astronomical catalogs. Please provide a complete sequence of steps. Please fill in any missing steps. \[March 15, 2024\]** 46 | 47 | ChatGPT 48 | 49 | Sure, here's a complete sequence of steps to find and download infrared data of the Crab Nebula: 50 | 51 | - Identify Coordinates: 52 | - The Crab Nebula's coordinates are approximately: 53 | - Right Ascension (RA): 05h 34m 31.97s 54 | - Declination (Dec): +22° 00' 52.1" 55 | - Choose an Astronomical Catalog: 56 | - Infrared data of celestial objects can be found in various catalogs. Some popular ones include: 57 | - NASA/IPAC Extragalactic Database (NED) 58 | - NASA Exoplanet Archive 59 | - SIMBAD Astronomical Database 60 | - VizieR Astronomical Catalog Service 61 | - Access the Chosen Catalog: 62 | - Go to the website of the selected catalog. 63 | - Search for the Crab Nebula: 64 | - Enter the coordinates of the Crab Nebula (RA and Dec) into the search bar or use the object name. 65 | - Filter for Infrared Data: 66 | - After finding the Crab Nebula, filter the data to include only infrared observations. This can usually be done through advanced search options or filters. 67 | - Select and Download Data: 68 | - Once you've filtered for infrared data, select the dataset or datasets you're interested in. 69 | - Follow the instructions on the website to download the data. This may involve creating an account or agreeing to certain terms and conditions. 
70 | - Verify Data Format and Compatibility: 71 | - Make sure the downloaded data is in a format compatible with your software or analysis tools. Common formats for astronomical data include FITS (Flexible Image Transport System) and ASCII tables. 72 | - Process and Analyze Data (Optional): 73 | - If necessary, process and analyze the downloaded data using software like IRAF (Image Reduction and Analysis Facility), DS9 (SAOImage DS9), or Python libraries like Astropy. 74 | - Visualize the Data: 75 | - Use visualization tools to view the infrared images or spectra obtained from the data. This could include software like Aladin, SAOImage DS9, or custom Python scripts. 76 | - Interpret the Results: 77 | - Interpret the data to extract meaningful information about the Crab Nebula's infrared properties. This could involve comparing with data at other wavelengths or with theoretical models. 78 | - Cite the Data Source: 79 | - If you're using the data for research or publication, make sure to properly cite the source of the data according to the guidelines provided by the catalog or dataset. 80 | 81 | By following these steps, you should be able to find and download infrared data of the Crab Nebula from various astronomical catalogs. 
class CustomPromptTemplate(BaseChatPromptTemplate):
    """
    This is a custom prompt template that uses the `cmr_template` from `prompts.py`

    It fills in the ReAct scaffolding variables (`agent_scratchpad`, `tools`,
    `tool_names`) from the agent's intermediate steps and tool list before
    rendering the template into a single human message.
    """

    # The raw prompt template string (expected to contain the ReAct placeholders).
    template: str
    # Tools available to the agent; their names/descriptions are injected into the prompt.
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            # `action.log` already contains the LLM's "Thought/Action" text.
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in self.tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        formatted = self.template.format(**kwargs)
        # BaseChatPromptTemplate expects a list of messages, not a string.
        return [HumanMessage(content=formatted)]


class CustomOutputParser(AgentOutputParser):
    """
    This is a custom output parser that parses the output of the LLM agent

    It recognises either a terminal "Final Answer:" marker or a
    "Action: .../Action Input: ..." pair in the raw LLM text.
    """

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input.
        # Allows optional step numbers after "Action"/"Input"; DOTALL lets the
        # input span multiple lines.
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input (surrounding quotes/spaces removed).
        return AgentAction(
            tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
        )
| prompt=self.prompt, 94 | ) 95 | self.create_agent() 96 | 97 | def create_tools(self): 98 | """create tools for the agent""" 99 | self.tools = [ 100 | Tool( 101 | name=BoundingBoxFinderTool().name, 102 | description=BoundingBoxFinderTool().description, 103 | func=BoundingBoxFinderTool().run, 104 | ), 105 | Tool( 106 | name="DateTime Extractor", 107 | description="Extracts time string and converts it to a datetime format", 108 | func=DatetimeChain().run, 109 | ), 110 | Tool( 111 | name=GCMDKeywordSearchTool().name, 112 | description=GCMDKeywordSearchTool().description, 113 | func=GCMDKeywordSearchTool().run, 114 | ), 115 | Tool( 116 | name=CMRQueryTool().name, 117 | description=CMRQueryTool().description, 118 | func=CMRQueryTool().run, 119 | ), 120 | ] 121 | 122 | def create_agent(self): 123 | self.agent = LLMSingleActionAgent( 124 | llm_chain=self.llm_chain, 125 | output_parser=self.output_parser, 126 | stop=["\nObservation:"], 127 | allowed_tools=self.tool_names, 128 | ) 129 | self.agent_executor = AgentExecutor.from_agent_and_tools( 130 | agent=self.agent, tools=self.tools, verbose=True 131 | ) 132 | 133 | def run_query(self, input_text: str): 134 | return self.agent_executor.run(input_text) 135 | 136 | 137 | if __name__ == "__main__": 138 | query_agent = CMRQueryAgent() 139 | query_agent.run("what is the time in new york?") 140 | -------------------------------------------------------------------------------- /code/chains.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import datetime 3 | import os 4 | from typing import Dict 5 | 6 | import dotenv 7 | from langchain import LLMChain, PromptTemplate 8 | from langchain.chat_models import ChatOpenAI 9 | from langchain.llms import OpenAI 10 | from prompting_tools import GCMDKeywordSearchTool, geocode 11 | from prompts import (cmr_summarization_template, datetime_location_template, 12 | datetime_template, multi_evidence_template, 13 | 
multiple_context_qa_template, single_context_qa_template, 14 | summarization_template) 15 | 16 | 17 | class DatetimeChain(LLMChain): 18 | """Find datetime for a given time string""" 19 | 20 | def __init__(self, *args, **kwargs): 21 | today = datetime.date.today() 22 | today_string = ( 23 | f"Assume the current year and month is {today.year} and {today.month}." 24 | ) 25 | template = datetime_template.strip() + today_string 26 | prompt = PromptTemplate( 27 | template=template, 28 | input_variables=["datetime"], 29 | ) 30 | super().__init__(prompt=prompt, llm=OpenAI(temperature=0), *args, **kwargs) 31 | 32 | def _run(self, timestring: str) -> str: 33 | """Find datetime for a given time string""" 34 | return self.predict(datetime=timestring) 35 | 36 | async def _arun(self, timestring: str) -> str: 37 | """asynchronous call to find datetime for a given time string""" 38 | return self.predict(datetime=timestring) 39 | 40 | 41 | class DatetimeLocationFinderChain(LLMChain): 42 | """Find datetime for a given time string""" 43 | 44 | def __init__(self, *args, **kwargs): 45 | prompt = PromptTemplate( 46 | template=datetime_location_template.strip(), 47 | input_variables=["text"], 48 | ) 49 | super().__init__(prompt=prompt, llm=OpenAI(temperature=0), *args, **kwargs) 50 | 51 | def _run(self, timestring: str) -> str: 52 | """Find datetime for a given time string""" 53 | return self.predict(datetime=timestring) 54 | 55 | async def _arun(self, timestring: str) -> str: 56 | """asynchronous call to find datetime for a given time string""" 57 | return self.predict(datetime=timestring) 58 | 59 | 60 | class QASummarizeChain(LLMChain): 61 | """Summarize a given text, using a question answering format and answer the question""" 62 | 63 | def __init__(self, *args, **kwargs): 64 | prompt = PromptTemplate( 65 | template=summarization_template, 66 | input_variables=["query", "contexts"], 67 | ) 68 | super().__init__( 69 | prompt=prompt, 70 | llm=ChatOpenAI(model_name="gpt-3.5-turbo", 
temperature=0), 71 | *args, 72 | **kwargs, 73 | ) 74 | 75 | def _run(self, inputs: Dict) -> str: 76 | """Summarize a given text""" 77 | return self.predict( 78 | query=inputs["query"], context="\n".join(inputs["contexts"]) 79 | ) 80 | 81 | async def _arun(self, inputs: Dict) -> str: 82 | """asynchronous call to Summarize a given text""" 83 | return self.predict( 84 | query=inputs["query"], context="\n".join(inputs["contexts"]) 85 | ) 86 | 87 | class CombinedQAChain(LLMChain): 88 | """Summarize a given text, using a question answering format and answer the question""" 89 | 90 | def __init__(self, *args, **kwargs): 91 | prompt = PromptTemplate( 92 | template=multiple_context_qa_template, 93 | input_variables=["query", "contexts"], 94 | ) 95 | super().__init__( 96 | prompt=prompt, 97 | llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7), 98 | *args, 99 | **kwargs, 100 | ) 101 | 102 | def _run(self, inputs: Dict) -> str: 103 | """Summarize a given text""" 104 | return self.predict( 105 | query=inputs["query"], context="\n".join(inputs["contexts"]) 106 | ) 107 | 108 | async def _arun(self, inputs: Dict) -> str: 109 | """asynchronous call to Summarize a given text""" 110 | return self.predict( 111 | query=inputs["query"], context="\n".join(inputs["contexts"]) 112 | ) 113 | 114 | 115 | class SingleQAChain(LLMChain): 116 | """question answering chain to answer the question based on a given context""" 117 | 118 | def __init__(self, *args, **kwargs): 119 | prompt = PromptTemplate( 120 | template=single_context_qa_template, 121 | input_variables=["query", "context"], 122 | ) 123 | if "model_name" in kwargs: 124 | if kwargs["model_name"] == "gpt-3.5-turbo": 125 | super().__init__( 126 | prompt=prompt, 127 | llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 128 | ) 129 | else: 130 | super().__init__( 131 | prompt=prompt, 132 | llm=OpenAI(model_name=kwargs["model_name"], temperature=0.7), 133 | ) 134 | else: 135 | super().__init__( 136 | prompt=prompt, 137 | 
llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 138 | ) 139 | 140 | 141 | def _run(self, inputs: Dict) -> str: 142 | """Summarize a given text""" 143 | return self.predict(query=inputs["query"], context=inputs["context"]) 144 | 145 | async def _arun(self, inputs: Dict) -> str: 146 | """asynchronous call to Summarize a given text""" 147 | return self.predict(query=inputs["query"], context=inputs["context"]) 148 | 149 | 150 | class CMRSummarizeChain(LLMChain): 151 | """Summarize a given structured CMR query result, using a question answering format and answer the question""" 152 | 153 | def __init__(self, *args, **kwargs): 154 | prompt = PromptTemplate( 155 | template=cmr_summarization_template, 156 | input_variables=["query", "cmr_responses"], 157 | ) 158 | super().__init__( 159 | prompt=prompt, 160 | llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 161 | *args, 162 | **kwargs, 163 | ) 164 | 165 | def _run(self, inputs: Dict) -> str: 166 | """Summarize a given text""" 167 | 168 | return self.predict( 169 | query=inputs["query"], context="\n".join(inputs["cmr_responses"]) 170 | ) 171 | 172 | async def _arun(self, inputs: Dict) -> str: 173 | """asynchronous call to Summarize a given text""" 174 | return self.predict( 175 | query=inputs["query"], context="\n".join(inputs["cmr_responses"]) 176 | ) 177 | 178 | class EvidenceSelectorChain(LLMChain): 179 | """select the most accurate evidence from a list of evidences, based on the question""" 180 | 181 | def __init__(self, *args, **kwargs): 182 | prompt = PromptTemplate( 183 | template=multi_evidence_template, 184 | input_variables=["query", "evidences"], 185 | ) 186 | super().__init__( 187 | prompt=prompt, 188 | llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 189 | *args, 190 | **kwargs, 191 | ) 192 | 193 | def _run(self, inputs: Dict) -> str: 194 | """Summarize a given text""" 195 | assert isinstance(inputs["evidences"]) == list 196 | return self.predict( 197 | query=inputs["query"], 
context="\n\n".join(inputs["evidences"]) 198 | ) 199 | 200 | async def _arun(self, inputs: Dict) -> str: 201 | """asynchronous call to Summarize a given text""" 202 | assert isinstance(inputs["evidences"]) == list 203 | return self.predict( 204 | query=inputs["query"], context="\n\n".join(inputs["evidences"]) 205 | ) 206 | -------------------------------------------------------------------------------- /code/chat_agent.py: -------------------------------------------------------------------------------- 1 | ### deprecated code, only for reference 2 | import os 3 | import re 4 | 5 | import dotenv 6 | import httpx 7 | import openai 8 | from cmr import CollectionQuery, GranuleQuery 9 | 10 | collection_query = CollectionQuery() 11 | granule_query = GranuleQuery() 12 | action_re = re.compile("^Action: (\w+): (.*)$") 13 | 14 | 15 | ### this is chat agent using react, without the use of Langchain 16 | class ChatBot: 17 | def __init__(self, system=""): 18 | self.system = system 19 | self.messages = [] 20 | if self.system: 21 | self.messages.append({"role": "system", "content": system}) 22 | 23 | async def __call__(self, message): 24 | self.messages.append({"role": "user", "content": message}) 25 | result = await self.execute() 26 | self.messages.append({"role": "assistant", "content": result}) 27 | return result 28 | 29 | async def execute(self): 30 | completion = openai.ChatCompletion.create( 31 | model="gpt-3.5-turbo", messages=self.messages 32 | ) 33 | # Uncomment this to print out token usage each time, e.g. 
34 | # {"completion_tokens": 86, "prompt_tokens": 26, "total_tokens": 112} 35 | print(completion.usage) 36 | return completion.choices[0].message.content 37 | 38 | 39 | async def get_keyword(self, keyword, gpt_embedder): 40 | """Get the nearest keyword to the given keyword""" 41 | return gpt_embedder.find_nearest_kw(keyword) 42 | 43 | 44 | async def cmr_collection_search(action_input): 45 | """Search CMR using the CMR rest API""" 46 | CMR_SEARCH_URL = "https://cmr.earthdata.nasa.gov/search/collections" 47 | async with httpx.AsyncClient() as client: 48 | response = await client.get(CMR_SEARCH_URL, params={"keyword": action_input}) 49 | return response.json() 50 | 51 | 52 | async def cmr_search_py(action_input): 53 | # example action_input "bbox=[2.224122, 48.8155755, 2.4697602, 48.902156] && datetime=['2019-01-01T00:00:00Z', '2019-01-31T23:59:59Z'] && keyword: aerosol" 54 | # TODO: extract bbox, datetime, keyword in format: 55 | # keyword = aerosol, 56 | # bbox=[-180, -90, 180, 90], 57 | # start_date="2000-01-01T00:00:00Z", 58 | # end_date="2021-01-01T00:00:00Z", 59 | keyword = re.search("keyword: (.*)", action_input).group(1) 60 | bbox_match = re.search(r"bbox=\[(.*?)\]", action_input) 61 | datetime_match = re.search(r"datetime=\[(.*?)\]", action_input) 62 | 63 | # Check if bbox and datetime arrays are found 64 | if bbox_match and datetime_match: 65 | bbox_str = bbox_match.group(1) 66 | datetime_str = datetime_match.group(1) 67 | bbox = [float(x) for x in bbox_str.split(",")] 68 | datetime = [x for x in datetime_str.split(",")] 69 | start_date = datetime[0].replace("'", "").strip() 70 | end_date = datetime[1].replace("'", "").strip() 71 | print("bbox:", bbox) 72 | print("start_date:", start_date) 73 | print("end_date:", end_date) 74 | print("keyword:", keyword) 75 | result = ( 76 | collection_query.keyword(keyword) 77 | .bounding_box(*bbox) 78 | .temporal(start_date, end_date) 79 | ).get(1)[0] 80 | # filter by keys if they exist: 81 | filter_keys = [ 82 | "title", 
83 | "archive center", 84 | "time_start", 85 | "updated", 86 | "links", 87 | ] 88 | filtered_result = { 89 | result_key: result[result_key] 90 | for result_key in result.keys() 91 | if result_key in filter_keys 92 | } 93 | 94 | return filtered_result 95 | 96 | 97 | async def query(question, max_turns=5): 98 | i = 0 99 | bot = ChatBot(bbox_time_kw_template) 100 | next_prompt = question 101 | while i < max_turns: 102 | i += 1 103 | result = await bot(next_prompt) 104 | print(result) 105 | actions = [action_re.match(a) for a in result.split("\n") if action_re.match(a)] 106 | if actions: 107 | # There is an action to run 108 | action, action_input = actions[0].groups() 109 | if action not in known_actions: 110 | raise Exception("Unknown action: {}: {}".format(action, action_input)) 111 | print(" -- running {} {}".format(action, action_input)) 112 | observation = await known_actions[action](action_input) 113 | print("Observation:", observation) 114 | 115 | # If the action is querying the CMR API, just return the results, don't re-prompt 116 | 117 | next_prompt = "Observation: {}".format(observation) 118 | else: 119 | return result 120 | 121 | 122 | known_actions = { 123 | "cmr_search": cmr_search_py, 124 | "calculate": calculate, 125 | "geocode": geocode, 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /code/contexts.py: -------------------------------------------------------------------------------- 1 | contexts = [ 2 | 'Harmonized Landsat Sentinel-2 (HLS) Product User Guide Product Version 2.0 J.G. Masek, J. Ju, M. Claverie, S. Skakun, J.-C. Roger, E. Vermote, B. Franch, Z. Yin, J. L. Dungan Principal Investigator: Dr. Jeffrey G. 
Masek, NASA/GSFC Correspondence email address: Jeffrey.G.Masek@nasa.gov 1 Acronyms AROP BRDF BT CMG ETM+ GDAL GLS HDF HLS KML MGRS MSI NBAR OLI QA RSR SDS SR SZA TM TOA UTM WRS Automated Registration and Orthorectification Package Bidirectional Reflectance Distribution Function Brightness temperature Climate Modelling Grid Enhanced Thematic Mapper Plus Geospatial Data Abstraction Library Global Land Survey Hierarchical Data Format Harmonized Landsat and Sentinel-2 Keyhole Markup Language Military Grid Reference System Multi-Spectral Instrument Nadir BRDF-normalized Reflectance Operational Land Imager Quality assessment Relative spectral response Scientific Data Sets Surface reflectance Sun zenith angle Thematic Mapper Top of atmosphere Universal Transverse Mercator Worldwide Reference System 2 1 Introduction The Harmonized Landsat and Sentinel-2 (HLS) project is a NASA initiative and collaboration with USGS to produce compatible surface reflectance (SR) data from a virtual constellation of satellite sensors, the Operational Land Imager (OLI) and Multi-Spectral Instrument (MSI) onboard the Landsat-8 and Sentinel-2 remote sensing satellites respectively. The combined measurement enables global land observation every 2-3 days at moderate (30 m) spatial resolution. The HLS project uses a set of algorithms to derive seamless products from OLI and MSI: atmospheric correction, cloud and cloud-shadow masking, spatial co-registration and common gridding, view angle normalization and spectral bandpass adjustment. The HLS data products can be regarded as the building blocks for a "data cube" so that a user may examine any pixel through time and treat the near-daily reflectance time series as though it came from a single sensor. The HLS suite contains two products, S30 and L30, derived from Sentinel-2 L1C and Landsat L1TP (Collection 2) input, respectively. They are gridded into the same MGRS tiles with a 30m pixel size. 
2 New in v2.0 HLS v2.0 builds on v1.4 by updating and improving processing algorithms, expanding spatial coverage, and providing validation. Particular updates are as follows: - Global coverage. All global land, including major islands but excluding Antarctica, is covered. - Input data. Landsat 8 Collection-2 (C2) data from USGS are used as input; better geolocation is expected as C2 data use the Sentinel-2 Global Reference Image (GRI) as an absolute reference. - Atmospheric correction. A USGS C version of LaSRCv3.5.5 is applied for both Landsat 8 and Sentinel-2 data for computational speedup. LaSRCv3.5.5 has been validated for both Landsat 8 and Sentinel-2 within the CEOS ACIX-I (Atmospheric Correction Inter-Comparison eXercise, http://calvalportal.ceos.org/projects/acix). - QA band. The QA band is generated exclusively by and named after Fmask, consistently for the two HLS products (S30 and L30). Like in v1.4 aerosol thickness level from atmospheric correction is also incorporated into the QA band. - BRDF adjustment. BRDF adjustment mainly normalizes the view angle effect, with the sun zenith angle largely intact. This adjustment is applied to the Sentinel-2 red-edge bands as well. - Sun and view angle bands are provided. - Product format. The product is delivered in individual Cloud Optimized GeoTIFF (COG) files to allow for spectral and spatial subsetting in applications. - Temporal Coverage and Latency. Version v2.0 moves toward "keep up" processing. The intent is to continually update products with <2-day latency. Users are cautioned however that HLS is still a research product. 3 Products overview 3.1 Input data The Operational Land Imager (OLI) sensor is a moderate spatial resolution multi-spectral imager onboard the Landsat-8 satellite, in a sun-synchronous orbit with a 705 km altitude and a 16-day 3 repeat cycle. The sensor acquires data with a 15-degree field of view resulting in approximately a 185 km image swath. 
The OLI sensor has 9 solar reflective bands and the data are co-registered with the data from the 2-band TIRS (Thermal Infrared Sensor) instrument onboard the same Landsat-8 satellite (Irons et al., 2012). The native spatial resolution is 30 m for OLI and 100 m for TIRS, but TIRS data are resampled to 30 m for distribution. HLS v2.0 uses Landsat-8 Collection-21 Level-1 top-of-atmosphere (TOA) product as input: for "keep-up" processing, the Real-Time data with geolocation RMSE <= 12 m (i.e. Tier-1 equivalent) are used and, for back processing, Tier-1 data are used. The Real-Time TOA OLI data have the same quality as the tier-based data do, but the Real-Time TIRS data may have lesser geolocation and radiometric quality. The Sentinel-2 Multi-Spectral Instrument (MSI) is onboard the Sentinel-2A and -2B satellites in a sun-synchronous orbit with a 786 km altitude and a combin', 3 | 'create a mixture situation. The "presence" rule is used: a QA bit value 1 in any of the 4 input pixels causes the output bit to be set to 1 for an L30 pixel and an output L30 QA bit is set to 0 only if all the 2x2 input pixels have 0 at that bit. 12 - S30: 10 m QA are resampled to 30 m using the same "presence" rule. A QA bit value 1 in any of the nine nesting input 10 m pixels causes the output bit to be set to 1 for S30. That is, a QA bit for S30 is set to 0 only if the input QA bit values for all the nine nesting 10 m pixels are 0. If a resampling window contains a mixture of QA bits, the "presence" rule will make the output QA bits not mutually exclusive. For example, the output QA bits may indicate a pixel is cloud, cloud shadow and water at the same time. This is not a mistake; the mixture nature of HLS QA bits allows the users to select/discard data in a way they want. The aerosol optical thickness level resample is special because it has two bits (bits 6-7 in QA). It is resampled in such a way that a higher aerosol thickness level dominates a lower aerosol level in output. 
For example, if any of the input pixels in the sampling window has "high aerosol," the output aerosol level will be "high aerosol," and if the highest aerosol level in the sampling window is "moderate aerosol," the output will be "moderate aerosol," and so on. 5 Spatial coverage HLS v2.0 covers all the global land except Antarctica, as depicted in a land mask (Fig. 5) derived from the NOAA shoreline dataset (https://www.ngdc.noaa.gov/mgg/shorelines/data/gshhg/latest/). Antarctica is excluded because of low solar elevations which compromise the plane-parallel atmospheric correction. Note that the data acquisition over some small oceanic islands by the Landsat and Sentinel-2 sensor may not be made regularly. Figure 5: HLS v2.0 covers the global land, including major islands but excluding Antarctica. 6 Product formats 6.1 File format HLS 2.0 products are in Cloud Optimized GeoTIFF (COG), one file per data layer to offer the flexibility of only downloading the needed data layers and, for cloud-based applications, the needed spatial subsets within a tile. The COG files are internally compressed. 13 L30 data are stored in directories such as HLS.L30.T17SLU.2020209T155956.v2.0/, which suggests L30 over tile 17SLU from data acquired on day 209 of 2020 specifically at UTC 155956. 
This example product consists of the following files: HLS.L30.T17SLU.2020209T155956.v2.0.B01.tif HLS.L30.T17SLU.2020209T155956.v2.0.B02.tif HLS.L30.T17SLU.2020209T155956.v2.0.B03.tif HLS.L30.T17SLU.2020209T155956.v2.0.B04.tif HLS.L30.T17SLU.2020209T155956.v2.0.B05.tif HLS.L30.T17SLU.2020209T155956.v2.0.B06.tif HLS.L30.T17SLU.2020209T155956.v2.0.B07.tif HLS.L30.T17SLU.2020209T155956.v2.0.B09.tif HLS.L30.T17SLU.2020209T155956.v2.0.B10.tif HLS.L30.T17SLU.2020209T155956.v2.0.B11.tif HLS.L30.T17SLU.2020209T155956.v2.0.Fmask.tif HLS.L30.T17SLU.2020209T155956.v2.0.SZA.tif HLS.L30.T17SLU.2020209T155956.v2.0.SAA.tif HLS.L30.T17SLU.2020209T155956.v2.0.VZA.tif HLS.L30.T17SLU.2020209T155956.v2.0.VAA.tif HLS.L30.T17SLU.2020209T155956.v2.0.cmr.xml HLS.L30.T17SLU.2020209T155956.v2.0.json HLS.L30.T17SLU.2020209T155956.v2.0.jpg The filenames for individual spectral bands and Fmask cloud mask are self-explaining. Sun zenith angle (SZA), sun azimuth angle (SAA), view zenith angle (VZA) and view azimuth angle (VAA) file are also provided; see Section 6.4 for details. File HLS.L30.T17SLU.2020209T155956.v2.0.cmr.xml is the metadata file, HLS.L30.T17SLU.2020209T155956.v2.0.json contains the size and checksum value of each file, and HLS.L30.T17SLU.2020209T155956.v2.0.jpgis a natural-color browse image. The UTC time in the filenames is the sensing time at the input Landsat-8 scene center. After gridding into the MGRS tiles it does not accurately indicate the sensing time over the tile. If two scenes overlap a MGRS tile, the sensing time of one the scenes are chosen by chance. So, this timing information is not accurate for the MGRS tile center; it is intended mainly as an identifier, not for quantitative analysis. S30 data are stored in the same format. 
An example directory HLS.S30.T17SLU.2020117T160901.v2.0 may contain the following files: HLS.S30.T17SLU.2020117T160901.v2.0.B01.tif HLS.S30.T17SLU.2020117T160901.v2.0.B02.tif HLS.S30.T17SLU.2020117T160901.v2.0.B03.tif HLS.S30.T17SLU.2020117T160901.v2.0.B04.tif HLS.S30.T17SLU.2020117T160901.v2.0.B05.tif HLS.S30.T17SLU.2020117T160901.v2.0.B06.tif HLS.S30.T17SLU.2020117T160901.v2.0.B07.tif HLS.S30.T17SLU.2020117T160901.v2.0.B08.tif HLS.S30.T17SLU.2020117T160901.v2.0.B8A.tif HLS.S30.T17SLU.2020117T160901.v2.0.B09.tif HLS.S30.T17SLU.2020117T160901.v2.0.B10.tif 14 HLS.S30.T17SLU.2020117T160901.v2.0.B11.tif HLS.S30.T17SLU.2020117T160901.v2.0.B12.tif HLS.S30.T17SLU.2020117T160901.v2.0.Fmask.tif HLS.S30.T17SLU.2020117T160901.v2.0.SZA.tif HLS.S30.T17SLU.2020117T160901.v2.0.SAA.tif HLS.S30.T17SLU.2020117T160901.v2.0.VZA.tif HLS.S30.T17SLU.2020117T16', 4 | 'SLU.2020117T160901.v2.0.SAA.tif HLS.S30.T17SLU.2020117T160901.v2.0.VZA.tif HLS.S30.T17SLU.2020117T160901.v2.0.VAA.tif HLS.S30.T17SLU.2020117T160901.v2.0.cmr.xml HLS.S30.T17SLU.2020117T160901.v2.0.json HLS.S30.T17SLU.2020117T160901.v2.0.jpg The UTC time in the S30 product filenames is the time the sensor begins to sense the sun-lit side of the earth for each orbit, not the exact sensing time over the tile center. When a sequence of observations is available on the same day at the high latitude, they can still be differentiated by this timing information. 6.2 L30 The product L30 contains Landsat-8 OLI surface reflectance and TOA TIRS brightness temperature gridded at 30 m spatial resolution in MGRS tiles. Table 6 lists all the data layers of the L30 product. Table 6: All the data layers of the L30 product (SR = Surface Reflectance, NBAR = Nadir BRDF- normalized Reflectance, TOA Refl. = Top of Atmosphere Reflectance, TOA BT = Top of Atmosphere Brightness temperature). 
Units Data type Scale Fill value Spatial Resolution Description reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance degree degC degree degC int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 int16 0.01 0.01 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 - none uint8 - 255 30 30 30 30 30 30 30 30 30 30 30 NBAR TOA Refl. TOA BT Quality bits OLI band number 1 2 3 4 5 6 7 9 10 11 Data layer B01 B02 B03 B04 B05 B06 B07 B09 B10 B11 FMASK (Table 9) S30 The product S30 contains MSI surface reflectance at 30 m spatial resolution. Table 7 lists all the data layers of the S30 product. 15 Table 7: list of the SDS of the S30 product (SR = Surface Reflectance, NBAR = Nadir BRDF-Adjusted Reflectance, TOA Refl. = Top of Atmosphere Reflectance). Units Data type Scale Fill value Spatial Resolution Description MSI band number 1 2 3 4 5 6 7 8 8A 9 10 11 12 Data layer B01 B02 B03 B04 B05 B06 B07 B08 B8A B09 B10 B11 B12 FMASK (Table 9) reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance reflectance int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 int16 0.0001 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 -9999 - none uint8 - 255 30 30 30 30 30 30 30 30 30 30 30 30 30 30 NBAR TOA Refl. NBAR Quality bits 6.3 The sun and view angles HLS v2.0 also provides the sun zenith/azimuth and view zenith/azimuth angles used in BRDF correction; in case a user may want to do BRDF correction differently. The S30 angle data is interpolated from the ESA-provided 5 km angles in a text form; HLS selects the view angle of the 2nd red-edge band and uses it on all bands. 
The L30 angle data is provided in the Collection-2 data; it is originally derived for the red band and is representative of all bands. Table 8: Description of the sun and view angles. Angle band Units Data type Scaling factor Fill value Spatial resolution Sun zenith degrees uint16 Sun azimuth degrees uint16 View zenith degrees uint16 View azimuth degrees uint16 0.01 0.01 0.01 0.01 40,000 40,000 40,000 40,000 30 m 30 m 30 m 30 m 16 6.4 Quality Assessment layer HLS v2.0 products have one Quality Assessment (QA) layer, generated from Fmask 4.2, and named after Fmask. The Fmask integer output is converted to the bit representation (Table 9) as in HLS v1.4. The HLS processing dilates the Fmask cloud and cloud shadow by 5 pixels for L30 and S30 and labels the dilation as "Adjacent to cloud/shadow." The qualitative aerosol optical thickness level from atmospheric correction is also incorporated. Table 9: Description of the bits in the one-byte Quality Assessment layer. Bits are listed from the MSB (bit 7) to the LSB (bit 0) Bit number Mask name Bit value 6-7 aerosol level 11 10 01 00 5 4 3 2 1 0 Water Snow/ice Cloud shadow Adjacent to cloud/shadow Cloud Cirrus 1 0 1 0 1 0 1 0 1 0 Reserved, but not used Mask description High aerosol Moderate aerosol Low aerosol Climatology aerosol Yes No Yes No Yes No Yes No Yes No NA See Appendix A on how to decode the QA bits with simple integer arithmetic. 17 6.5 Metadata Metadata about the L30 and S30 products is presented in the xmr.xml file. 6.5.1 Key metadata elements for L30 from the complete XML list include: * LANDSAT_PRODUCT_ID The Landsat-8 input L1 scene product ID for processing backtracing. If two adjacent scenes from the same WRS path overlap the same MGRS tile, both product IDs are reported. * SENSING_TIME The WRS scene center sensing time, carried over from the Level-1 metadata; not precisely represented the data gridded into the tile. When two scenes overlap the tile, the sensing time for each is retained. 
"""download data from the web and save it to the data folder"""

from bs4 import BeautifulSoup


def csda_doc_downloader():
    """Placeholder for the CSDA document downloader (not yet implemented)."""
    pass


def get_links_from_xml_dump(txt_file):
    """Extract hyperlinks from a saved web-page dump.

    Args:
        txt_file: Path to a text file containing the page's HTML markup.

    Returns:
        list[str]: href values of all anchor tags that point to an
        http(s) URL.
    """
    with open(txt_file, 'r') as f:
        txt = f.read()
    soup = BeautifulSoup(txt, 'html.parser')
    anchors = soup.find_all('a')
    # Bug fix: the original filter was `if "http"` — a constant truthy
    # string — so nothing was filtered and anchors without an href
    # produced None entries. Filter on the actual href value instead.
    hrefs = [anchor.get('href') for anchor in anchors]
    return [href for href in hrefs if href and href.startswith('http')]


if __name__ == '__main__':
    print(get_links_from_xml_dump('texts.txt'))
def expand_keywords(paths_list):
    """Expand keyword paths so every ancestor prefix becomes its own entry.

    For a path like "A > B > C" this yields "A", "A > B", "A > B > C".

    Args:
        paths_list: List of " > "-joined keyword path strings.

    Returns:
        list[str]: Every prefix of every input path, in input order.
        Duplicate prefixes are kept, matching the original behavior.
    """
    expanded_paths_list = []
    for path in paths_list:
        parts = path.split(" > ")
        for depth in range(1, len(parts) + 1):
            expanded_paths_list.append(" > ".join(parts[:depth]))
    # Bug fix: the original had a bare `expanded_paths_list` expression
    # statement before the return — dead code, now removed.
    return expanded_paths_list
"/Users/mramasub/work/cmr-prompt-chain/data/keywords.json" 26 | if utils.path_exists(keywords_file): 27 | self.kws = self.read_kws(keywords_file) 28 | else: 29 | # exit 30 | pass 31 | self.model = "text-embedding-ada-002" 32 | self.embeddings = ( 33 | self.load_from_pkl(embeddings_file) 34 | if utils.path_exists(embeddings_file) 35 | else self.create_embeddings() 36 | ) 37 | 38 | assert len(self.kws) == len(self.embeddings) 39 | 40 | def create_embeddings(self): 41 | """Create embeddings for all keywords""" 42 | kws_to_embed = self.kws 43 | embeddings = self._embed_langchain(kws_to_embed) 44 | return np.array(embeddings) 45 | 46 | def _embed_langchain(self, texts, use_text_to_add=True): 47 | text_to_add = "" 48 | if use_text_to_add: 49 | text_to_add = self.text_to_add 50 | return self.embedder.embed_documents([text_to_add + text for text in texts]) 51 | 52 | def _embed(self, texts, use_text_to_add=True): 53 | text_to_add = "" 54 | if use_text_to_add: 55 | text_to_add = self.text_to_add 56 | return openai.Embedding.create( 57 | input=[text_to_add + kw for kw in texts], model=self.model 58 | ) 59 | 60 | def find_nearest_kw(self, keyword, top_n=1): 61 | """Find the nearest keyword to the given keyword""" 62 | embedding = self._embed_langchain([keyword], use_text_to_add=False) 63 | embedding = np.array(embedding) 64 | distances = np.linalg.norm(self.embeddings - embedding, axis=1) 65 | 66 | return [self.kws[i] for i in np.argsort(distances)[:top_n]] 67 | 68 | def read_kws( 69 | self, 70 | file, 71 | ): 72 | """Read keywords from file""" 73 | kws = [] 74 | 75 | with open(file, "r", encoding="utf-8") as f: 76 | kws = json.load(f) 77 | return [kw["keyword_string"] for kw in kws] 78 | 79 | def load_from_pkl(self, file): 80 | """Load embeddings from pickle file""" 81 | return np.load(file) 82 | -------------------------------------------------------------------------------- /code/parse_docs.py: -------------------------------------------------------------------------------- 
# code with all the best practices followed

import os
import re
import subprocess
import sys

import fire
from tqdm import tqdm


def parse_docs(txt_file, store_path):
    """Download every PDF linked in a text file into a folder.

    Args:
        txt_file: Text file with one URL per line; only lines starting
            with "http" are downloaded.
        store_path: Directory in which to store the downloaded PDFs,
            each named after its line index (e.g. "0.pdf").

    Returns:
        str: The directory the PDFs were stored in (``store_path``).
    """
    with open(txt_file, "r") as f:
        lines = f.readlines()
    for i, line in tqdm(enumerate(lines)):
        # Bug fix: the original interpolated the raw line — including its
        # trailing newline — straight into an os.system() shell string,
        # which both corrupted the command and allowed shell injection
        # via a crafted "URL". Strip the line and pass an argument list.
        url = line.strip()
        if url.startswith("http"):
            pdf_path = os.path.join(store_path, f"{i}.pdf")
            subprocess.run(["wget", "-O", pdf_path, url], check=False)
            print(f"pdf file stored at {pdf_path}")
    return store_path


if __name__ == "__main__":
    fire.Fire(parse_docs)
def geocode(text: str) -> str:
    """Geocode a query (location, region, or landmark) into a CMR bounding box.

    Args:
        text: Free-text location query.

    Returns:
        str: "bounding_box[]=minLon,minLat,maxLon,maxLat" on success, or
        "Cannot parse the query" when the geocoder returns no match —
        now consistent with BoundingBoxFinderTool._run.
    """
    response = opencage_geocoder.geocode(text, no_annotations="1")
    if not response:
        # Bug fix: the original had no else branch and implicitly
        # returned None on a failed lookup.
        return "Cannot parse the query"
    bounds = response[0]["bounds"]
    # Convert the geocoder bounds to a CMR-style bbox string.
    bbox = "{},{},{},{}".format(
        bounds["southwest"]["lng"],
        bounds["southwest"]["lat"],
        bounds["northeast"]["lng"],
        bounds["northeast"]["lat"],
    )
    return f"bounding_box[]={bbox}"
async def _arun(self, tool_input: str) -> str:
    """Asynchronous CMR query: delegate to the synchronous implementation.

    Bug fix: the original called ``self._filter_response``, a method that
    does not exist anywhere in this class, so every async invocation
    raised AttributeError (it also wrapped the result in a list, while
    the declared return type and the sync path both use a plain string).
    """
    return self._run(tool_input)
125 | 126 | def _run(self, tool_input: str, cmr_formatted=False) -> str: 127 | """Search for a keyword in GCMD""" 128 | if cmr_formatted: 129 | return self.get_formatted_science_kws(tool_input) 130 | else: 131 | return self.get_science_kws(tool_input) 132 | 133 | @staticmethod 134 | def get_formatted_science_kws(tool_input: str, top_n=5) -> str: 135 | """Search for a keyword in GCMD""" 136 | return [ 137 | GCMDKeywordSearchTool().cmr_science_keyword(kw, keyword_pos) 138 | for keyword_pos, kw in enumerate( 139 | gpt_embedder.find_nearest_kw(tool_input, top_n=top_n) 140 | ) 141 | ] 142 | 143 | @staticmethod 144 | def get_science_kws(tool_input: str, top_n=5) -> str: 145 | """Search for a keyword in GCMD""" 146 | return [kw for kw in gpt_embedder.find_nearest_kw(tool_input, top_n=top_n)] 147 | 148 | async def _arun(self, tool_input: str) -> str: 149 | """Search for a keyword in GCMD""" 150 | return [ 151 | self.cmr_science_keyword(kw, keyword_pos) 152 | for keyword_pos, kw in enumerate(gpt_embedder.find_nearest_kw(tool_input)) 153 | ][0] 154 | 155 | @staticmethod 156 | def cmr_science_keyword(keyword_string, keyword_pos): 157 | level_list = [ 158 | "category", 159 | "topic", 160 | "term", 161 | "variable-level-1", 162 | "variable-level-2", 163 | "variable-level-3", 164 | "detailed-variable", 165 | ] 166 | keyword_list = [key.strip() for key in keyword_string.split(">")] 167 | 168 | return "&".join( 169 | [ 170 | rf"science_keywords[{keyword_pos}][{level_list[i]}]={keyword_list[i].replace(' ', '%20')}" 171 | for i in range(len(keyword_list)) 172 | ] 173 | ) 174 | 175 | 176 | def num_tokens_from_string(string: str, model_name: str) -> int: 177 | """Returns the number of tokens in a text string.""" 178 | encoding = tiktoken.encoding_for_model(model_name) 179 | num_tokens = len(encoding.encode(string)) 180 | return num_tokens 181 | 182 | 183 | if __name__ == "__main__": 184 | text_src = "/Users/mramasub/work/BERT-E/cmr_records-chunks-300-10w_pdfminer.json" 185 | # read 
json 186 | import json 187 | 188 | with open(text_src, "r") as f: 189 | chunks = json.load(f) 190 | token_sizes = [] 191 | for i, chunk in enumerate(chunks): 192 | token_sizes.append(num_tokens_from_string(chunk["text"], "gpt-3.5-turbo")) 193 | if i > 1000: 194 | print(f"Average number of tokens per chunk: {sum(token_sizes)/i}") 195 | break 196 | -------------------------------------------------------------------------------- /code/prompts.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from langchain.prompts import BaseChatPromptTemplate 3 | from langchain.prompts.prompt import PromptTemplate 4 | 5 | # Set up the base template 6 | 7 | single_context_qa_template = """ 8 | You are given a query and a context paragraph. 9 | Answer the question. answers must be inferred based on the context paragraph only. Do not use any other information. 10 | you must also provide the evidence sentence from the context paragraph that is used for inference. 11 | Here is the context paragraph: 12 | {context} 13 | Provide Answer and Reference in following format 14 | Answer: 15 | Evidence: 16 | 17 | (if there is no related information in the context paragraph that answers the question, you must output "N/A" in the Answer field and "N/A" in the Evidence field) 18 | Here is the query: {query} 19 | """ 20 | 21 | multiple_context_qa_template = """ 22 | You are a truthful question answering bot. You are given a query and some context paragraphs. 23 | Answer the question. answers must be grounded only on the context paragraphs. Do not use any other information. 24 | you must also provide the evidence verbatim as it appears in the context paragraph. Do not include any other characters. 
25 | Here are the context paragraphs: 26 | {contexts} 27 | if there is no related information in the context paragraph that answers the question, you must output "N/A" in the Answer field and "N/A" in the Evidence field 28 | Provide Answer and Evidence in following format 29 | Answer: 30 | Evidence: 31 | 32 | Here is the query: {query} 33 | """ 34 | 35 | 36 | multi_evidence_template = """ 37 | You are a evidence selection bot. You are given a list of evidences, You must choose the most accurate evidence that answers the question. 38 | Here is the list of evidences: 39 | {evidences} 40 | Here is the question: {query} 41 | Most Accurate Evidence: 42 | """ 43 | 44 | summarization_template = """ 45 | You are a truthful question answer system. You are given a question and you must answer it based on the context paragraphs provided. 46 | You must also summarize the context paragraphs into a single paragraph. 47 | you must also provide a reference to the source of the answer, which is the exact sencence from the context paragraph that answers the question. 48 | If the question cannot be answered by the context provided, you must output "Cannot answer the question with the context paragraphs" in the Answer field. 49 | 50 | Here are the context paragraphs: 51 | {contexts} 52 | 53 | Here is the question: {query} 54 | Helpful Summary, followed by Answer, finally Reference: 55 | """ 56 | cmr_summarization_template = """ 57 | You are a Summarization system. You are given a query and some structured metadata related to query. 58 | You must summarize the structured metadata into a single paragraph, as it relates to the query. 59 | 60 | Here are the CMR query results: 61 | {cmr_responses} 62 | 63 | Here is the initial question: {query} 64 | Helpful Summary: 65 | """ 66 | datetime_location_template = """ 67 | You will be given a text. extract the time string and location string. 
68 | the output SHOULD be in this format '(timestring, location string)' 69 | 70 | if either of the strings are empty, then return `None` in their place. 71 | 72 | text: {text} 73 | output: 74 | """ 75 | 76 | datetime_template = """ 77 | convert time string: {datetime} into start and end datetime formatted as: 'temporal[]=yyyy-MM-ddTHH:mm:ssZ,yyyy-MM-ddTHH:mm:ssZ' 78 | """ 79 | 80 | cmr_template = """ 81 | Decode the following query as best you can. 82 | The query will involve extracting datetime, bounding box and GKR science keyword. 83 | You have access to the following tools: 84 | 85 | {tools} 86 | 87 | Use the following format: 88 | 89 | Query: the input question you must answer 90 | Thought: you should always think about what to do 91 | Action: the action to take, should be one of [{tool_names}] 92 | Action Input: the input to the action 93 | Observation: the result of the action 94 | ... (this Thought/Action/Action Input/Observation can repeat N times) 95 | Thought: I now have all the information for CMR query 96 | Action: cmr_query_api 97 | Action Input: the input to cmr_query_api (This is your last action, do not add any more actions, DO NOT include Base URL) 98 | Observation: the result of the action 99 | Final Answer: aggregate and Summarize the observation 100 | 101 | Begin Loop: 102 | 103 | Query: {input} 104 | {agent_scratchpad}""" # noqa: E501 105 | 106 | 107 | bbox_time_kw_template = """\ 108 | You run in a loop of Thought, Action, PAUSE, Observation. 109 | At the end of the loop you output an Answer 110 | Use Thought to describe your thoughts about the question you have been asked. 111 | Use Action to run one of the actions available to you - then return PAUSE. 112 | Observation will be the result of running those actions. 113 | 114 | The questions will involve getting datasets from a database. 115 | 116 | To resolve the question, you have the following tools available that you can use. 
117 | 118 | Your available actions are: 119 | 120 | calculate: 121 | e.g. calculate: 4 * 7 / 3 122 | Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary 123 | 124 | geocode: 125 | e.g. geocode: Paris 126 | Returns a bounding box for the location 127 | 128 | gcmd_keyword: 129 | e.g. gcmd_keyword: aerosol 130 | Returns a list of GCMD keywords that match the input 131 | 132 | cmr_search: 133 | e.g. bbox: [-73.21, 43.99, -73.12, 44.05] && datetime=['2019-01-01T00:00:00Z', '2019-01-02T00:00:00Z'] && keyword: aerosol 134 | Will search the database for that bbox, datetime and the science or data keyword present in the Question and return a JSON representation of the item assets returned from the CMR API. DON'T REPLACE the words bbox, datetime or keyword 135 | 136 | Please remember that these are your only three available actions. Do not attempt to put any word after "Action: " other than calculate, geocode or cmr_search. THIS IS A HARD RULE. DO NOT BREAK IT AT ANY COST. DO NOT MAKE UP YOUR OWN ACTIONS. 137 | 138 | Example session: 139 | 140 | Question: Can you point me to aerosol data for 2019 January, for the capital of France? 141 | 142 | Thought: I should deduce that capital of France is Paris, and find its bounding box extent. 143 | Action: geocode: Paris 144 | PAUSE 145 | 146 | You will be called again with this: 147 | 148 | Observation: Its bbox is [27, 54, 63, 32.5] 149 | 150 | 151 | You then output: 152 | 153 | Thought: I should now query the CMR catalog to fetch data about satellite images of Paris. 154 | 155 | Action: cmr_search: bbox=[27, 54, 63, 32.5] && datetime=['2019-01-01T00:00:00Z', '2019-01-02T00:00:00Z'] && keyword: aerosol 156 | PAUSE 157 | 158 | You will be called again with the output from the CMR API as JSON. Use that to give the user concise and verbose information on the returned data. Stop generating after this. 
159 | 160 | """ # noqa: C0103 161 | bbox_time_kw_template = bbox_time_kw_template.strip() 162 | -------------------------------------------------------------------------------- /code/texts.txt: -------------------------------------------------------------------------------- 1 |
K. F. Huemmrich
2 | 3 | 6 | 9 | 12 | 15 | 18 | 21 | 24 | 27 |
28 |
import os


def path_exists(path):
    """Check whether *path* exists, printing an error message if it does not.

    Args:
        path: Filesystem path to check.

    Returns:
        bool: True if the path exists, False otherwise.
    """
    path_exist = os.path.exists(path)
    # Bug fix: the original tested `not path_exists` — the function
    # object itself, which is always truthy — so the warning below
    # could never be printed. Test the boolean result instead.
    if not path_exist:
        print(f"Path does not exist: {path}")
    return path_exist
Pre- and post-fire phenologically paired Landsat 8 images were used to model the five discrete severity classes using midpoints as breaks.","The ABoVE: Burn Severity of Soil Organic Matter, Northwest Territories, Canada, 2014-2015 dataset provides maps of landscape burn severity from the 2014-2015 fires in the Northwest Territories and Northern Alberta, Canada. These data may be useful in fire disaster and recovery research.","Northwest Territories and Northern Alberta, Canada",GeoTIFF,30 meters,30 meters,,2014-05-01 to 2015-10-01,,https://dx.doi.org/10.3334/ORNLDAAC/1694,ABoVE - Arctic-Boreal Vulnerability Experiment,Pre-classified burn severity for ease of use,Lacks recent data,,Path C,Disasters,Disasters,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2143402644-ORNL_CLOUD&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 3 | "Daily and Annual PM2.5 Concentrations for the Contiguous United States, 1-km Grids, v1 (2000 – 2016)",Health & Air Quality,"The Daily and Annual PM2.5 Concentrations for the Contiguous United States, 1-km Grids, v1 (2000 - 2016) data set includes predictions of PM2.5 concentrations in grid cells at a resolution of 1 km for the years 2000 to 2016. A generalized additive model was used that accounted for geographic difference to ensemble daily predictions of three machine learning models: neural network, random forest, and gradient boosting. The three machine learners incorporated multiple predictors, including satellite data, meteorological variables, land-use variables, elevation, chemical transport model predictions, several reanalysis data sets, as well as other predictors. The annual predictions were calculated by averaging the daily predictions for each year in each grid cell. 
The ensembled model demonstrated better predictive performance than the individual machine learners with 10-fold cross-validated R-squared values of 0.86 for daily predictions and 0.89 for annual predictions.",The Daily and Annual PM2.5 Concentrations for the Contiguous United States dataset provides predictions of PM2.5 concentrations for the years 2000 to 2016. These data are useful in air quality research.,United States,"raster, tabular, vector",0.00833 degrees,1 kilometer,Daily; Annual,2000-01-01 to 2016-12-31,,https://doi.org/10.7927/0rvr-4538,AQDH - Air Quality Data for Health-Related Applications,16-year temporal extent,,SEDAC Map widget,Path B,Health & Air Quality,Health & Air Quality,"https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2091764506-SEDAC&query=%7B%22name%22:%22query-smd-primary%22,%22scope%22:%22All%22,%22text%22:%22Daily%20and%20Annual%20PM2.5%20Concentrations%20for%20the%20Contiguous%20United%20States,%201-km%20Grids,%20v1%20(2000%E2%80%8A%E2%80%93%E2%80%8A2016)%22%7D" 4 | ECCO Global Mean Sea Level - Daily Mean (Version 4 Release 4),Climate Change,"This dataset provides daily-averaged global mean sea level from the ECCO Version 4 Release 4 (V4r4) ocean and sea-ice state estimate. Estimating the Circulation and Climate of the Ocean (ECCO) ocean and sea-ice state estimates are dynamically and kinematically-consistent reconstructions of the three-dimensional time-evolving ocean, sea-ice, and surface atmospheric states. ECCO V4r4 is a free-running solution of the 1-degree global configuration of the MIT general circulation model (MITgcm) that has been fit to observations in a least-squares sense.",The ECCO (Estimating the Circulation and Climate of the Ocean) Global Mean Sea Level - Daily Mean dataset provides daily-averaged global mean sea level from the ECCO (Estimating the Circulation and Climate of the Ocean) Version 4 Release 4 (V4r4) ocean and sea-ice state estimate. 
The dataset reconstructs average sea level for urban flooding and coastal vulnerability research.,Global,netCDF-4,,,,1992-01-01 to 2018-01-01,,https://dx.doi.org/10.5067/ECTSD-MSL44,ECCO - Estimating the Circulation and Climate of the Ocean,20-year temporal extent,,,Path C,Climate Change,Climate Change,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC1991543819-POCLOUD&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 5 | ECOSTRESS Land Surface Temperature and Emissivity Daily L2 Global 70m V001,Extreme Heat,The ECOsystem Spaceborne Thermal Radiometer Experiment on Space Station (ECOSTRESS) mission measures the temperature of plants to better understand how much water plants need and how they respond to stress. The ECO2LSTE Version 1 data product provides atmospherically corrected land surface temperature and emissivity (LST&E) values derived from five thermal infrared (TIR) bands. The ECO2LSTE data product was derived using a physics-based Temperature and Emissivity Separation (TES) algorithm. The ECO2LSTE is provided as swath data and has a spatial resolution of 70 meters (m).,The ECOSTRESS Land Surface Temperature dataset provides land surface temperature data and urban heat analysis. 
These data may aid mitigation strategies for urban heat and heat stress.,Global,HDF5,70 m,70 meters,Varies,2018-07-09 to Present,,https://dx.doi.org/10.5067/ECOSTRESS/ECO2LSTE.001,ECOSTRESS - ECOsystem Spaceborne Thermal Radiometer Experiment on Space Station,Recent data available,Latency unclear,,Path B,Extreme Heat,Extreme Heat,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC1534729776-LPDAAC_ECS&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 6 | "Food Insecurity Hotspots Data Set, v1 (2009 – 2019)","Food Availability,Human Dimensions","The Food Insecurity Hotspots Data Set consists of grids at 250 meter (~7.2 arc-seconds) resolution that identify the level of intensity and frequency of food insecurity over the 10 years between 2009 and 2019, as well as hotspot areas that have experienced consecutive food insecurity events. The gridded data are based on subnational food security analysis provided by FEWS NET (Famine Early Warning Systems Network) in five (5) regions, including Central America and the Caribbean, Central Asia, East Africa, Southern Africa, and West Africa. 
Based on the Integrated Food Security Phase Classification (IPC), food insecurity is defined as Minimal, Stressed, Crisis, Emergency, and Famine.",The Food Insecurity Hotspots Dataset shows the level of intensity and frequency of food insecurity as well as hotspot areas that have experienced consecutive food insecurity events.,"Central America, Caribbean Islands, Central Asia, East Africa, Southern Africa, West Africa.","raster, vector, map",250 m,250 m,,2009-01-01 to 2019-12-31,,https://doi.org/10.7927/cx02-2587,FOOD - Food Security Data Collection,10-year temporal extent,Limited geographic coverage,https://sedac.ciesin.columbia.edu/data/set/food-food-insecurity-hotspots/maps,Path B,Human Dimensions,Human Dimensions,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC1947985418-SEDAC&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 7 | "Global Annual PM2.5 Grids from MODIS, MISR and SeaWiFS Aerosol Optical Depth (AOD) with GWR, v1 (1998 – 2016)",Health & Air Quality,"The Global Annual PM2.5 Grids from MODIS, MISR and SeaWiFS Aerosol Optical Depth (AOD) with GWR, 1998-2016 consist of annual concentrations (micrograms per cubic meter) of ground-level fine particulate matter (PM2.5), with dust and sea-salt removed. This data set combines AOD retrievals from multiple satellite instruments including the NASA Moderate Resolution Imaging Spectroradiometer (MODIS), Multi-angle Imaging SpectroRadiometer (MISR), and the Sea-Viewing Wide Field-of-View Sensor (SeaWiFS). The GEOS-Chem chemical transport model is used to relate this total column measure of aerosol to near-surface PM2.5 concentration. Geographically Weighted Regression (GWR) is used with global ground-based measurements to predict and adjust for the residual PM2.5 bias per grid cell in the initial satellite-derived values. 
These estimates are primarily intended to aid in large-scale studies","The Global Annual PM2.5 Grids from MODIS, MISR and SeaWiFS Aerosol Optical Depth (AOD) with GWR, 1998-2016 consist of annual concentrations (micrograms per cubic meter) of ground-level fine particulate matter (PM2.5), with dust and sea-salt removed. This data set combines AOD retrievals from multiple satellite instruments including the NASA Moderate Resolution Imaging Spectroradiometer (MODIS), Multi-angle Imaging SpectroRadiometer (MISR), and the Sea-Viewing Wide Field-of-View Sensor (SeaWiFS).",Global,GeoTIFF,0.01 degrees,1 kilometers,Annual,1998-01-01 to 2016-12-31,,https://doi.org/10.7927/H4ZK5DQS,SDEI - Satellite-Derived Environmental Indicators,,,Worldview,Path C,Health & Air Quality,Health & Air Quality, 8 | Global Flood Hazard Frequency and Distribution,"Disasters,Urban Flooding","The Global Flood Hazard Frequency and Distribution is a 2.5 minute grid derived from a global listing of extreme flood events between 1985 and 2003 (poor or missing data in the early/mid 1990s) compiled by Dartmouth Flood Observatory and georeferenced to the nearest degree. The resultant flood frequency grid was then classified into 10 classes of approximately equal number of grid cells. The greater the grid cell value in the final data set, the higher the relative frequency of flood occurrence. This data set is the result of collaboration among the Columbia University Center for Hazards and Risk Research (CHRR) and Columbia University Center for International Earth Science Information Network (CIESIN).","The Global Flood Hazard Frequency and Distribution compiles data from extreme flood events between 1985 and 2003. Each grid is assigned a value based on the frequency of floods in that area. 
The higher the number, the more frequent flood events are in that area.",Global,"raster, map",,,,1985-01-01 to 2003-12-31,,https://doi.org/10.7927/H4668B3D,NDH - Natural Disaster Hotspots,18-year temporal extent,Lacks recent data,https://sedac.ciesin.columbia.edu/data/set/ndh-flood-hazard-frequency-distribution/maps,Path B,Disasters,Urban Flooding,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC179001777-SEDAC&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 9 | HLS Landsat Operational Land Imager Surface Reflectance and TOA Brightness Daily Global 30m v2.0,Extreme Heat,The Harmonized Landsat and Sentinel-2 (HLS) project provides consistent surface reflectance (SR) and top of atmosphere (TOA) brightness data from the Operational Land Imager (OLI) aboard the joint NASA/USGS Landsat 8 satellite and the Multi-Spectral Instrument (MSI) aboard Europe’s Copernicus Sentinel-2A and Sentinel-2B satellites. 
The combined measurement enables global observations of the land every 2–3 days at 30-meter (m) spatial resolution.,The HLS Landsat Operational Land Imager Surface Reflectance and TOA Brightness Daily Global 30m v2.0 dataset provides surface reflectance (SR) and top of atmosphere (TOA) brightness data from Landsat 8 satellite data.,Global,COG,30m x 30m,30 meters,Daily - < Weekly,2013-04-11 to Present,,https://dx.doi.org/10.5067/HLS/HLSL30.002,Harmonized Landsat Sentinel-2 (HLS) project,Cloud-optimized,Latency unclear,,Path C,Extreme Heat,Extreme Heat,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2021957657-LPCLOUD&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 10 | "JPL GRACE and GRACE-FO Mascon Ocean, Ice, and Hydrology Equivalent Water Height Coastal Resolution Improvement (CRI) Filtered Release 06.1 Version 03","Climate Change,Water Availability","This dataset contains gridded monthly global water storage/height anomalies relative to a time-mean, derived from GRACE and GRACE-FO and processed at JPL using the Mascon approach (RL06.1Mv03). This version of the data employs a Coastal Resolution Improvement (CRI) filter that reduces signal leakage errors across coastlines. These data are provided in a single data file in netCDF format, and can be used for analysis for ocean, ice, and hydrology phenomena. The water storage/height anomalies are given in equivalent water thickness units (cm).",This dataset includes monthly global water storage/height to be used in studying groundwater availability. 
These data may aid in freshwater availability and access research.,Global,netCDF-4,,,Monthly - < Annual,2002-04-04 to Present,,https://dx.doi.org/10.5067/TEMSC-3JC63,"GRACE - Gravity Recovery and Climate Experiment, GRACE-FO - Gravity Recovery and Climate Experiment Follow-On",Cloud-optimized,Only netCDF available,,Path C,Climate Change,Water Availability,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2536962485-POCLOUD&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 11 | MODIS/Aqua Thermal Anomalies/Fire 5-Min L2 Swath 1km V061,Health & Air Quality,"The Aqua Moderate Resolution Imaging Spectroradiometer (MODIS) Thermal Anomalies and Fire MYD14 Version 6.1 product is produced daily in 5-minute temporal satellite increments (swaths) at a 1 kilometer (km) spatial resolution. The MYD14 product is used to generate all of the higher level fire products, but can also be used to identify fires and other thermal anomalies, such as volcanoes. Each swath of data is approximately 2,030 kilometers along track (long), and 2,300 kilometers across track (wide).",The MODIS Fires and Thermal Anomalies (Day/Night) dataset provides fire anomalies for both land and water. 
These data may aid in air quality and disaster research.,Global,HDF-EOS,1 km,1 kilometer,Hourly - < Daily,2002-07-04 to Present,,https://dx.doi.org/10.5067/MODIS/MYD14.061,"Aqua - Earth Observing System (EOS), Aqua",Daily temporal resolution,Only HDF-EOS5 available,Worldview,Path B,Health & Air Quality,Health & Air Quality,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2278858993-LPCLOUD&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 12 | NASADEM (NASA Digital Elevation Model) Topography Data,Urban Flooding,"NASADEM data products were derived from original telemetry data from the Shuttle Radar Topography Mission (SRTM), a collaboration between NASA and the National Geospatial-Intelligence Agency (NGA), as well as participation from the German and Italian space agencies. SRTM’s primary focus was to generate a near-global DEM of the Earth using radar interferometry. It was a primary component of the payload on space shuttle Endeavour during its STS-99 mission, which was launched on February 11, 2000, and flew for 11 days. In addition to Terra Advanced Spaceborne Thermal and Reflection Radiometer (ASTER) Global Digital Elevation Model (GDEM) Version 2 data, NASADEM also relied on Ice, Cloud, and Land Elevation Satellite (ICESat) Geoscience Laser Altimeter System (GLAS) ground control points of its lidar shots to improve surface elevation measurements that led to improved geolocation accuracy. Other reprocessing improvements include the conversion to geoid reference and the use of GDEMs and Advanced Land Observing Satellite Panchromatic Remote-sensing instrument for Stereo Mapping (PRISM) AW3D30 DEM, and interpolation for void filling. NASADEM are distributed in 1 degree latitude by 1 degree longitude tiles and consist of all land between 60° N and 56° S latitude. 
This accounts for about 80% of Earth’s total landmass.","NASA Digital Elevation Model (NASADEM) Topography Data provide a digital elevation model of all land between 60 degrees north and 56 degrees south, nearly 80% of Earth's landmass. Topography data can be useful in urban flooding and hurricane research.",Global,netCDF-4,30 meters,30 meters,Multi-Day,2000-02-11 to 2000-02-21,,https://doi.org/10.5067/MEaSUREs/NASADEM/NASADEM_NC.001,MEaSUREs ,High resolution and large geographic coverage,Lacks recent data,,Path C,Urban Flooding,Urban Flooding, 13 | OMPS/NPP PCA SO2 Total Column 1-Orbit L2 Swath 50x50km V2,Health & Air Quality,"The OMPS_NPP_NMSO2_PCA_L2 product is part of the MEaSUREs (Making Earth Science Data Records for Use in Research Environments) suite of products. It is retrieved from the NASA/NOAA Suomi National Polar-orbiting Partnership (SNPP) Ozone Mapping and Profiler Suite (OMPS) Nadir Mapper (NM) spectrometer and provides contiguous daily global monitoring of anthropogenic and volcanic sulfur dioxide (SO2), an important pollutant and aerosol precursor that affects both air quality and the climate. The product is based on the NASA Goddard Space Flight Center principal component analysis (PCA) spectral fitting algorithm (Li et al., 2013, 2017), and continues (Zhang et al., 2017) NASA's Earth Observing System (EOS) standard Aura/Ozone Monitoring Instrument SO2 product (OMSO2).","The OMPS_NPP_NMSO2_PCA_L2 product is part of the MEaSUREs (Making Earth Science Data Records for Use in Research Environments) suite of products. It is retrieved from the NASA/NOAA Suomi National Polar-orbiting Partnership (SNPP) Ozone Mapping and Profiler Suite (OMPS) Nadir Mapper (NM) spectrometer and provides contiguous daily global monitoring of anthropogenic and volcanic sulfur dioxide (SO2), an important pollutant and aerosol precursor that affects both air quality and the climate. 
",Global,HDF5,50 km x 50 km,50 kilometers,,2012-01-26 ongoing,,https://dx.doi.org/10.5067/MEASURES/SO2/DATA205,MEaSUREs - Making Earth System Data Records for Use in Research Environments,,,Worldview,Path B,Health & Air Quality,Health & Air Quality,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC1917909223-GES_DISC&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 14 | "U.S. Census Grids (Summary File 1), 2010","Food Availability,Human Dimensions","The U.S. Census Grids (Summary File 1), 2010 data set contains grids of demographic and socioeconomic data from the year 2010 in ASCII and GeoTIFF formats. The grids have a resolution of 30 arc-seconds (0.0083 decimal degrees), or approximately 1 square km. The gridded variables are based on census block geography from Census 2010 TIGER/Line Files and census variables (population, households, and housing variables).","The US Census Grids 2010 provide gridded demographic data, including age, race, ethnicity, and housing for the US and Puerto Rico.","United States, Puerto Rico","ASCII, GeoTIFF, raster, map",0.00833 degrees,1 kilometer,,2010-01-01 to 2010-12-31,,https://doi.org/10.7927/H40Z716C,USCG - U.S. 
Census Grids,Several socioeconomic factors available within dataset,Data only available for the year 2010,SEDAC map widget,Path B,Food Availability,Human Dimensions,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC1426173834-SEDAC&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D 15 | VIIRS/SNPP Deep Blue Aerosol L2 6-Min Swath 6 km (v2.0),Health & Air Quality,"The Suomi National Polar-orbiting Partnership (SNPP) Visible Infrared Imaging Radiometer Suite (VIIRS) NASA standard Level-2 (L2) deep blue aerosol product provides satellite-derived measurements of Aerosol Optical Thickness (AOT) and their properties over land and ocean, every 6 minutes, globally. The Deep Blue algorithm draws its heritage from previous applications to retrieve AOT from Sea‐viewing Wide Field‐of‐view Sensor (SeaWiFS) and Moderate Resolution Imaging Spectroradiometer (MODIS) measurements over land. This L2 description pertains to the SNPP VIIRS Deep Blue Aerosol version-1.1 (V1.1) product, whose record starts from March 1st 2012.",The VIIRS (Suomi NPP) Deep Blue Aerosol Optical Thickness (Land and Ocean) dataset provides Aerosol Optical Thickness (AOT) used in air quality monitoring and research.,Global,,6 km,6 kilometers,Daily - < Weekly,2023-06-01 ongoing,NRT,https://doi.org/10.5067/VIIRS/AERDB_L2_VIIRS_SNPP_NRT.002,Deep Blue aerosol project,"NRT, Daily temporal resolution",Only netcdf4 available,Worldview,Path B,Health & Air Quality,Health & Air Quality,https://sciencediscoveryengine.nasa.gov/app/nasa-sba-smd/#/preview?id=%2FSDE%2FCMR_API%2F%7CC2706359459-ASIPS&query=%7B%22name%22%3A%22query-smd-primary%22%2C%22scope%22%3A%22All%22%2C%22text%22%3A%22%22%7D -------------------------------------------------------------------------------- /data/ej_dataset.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/data/ej_dataset.xlsx -------------------------------------------------------------------------------- /data/human_evaluation_done.csv: -------------------------------------------------------------------------------- 1 | Indicators,Indicators_before,Indicators_after,H_Indicator_before,H_Indicator_after,Geographic Coverage,Geographic_Coverage_before,Geographic_Coverage_after,H_Geographic_Coverage_before,H_Geographic_Coverage_after,Format,Format_before,Format_after,H_Format_before,H_Format_after,Spatial Resolution,Spatial Resolution (Standard),Spatial_Resolution_before,Spatial_Resolution_after,H_Spatial_Resolution_before,H_Spatial_Resolution_after,Temporal Resolution,Temporal_Resolution_before,Temporal_Resolution_after,H_Temporal_Resolution_before,H_Temporal_Resolution_after,Temporal Extent,Temporal_Extent_before,Temporal_Extent_after,H_Temporal_Extent_before,H_Temporal_Extent_after,urls 2 | Extreme Heat,Disasters,Climate Change,0,0,Global,global,Global,1,1,COG,Geotiff,COG,0,1,30m x 30m,30 meters,30m,30 meters,1,1,Daily - < Weekly,Daily,Daily - < Weekly,0,1,2013-04-11 to Present,present,2013-03-11 to Present,0,1,https://dx.doi.org/10.5067/HLS/HLSL30.002 3 | Health & Air Quality,Extreme Heat,Health & Air Quality,0,1,Global,global,Global,1,1,GeoTIFF,raster,GeoTIFF,0,1,0.01 degrees,1 kilometers,varies,111 kilometers,0,0,Annual,yearly,Annual,1,1,1998-01-01 to 2016-12-31,1998-2016,1998-01-01 to 2016-12-31,1,1,https://doi.org/10.7927/H4ZK5DQS 4 | "Food Availability,Human Dimensions",Human Dimensions,Human Dimensions,1,1,"United States, Puerto Rico","United States, Puerto Rico","United States, Puerto Rico",1,1,"ASCII, GeoTIFF, raster, map","ASCII, GeoTIFF","ASCII, GeoTIFF",1,1,0.00833 degrees,1 kilometer,30 decimal degrees,1 kilometer,0,1,,N/A,N/A,1,1,2010-01-01 to 2010-12-31,2010,2010-01-01 to 2010-12-31,0,1,https://doi.org/10.7927/H40Z716C 5 | Urban 
Flooding,Disasters,Climate Change,0,0,Global,global,Global,1,1,netCDF-4,N/A,HGT,0,0,30 meters,30 meters,1 arc second,30 meters,0,1,Multi-Day,N/A,N/A,0,0,2000-02-11 to 2000-02-21,2000-02-11 to present,1/1/08,0,0,https://doi.org/10.5067/MEaSUREs/NASADEM/NASADEM_NC.001 6 | Health & Air Quality,Disasters,Disasters,0,0,Global,global,Global,1,1,HDF-EOS,N/A,HDF-EOS2,0,1,1 km,1 kilometer,1 km,1 kilometer,1,1,Hourly - < Daily,Daily,Daily - < Weekly,1,1,2002-07-04 to Present,2012 to present,2002-07-04 to Present,0,1,https://dx.doi.org/10.5067/MODIS/MYD14.061 7 | Health & Air Quality,Disasters,Human Dimensions,0,0,Global,global,Global,1,1,HDF5,N/A,XLS,0,0,50 km x 50 km,50 kilometers,varies,Country,0,0,,N/A,Varies,1,0,2012-01-26 ongoing,varies,2018-01-01 to 2020-12-31,0,0,https://dx.doi.org/10.5067/MEASURES/SO2/DATA205 8 | "Food Availability,Human Dimensions",Food Availability,Food Availability,1,1,"Central America, Caribbean Islands, Central Asia, East Africa, Southern Africa, West Africa.",global,Global,1,1,"raster, vector, map",gridded data,GeoTIFF,0,0,250 m,250 m,250 meter,1 kilometer,1,0,,yearly,Annual,0,0,2009-01-01 to 2019-12-31,2009-2019,2009-01-01 to 2019-12-31,1,1,https://doi.org/10.7927/cx02-2587 9 | Disasters,Disasters,Climate Change,1,0,"Northwest Territories and Northern Alberta, Canada","Northwest Territories, Canada","Northwest Territories, Canada",1,1,GeoTIFF,maps,GeoTIFF,0,1,30 meters,30 meters,30 meters,30 meters,1,1,,N/A,Not Applicable,1,1,2014-05-01 to 2015-10-01,varies,2014-01-01 to 2016-12-31,0,0,https://dx.doi.org/10.3334/ORNLDAAC/1694 10 | "Climate Change,Water Availability",Water Availability,Climate Change,1,1,Global,global,Global,1,1,netCDF-4,netcdf,NetCDF-4,0,1,,,decimal degrees x decimal degrees,220 kilometers,0,0,Monthly - < Annual,monthly,Monthly - < Annual,1,1,2002-04-04 to Present,present,2002-04-16 to 2017-06-19,0,0,https://dx.doi.org/10.5067/TEMSC-3JC63 11 | Climate Change,Water Availability,Climate 
Change,0,1,Global,global,Global,1,1,netCDF-4,website,netCDF-4,0,1,,,N/A,50 kilometers,1,0,,daily,Daily - < Weekly,0,0,1992-01-01 to 2018-01-01,varies,1992-01-01 to 2018-01-01,0,1,https://dx.doi.org/10.5067/ECTSD-MSL44 12 | "Disasters,Urban Flooding",Disasters,Disasters,1,1,Global,global,Global,1,1,"raster, map",minute grid,"ASCII, PDF, PNG",0,0,,,N/A,1 kilometer,1,0,,N/A,N/A,1,1,1985-01-01 to 2003-12-31,1985-2003,1985-01-01 to 2003-12-31,1,1,https://doi.org/10.7927/H4668B3D 13 | Extreme Heat,Climate Change,Climate Change,0,0,Global,global,Global,1,1,HDF5,hdf5,HDF5,1,1,70 m,70 meters,70 meters,70 meters,1,1,Varies,varies,Daily - < Weekly,1,0,2018-07-09 to Present,present,2018-09-02 to Present,0,1,https://dx.doi.org/10.5067/ECOSTRESS/ECO2LSTE.001 14 | Health & Air Quality,Health & Air Quality,Health & Air Quality,1,1,United States,contiguous united states,United States,0,1,"raster, tabular, vector",grid,NetCDF-4,0,0,0.00833 degrees,1 kilometer,1 km,1 kilometer,1,1,Daily; Annual,daily,Daily - Annual,0,1,2000-01-01 to 2016-12-31,2000-2016,2000-01-01 to 2016-12-31,1,1,https://doi.org/10.7927/0rvr-4538 15 | Health & Air Quality,Health & Air Quality,Health & Air Quality,1,1,Global,global,Global,1,1,,N/A,HDF5,1,0,6 km,6 kilometers,6 km,6 kilometers,1,1,Daily - < Weekly,N/A,Daily - < Weekly,0,1,2023-06-01 ongoing,2013 to present,2012-03-13 to Present,0,0,https://doi.org/10.5067/VIIRS/AERDB_L2_VIIRS_SNPP_NRT.002 -------------------------------------------------------------------------------- /data/images/llm-use-in-smd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/data/images/llm-use-in-smd.png -------------------------------------------------------------------------------- /data/osdr-eval.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/data/osdr-eval.zip -------------------------------------------------------------------------------- /data/policies.json: -------------------------------------------------------------------------------- 1 | { 2 | "SPD41a": [ 3 | "SPD-41a Policy Requirement", 4 | "Data shall be made publicly available, free in open, machine-readable formats [III.C.ii\u2013iv]", 5 | "SMD-funded data shall be reusable with a clear, open, and accessible data license [III.C.vii]. (If there are no other restrictions, SMD scientific data should be released with a Creative Commons Zero license.)", 6 | "Publicly available SMD-funded data collections shall be citable using a persistent identifier [III.C.viii]", 7 | "SMD-funded data shall include robust, standards-compliant metadata that clearly and explicitly describe the data [III.C.vi]", 8 | "SMD-funded data shall be findable, such that the data can be retrieved, downloaded, indexed, and searched [III.C.v]", 9 | "SMD-funded data collections shall be indexed as part of the NASA catalog of data [III.C.ix]", 10 | "SMD-funded data should follow the FAIR Guiding Principles [III.C.i]", 11 | "The SMD repository provides documentation on policies for data retention [Appendix D.11.]", 12 | "If there are no other restrictions, publicly available SMD-funded software should be released under a permissive license that has broad acceptance in the community [III.D.iii]", 13 | "Publicly available SMD-funded software shall be citable using a persistent identifier [III.D.vii]" 14 | ], 15 | "FAIR": [ 16 | "FAIR Guiding Principle", 17 | "F1 - Persistent identifier (PID)", 18 | "F2 - Rich metadata", 19 | "F3 - Linked PID", 20 | "F4 - Searchable", 21 | "", 22 | "A1 - Retrievable", 23 | "A1.1 - Protocol", 24 | "A1.2 - Procedure (authentication and authorization)", 25 | "A2 - Permanent metadata record", 26 | "", 27 | "I1 - Language (knowledge 
representation)", 28 | "I2 - Vocabulary", 29 | "I3 - Reference", 30 | "", 31 | "R1 - Attributes", 32 | "R1.1 - License", 33 | "R1.2 - Provenance", 34 | "R1.3 - Community standards" 35 | ] 36 | } -------------------------------------------------------------------------------- /docs-presentations/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/docs-presentations/.keep -------------------------------------------------------------------------------- /docs-presentations/README.md: -------------------------------------------------------------------------------- 1 | # Literature review 2 | File to track literature reviews. You can choose to keep the md format or use Latex (overleaf). 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs-presentations/papers/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/docs-presentations/papers/.keep -------------------------------------------------------------------------------- /images/applications.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/applications.png -------------------------------------------------------------------------------- /images/astro_langchain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/astro_langchain.png -------------------------------------------------------------------------------- /images/cmr_langchain.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/cmr_langchain.png -------------------------------------------------------------------------------- /images/create-jupyterlab-env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/create-jupyterlab-env.png -------------------------------------------------------------------------------- /images/credential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/credential.png -------------------------------------------------------------------------------- /images/credentials-show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/credentials-show.png -------------------------------------------------------------------------------- /images/git-clone-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/git-clone-1.png -------------------------------------------------------------------------------- /images/jupyterlab-spaces.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/jupyterlab-spaces.png -------------------------------------------------------------------------------- 
/images/llm-use-in-smd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/llm-use-in-smd.png -------------------------------------------------------------------------------- /images/loggedin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/loggedin.png -------------------------------------------------------------------------------- /images/login-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/login-1.png -------------------------------------------------------------------------------- /images/login-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/login-2.png -------------------------------------------------------------------------------- /images/sagemaker-studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/sagemaker-studio.png -------------------------------------------------------------------------------- /images/smd-hls-cloned-content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/smd-hls-cloned-content.png -------------------------------------------------------------------------------- 
/images/smd-hls-git-clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/smd-hls-git-clone.png -------------------------------------------------------------------------------- /images/smd-llm-cloned-content.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/smd-llm-cloned-content.png -------------------------------------------------------------------------------- /images/smd-llm-git-clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/smd-llm-git-clone.png -------------------------------------------------------------------------------- /images/update-instance-type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/update-instance-type.png -------------------------------------------------------------------------------- /images/updated-instance-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NASA-IMPACT/LLM-cookbook-for-open-science/9aad93935304949f12d4745652e04b169c88830d/images/updated-instance-config.png -------------------------------------------------------------------------------- /notebooks/Chat-with-mDGF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "7Ceib8WYZHGF" 7 | }, 8 | "source": [ 9 | "## Chat with mDGF document \n", 10 
| "\n", 11 | "This notebook loads the Modern Data Governance Framework (MDGF) document and uses generative models to answer user questions about the document. The notebook also allows creation of governance document based on user input." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "id": "lCeyGy6NZHGG" 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%%capture\n", 23 | "# update or install the necessary libraries\n", 24 | "!pip install --upgrade langchain-openai\n", 25 | "!pip install --upgrade langchain\n", 26 | "!pip install --upgrade python-dotenv" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "2ARDA5TyZHGG", 37 | "outputId": "b496f8d0-fa8e-4338-f025-835fbd40f2d3" 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | ".env file loaded correctly: True\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import os\n", 50 | "from dotenv import load_dotenv\n", 51 | "print(f\".env file loaded correctly: {load_dotenv()}\")\n", 52 | "\n", 53 | "from langchain.chat_models import AzureChatOpenAI\n", 54 | "# from langchain_openai import ChatOpenAI ## use this if you are using the openai chat model\n", 55 | "from langchain.chains import ConversationChain\n", 56 | "from langchain.chains.conversation.memory import ConversationBufferMemory\n", 57 | "from langchain.callbacks import get_openai_callback\n", 58 | "from langchain_openai import ChatOpenAI\n", 59 | "from langchain.chains import ConversationChain\n", 60 | "from langchain.chains.conversation.memory import ConversationBufferMemory\n", 61 | "from langchain.callbacks import get_openai_callback\n", 62 | "from langchain.prompts.chat import (\n", 63 | " ChatPromptTemplate,\n", 64 | " HumanMessagePromptTemplate,\n", 65 | " MessagesPlaceholder\n", 66 | ")\n", 67 | "from langchain.schema import (\n", 
68 | "    SystemMessage,\n", 69 | ")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Load The MDGF Document" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# read the document\n", 86 | "with open(\"../data/mdgf_document.txt\", \"r\") as file:\n", 87 | "    mdgf_document = file.read()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# Combining Prompt Patterns\n", 95 | "- the Persona prompt pattern\n", 96 | "    - Asking the model to assume persona of expert in scientific data governance\n", 97 | "- Recipe prompt \n", 98 | "    - Providing the model steps to follow to generate the governance document\n", 99 | "- Output Automator prompt \n", 100 | "    - Asking the model to generate a script to automate and provide output in specific format: in this case - the headings required to generate the governance document" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": { 107 | "colab": { 108 | "base_uri": "https://localhost:8080/" 109 | }, 110 | "id": "hH2VqbsCZHGH", 111 | "outputId": "aa72381e-7540-481f-d64e-2ce2eb21c088" 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "/Users/mramasub/work/workshop-usecases-llm/.venv/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.chat_models.azure_openai.AzureChatOpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. 
To use it run `pip install -U langchain-openai` and import as `from langchain_openai import AzureChatOpenAI`.\n", 119 | " warn_deprecated(\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "import pprint\n", 125 | "import json\n", 126 | "\n", 127 | "MDGF_PROMPT = f\"\"\"\n", 128 | "You are an expert in scientific data governance and management and you will assist the users by answering questions and creating documents. Use only the content in the Modern Data Governance Framework (MDGF) reference text after the delimiter for your answers. If a questions falls outside the reference text, then respond, “This is out of scope for me to answer”\n", 129 | "\n", 130 | "Your responsibilities are two::\n", 131 | "\n", 132 | "First - Answering Questions:\n", 133 | "You will be asked questions. Answer the question only using the reference text provided.\n", 134 | "Apart from Answering the question, Cite the passages from the document used to answer the question, prefixing it with citation.\n", 135 | "For Any Requirement, you should also provide the corresponding procedure.\n", 136 | "If you cannot find an answer in the reference text, then respond, “I could not find the answer”\n", 137 | "\n", 138 | "Second - Creating Documents:\n", 139 | "\n", 140 | "When asked by a user to create either a requirements document or a procedure plan based on the reference text. Assist the user by asking a series of questions to capture their project needs.\n", 141 | "\n", 142 | "Step 1: Identify the entity in the user’s project. Respond with: “Sure, I will be happy to help. First tell me the core entity or asset in that you will be managing\n", 143 | "\n", 144 | "Data \n", 145 | "Metadata\n", 146 | "Digital content \n", 147 | "Code\n", 148 | "Software”\n", 149 | "\n", 150 | "Step 2: Identify governance activity in the user’s project. 
Respond with: “Tell me about the governance activity need in your project\n", 151 | "\n", 152 | "Planning and Design\n", 153 | "Monitoring\n", 154 | "Generation/Curation\n", 155 | "Sharing\n", 156 | "Use/Reuse\n", 157 | "Preservation”\n", 158 | "\n", 159 | "Step 3: Identify the user's need for the Type of document. Respond with: “Are you seeking Requirements or Procedures for your project?\n", 160 | "\n", 161 | "Requirements\n", 162 | "Procedures”\n", 163 | "\n", 164 | "Finally, Respond with:\n", 165 | "\"Here are the headings for the Requirements document:\n", 166 | "A.1.1.1, A.1.2.1, ...\" \n", 167 | "You should provide only the headings (A.1.1.1, A.1.2.1, ...) provided in the DGF documents. You should never provide any additional information. Do NOT use placeholder text or ... or anything similar in the response.\n", 168 | "\n", 169 | "\n", 170 | "Here is the reference DGF document:\n", 171 | "{mdgf_document} \n", 172 | "\"\"\"\n", 173 | "\n", 174 | "# llm = ChatOpenAI(\n", 175 | "# temperature=0.5,\n", 176 | "# \topenai_api_key=os.environ[\"OPENAI_API_KEY\"],\n", 177 | "# \tmodel_name=\"gpt-4-turbo-preview\"\n", 178 | "# )\n", 179 | "llm = AzureChatOpenAI(\n", 180 | " temperature=0.5,\n", 181 | "\tmodel_name=\"gpt-4-turbo-preview\"\n", 182 | ")\n", 183 | "prompt = ChatPromptTemplate.from_messages(\n", 184 | " [\n", 185 | " SystemMessage(\n", 186 | " content=MDGF_PROMPT,\n", 187 | " ), # The persistent system prompt\n", 188 | " MessagesPlaceholder(\n", 189 | " variable_name=\"history\"\n", 190 | " ), # Where the memory will be stored.\n", 191 | " HumanMessagePromptTemplate.from_template(\n", 192 | " \"{input}\"\n", 193 | " ), # Where the human input will injected\n", 194 | " ]\n", 195 | ")\n", 196 | "\n", 197 | "def ask(chain, query, track_token=True):\n", 198 | " with get_openai_callback() as cb:\n", 199 | " result = chain.invoke(input=query)\n", 200 | " if track_token:\n", 201 | " print(f'Total tokens: {cb.total_tokens}')\n", 202 | " print(f'Requests: 
{cb.successful_requests}')\n", 203 | " print(result['response'])\n", 204 | " return result['response']\n", 205 | "\n", 206 | "conversation = ConversationChain(\n", 207 | " prompt=prompt,\n", 208 | " llm=llm,\n", 209 | " verbose=False,\n", 210 | " memory=ConversationBufferMemory(ai_prefix=\"AI Assistant\", memory_key=\"history\", return_messages=True),\n", 211 | ")" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Total tokens: 13772\n", 224 | "Requests: 1\n", 225 | "For data file naming conventions, you should adhere to community best practices on data file naming conventions.\n", 226 | "\n", 227 | "Citation:\n", 228 | "- \"A1.1.5 Adhere to community best practice(s) on data file naming conventions\"\n", 229 | "\n", 230 | "Procedure:\n", 231 | "- \"B1.1.5 Define and document file naming conventions using following guidelines: GHRC File Naming convention [DS]\"\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "_ = ask(conversation, \"what data file naming conventions should I use?\")" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Total tokens: 13823\n", 249 | "Requests: 1\n", 250 | "Sure, I will be happy to help. 
First tell me the core entity or asset in that you will be managing\n", 251 | "\n", 252 | "Data \n", 253 | "Metadata\n", 254 | "Digital content \n", 255 | "Code\n", 256 | "Software\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "_ = ask(conversation, \"Can you create a requirements document for me?\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 7, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "Total tokens: 13863\n", 274 | "Requests: 1\n", 275 | "Tell me about the governance activity need in your project\n", 276 | "\n", 277 | "Planning and Design\n", 278 | "Monitoring\n", 279 | "Generation/Curation\n", 280 | "Sharing\n", 281 | "Use/Reuse\n", 282 | "Preservation\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "_ = ask(conversation, \"Data, Metadata\")\n" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 8, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "Total tokens: 13888\n", 300 | "Requests: 1\n", 301 | "Are you seeking Requirements or Procedures for your project?\n", 302 | "\n", 303 | "Requirements\n", 304 | "Procedures\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "_ = ask(conversation, \"Planning and Design\")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 9, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "Total tokens: 14059\n", 322 | "Requests: 1\n", 323 | "Here are the headings for the Requirements document:\n", 324 | "A1.1.1\n", 325 | "A1.1.2\n", 326 | "A1.1.3\n", 327 | "A1.1.4\n", 328 | "A1.1.5\n", 329 | "A1.1.6\n", 330 | "A1.1.7\n", 331 | "A1.1.8\n", 332 | "A1.1.9\n", 333 | "A1.1.10\n", 334 | "A1.1.11\n", 335 | "A1.1.12\n", 336 | "A1.1.13\n", 337 | "A1.1.14\n", 338 | "A1.1.15\n", 339 | "A1.1.16\n", 340 | "A2.1.1\n", 
341 | "A2.1.2\n", 342 | "A2.1.3\n", 343 | "A2.1.4\n", 344 | "A2.1.5\n", 345 | "A2.1.6\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "model_response = ask(conversation, \"Requirements\")" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "# Generate MDGF document based on model output\n", 358 | "- Using regex and string matching to generate the governance document based on the model output" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "All matches: ['A1.1.1', 'A1.1.2', 'A1.1.3', 'A1.1.4', 'A1.1.5', 'A1.1.6', 'A1.1.7', 'A1.1.8', 'A1.1.9', 'A1.1.10', 'A1.1.11', 'A1.1.12', 'A1.1.13', 'A1.1.14', 'A1.1.15', 'A1.1.16', 'A2.1.1', 'A2.1.2', 'A2.1.3', 'A2.1.4', 'A2.1.5', 'A2.1.6']\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "import re\n", 376 | "\n", 377 | "text = model_response\n", 378 | "\n", 379 | "pattern = r'[A-Z]\\d+\\.\\d+\\.\\d+[a-z]?'\n", 380 | "headers = re.findall(pattern, text)\n", 381 | "print('All matches:', headers)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 11, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "{\n", 394 | " \"data\": {\n", 395 | " \"plan_design\": [\n", 396 | " [\n", 397 | " \"A1.1.1 Define a data flow diagram with the purpose of identifying data sources and touchpoints for the project and for communicating to data users how data was handled.\"\n", 398 | " ],\n", 399 | " [\n", 400 | " \"A1.1.2 Develop touchpoint agreements identified in the data flow diagram\"\n", 401 | " ],\n", 402 | " [\n", 403 | " \"A1.1.3 Adhere to community accepted standard machine readable data file formats\"\n", 404 | " ],\n", 405 | " [\n", 406 | " \"A1.1.4 Identify and document all data product characteristics\"\n", 407 | " ],\n", 
408 | " [\n", 409 | " \"A1.1.5 Adhere to community best practice(s) on data file naming conventions\"\n", 410 | " ],\n", 411 | " [\n", 412 | " \"A1.1.6 Adhere to community standard variable names, types, and unit(s), keywords\"\n", 413 | " ],\n", 414 | " [\n", 415 | " \"A1.1.7 Adhere to community standards for coordinate systems\"\n", 416 | " ],\n", 417 | " [\n", 418 | " \"A1.1.8 Adhere to community standards for map projections\"\n", 419 | " ],\n", 420 | " [\n", 421 | " \"A1.1.9 Adhere to community standards for date and time formats\"\n", 422 | " ],\n", 423 | " [\n", 424 | " \"A1.1.10 Define a data product versioning scheme\"\n", 425 | " ],\n", 426 | " [\n", 427 | " \"A1.1.11 Define a science quality evaluation plan for data products\"\n", 428 | " ],\n", 429 | " [\n", 430 | " \"A1.1.12 Develop a data retention plan including a process for when and how data will be sunset\"\n", 431 | " ],\n", 432 | " [\n", 433 | " \"A1.1.14 Identify the most appropriate data license for the data product\"\n", 434 | " ],\n", 435 | " [\n", 436 | " \"A1.1.15 Determine content and format for the dataset landing page\"\n", 437 | " ],\n", 438 | " [\n", 439 | " \"A1.1.16 Determine whether API-based data access is needed & if so, identify an API standard\"\n", 440 | " ]\n", 441 | " ]\n", 442 | " },\n", 443 | " \"metadata\": {\n", 444 | " \"plan_design\": [\n", 445 | " [\n", 446 | " \"A2.1.1 Adhere to a standard metadata schema for data product (collection) and file (granule) level metadata\"\n", 447 | " ],\n", 448 | " [\n", 449 | " \"A2.1.2 Support mandatory metadata elements in the selected schema\"\n", 450 | " ],\n", 451 | " [\n", 452 | " \"A2.1.3 Utilize a standard data product naming convention\"\n", 453 | " ],\n", 454 | " [\n", 455 | " \"A2.1.4 Identify any needed additional metadata fields for specific projects\"\n", 456 | " ],\n", 457 | " [\n", 458 | " \"A2.1.5 Incorporate any access control fields into the metadata, as required\"\n", 459 | " ],\n", 460 | " [\n", 461 | " \"A2.1.6 
def subset_data(headers, data):
    """Subset the nested DGF dictionary, keeping only entries whose heading
    code appears in ``headers``.

    Parameters
    ----------
    headers : iterable of str
        Heading codes to keep, e.g. ``["A1.1.1", "A2.1.6"]`` (as extracted
        from the model response by the regex cell above).
    data : dict
        Nested DGF structure ``{entity: {activity: [[entry, ...], ...]}}``
        where each entry string begins with its heading code, e.g.
        ``"A1.1.5 Adhere to community best practice(s) ..."``.
        Top-level values that are not dicts are skipped, as before.

    Returns
    -------
    dict
        Same nesting as ``data`` containing only the matching entries;
        levels with no matches are omitted entirely (so no headers matching
        yields ``{}``).
    """
    # Exact-match on the entry's leading whitespace-delimited token (its
    # heading code), using a set for O(1) lookups. The previous check,
    # `header in entry[:10]`, was a substring test: a requested code such
    # as "A1.1.1" also (wrongly) kept entries "A1.1.10" through "A1.1.16".
    wanted = set(headers)
    subsetted_data = {}
    for top_key, top_value in data.items():
        if not isinstance(top_value, dict):
            continue
        subsetted_section = {}
        for second_key, entries in top_value.items():
            subsetted_entries = []
            for entry_list in entries:
                # Keep an entry only when its heading code is requested;
                # `split(None, 1)` is empty for blank entries, which are
                # therefore safely dropped.
                entry_item = [
                    entry
                    for entry in entry_list
                    if (parts := entry.split(None, 1)) and parts[0] in wanted
                ]
                if entry_item:
                    subsetted_entries.append(entry_item)
            if subsetted_entries:
                subsetted_section[second_key] = subsetted_entries
        if subsetted_section:
            subsetted_data[top_key] = subsetted_section
    return subsetted_data
} 532 | } 533 | }, 534 | "nbformat": 4, 535 | "nbformat_minor": 0 536 | } 537 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | marvin 2 | pandas 3 | scikit-learn --------------------------------------------------------------------------------