├── docs ├── handson │ ├── .gitignore │ ├── 01 │ │ ├── .gitignore │ │ ├── 01-example.tsv │ │ ├── 01-example.csv │ │ ├── 01-venues.csv │ │ ├── 01-publications.csv │ │ └── 01-publications-venues.json │ ├── 03 │ │ └── .gitignore │ ├── 02 │ │ ├── .gitignore │ │ ├── uml.png │ │ ├── uml2.png │ │ ├── classuse.py │ │ ├── myclasses.py │ │ └── 02-Implementation_of_data_models_via_Python_classes.ipynb │ ├── 04 │ │ ├── .gitignore │ │ ├── tables.png │ │ └── 04-Configuring_and_populating_a_relational_database.ipynb │ ├── 05 │ │ ├── .gitignore │ │ ├── rdfgraph.png │ │ ├── blazegraph.png │ │ └── 05-Configuring_and_populating_a_graph_database.ipynb │ └── 06 │ │ └── 06-Interacting_with_databases_using_Pandas.ipynb ├── .gitignore ├── lecture │ ├── .gitignore │ ├── 00 │ │ ├── 00.pdf │ │ └── 00.pptx │ ├── 01 │ │ ├── 01.pdf │ │ └── 01.pptx │ ├── 02 │ │ ├── 02.pdf │ │ └── 02.pptx │ ├── 03 │ │ ├── 03.pdf │ │ └── 03.pptx │ ├── 04 │ │ ├── 04.pdf │ │ └── 04.pptx │ ├── 05 │ │ ├── 05.pdf │ │ └── 05.pptx │ ├── 06 │ │ ├── 06.pdf │ │ └── 06.pptx │ ├── 07 │ │ ├── 07.pdf │ │ └── 07.pptx │ └── rdfdata.png └── project │ ├── .gitignore │ ├── uml.png │ ├── uml2.png │ ├── datamodel.png │ ├── workflow.png │ └── README.md ├── .gitignore └── README.md /docs/handson/.gitignore: -------------------------------------------------------------------------------- 1 | .*/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | private/ 2 | .DS_Store -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .*/ -------------------------------------------------------------------------------- /docs/lecture/.gitignore: -------------------------------------------------------------------------------- 1 | .*/ 2 | notes.* -------------------------------------------------------------------------------- /docs/handson/01/.gitignore: -------------------------------------------------------------------------------- 1 | *-modified* 2 | -------------------------------------------------------------------------------- /docs/handson/03/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | 3 | -------------------------------------------------------------------------------- /docs/project/.gitignore: -------------------------------------------------------------------------------- 1 | *.graphml 2 | private/ -------------------------------------------------------------------------------- /docs/handson/02/.gitignore: -------------------------------------------------------------------------------- 1 | __py* 2 | *.graphml 3 | -------------------------------------------------------------------------------- /docs/handson/04/.gitignore: -------------------------------------------------------------------------------- 1 | __py* 2 | *.graphml 3 | *.db 4 | -------------------------------------------------------------------------------- /docs/handson/05/.gitignore: -------------------------------------------------------------------------------- 1 | *.jnl 2 | *.jar 3 | *.graphml 4 | rules.log -------------------------------------------------------------------------------- /docs/project/uml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/project/uml.png 
-------------------------------------------------------------------------------- /docs/project/uml2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/project/uml2.png -------------------------------------------------------------------------------- /docs/handson/02/uml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/handson/02/uml.png -------------------------------------------------------------------------------- /docs/handson/02/uml2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/handson/02/uml2.png -------------------------------------------------------------------------------- /docs/lecture/00/00.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/00/00.pdf -------------------------------------------------------------------------------- /docs/lecture/00/00.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/00/00.pptx -------------------------------------------------------------------------------- /docs/lecture/01/01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/01/01.pdf -------------------------------------------------------------------------------- /docs/lecture/01/01.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/01/01.pptx -------------------------------------------------------------------------------- /docs/lecture/02/02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/02/02.pdf -------------------------------------------------------------------------------- /docs/lecture/02/02.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/02/02.pptx -------------------------------------------------------------------------------- /docs/lecture/03/03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/03/03.pdf -------------------------------------------------------------------------------- /docs/lecture/03/03.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/03/03.pptx -------------------------------------------------------------------------------- /docs/lecture/04/04.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/04/04.pdf -------------------------------------------------------------------------------- /docs/lecture/04/04.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/04/04.pptx 
-------------------------------------------------------------------------------- /docs/lecture/05/05.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/05/05.pdf -------------------------------------------------------------------------------- /docs/lecture/05/05.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/05/05.pptx -------------------------------------------------------------------------------- /docs/lecture/06/06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/06/06.pdf -------------------------------------------------------------------------------- /docs/lecture/06/06.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/06/06.pptx -------------------------------------------------------------------------------- /docs/lecture/07/07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/07/07.pdf -------------------------------------------------------------------------------- /docs/lecture/07/07.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/07/07.pptx -------------------------------------------------------------------------------- /docs/lecture/rdfdata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/lecture/rdfdata.png -------------------------------------------------------------------------------- /docs/handson/04/tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/handson/04/tables.png -------------------------------------------------------------------------------- /docs/project/datamodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/project/datamodel.png -------------------------------------------------------------------------------- /docs/project/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/project/workflow.png -------------------------------------------------------------------------------- /docs/handson/05/rdfgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/handson/05/rdfgraph.png -------------------------------------------------------------------------------- /docs/handson/05/blazegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comp-data/2021-2022/HEAD/docs/handson/05/blazegraph.png -------------------------------------------------------------------------------- /docs/handson/01/01-example.tsv: -------------------------------------------------------------------------------- 1 | column name another name, with a comma 2 | a value a value, with a 
comma 3 | a quoted "value" a quoted "value", with a comma -------------------------------------------------------------------------------- /docs/handson/01/01-example.csv: -------------------------------------------------------------------------------- 1 | column name,"another name, with a comma" 2 | a value,"a value, with a comma" 3 | a quoted "value","a quoted ""value"", with a comma" -------------------------------------------------------------------------------- /docs/handson/01/01-venues.csv: -------------------------------------------------------------------------------- 1 | id,name,type 2 | 1531-6912,Comparative and Functional Genomics,journal 3 | 1367-5931,Current Opinion in Chemical Biology,journal 4 | 9780470291092,Proceedings of the 5th Annual Conference on Composites and Advanced Ceramic Materials: Ceramic Engineering and Science Proceedings,book 5 | 1027-3662,Journal of Theoretical Medicine,journal -------------------------------------------------------------------------------- /docs/handson/01/01-publications.csv: -------------------------------------------------------------------------------- 1 | doi,title,publication year,publication venue,type,issue,volume 2 | 10.1002/cfg.304,Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information,2003,1531-6912,journal article,4,4 3 | 10.1016/s1367-5931(02)00332-0,In vitro selection as a powerful tool for the applied evolution of proteins and peptides,2002,1367-5931,journal article,3,6 4 | 10.1002/9780470291092.ch20,Mechanisms of Toughening in Ceramic Matrix Composites,1981,9780470291092,book chapter,, -------------------------------------------------------------------------------- /docs/handson/02/classuse.py: -------------------------------------------------------------------------------- 1 | from myclasses import JournalArticle, Journal 2 | 3 | journal_1 = Journal(["1531-6912"], "Comparative and Functional Genomics") 4 | 5 | journal_article_1 = JournalArticle("10.1002/cfg.304", 6 | 2003, 7 | "Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information", 8 | journal_1, 9 | "4", 10 | "4") 11 | 12 | print("-- Journal article metadata") 13 | print(" | title:", journal_article_1.getTitle()) 14 | print(" | venue name:", journal_article_1.getPublicationVenue().getName()) 15 | print(" | issue:", journal_article_1.getIssue()) 16 | print(" | volume:", journal_article_1.getVolume()) 17 | -------------------------------------------------------------------------------- /docs/handson/01/01-publications-venues.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "doi": "10.1002/cfg.304", 4 | "issue": "4", 5 | "publication venue": { 6 | "id": [ "1531-6912" ], 7 | "name": "Comparative and Functional Genomics", 8 | "type": "journal" 9 | }, 10 | "publication year": 2003, 11 | "title": "Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information", 12 | "type": "journal article", 13 | "volume": "4" 14 | }, 15 | { 16 | "doi": "10.1016/s1367-5931(02)00332-0", 17 | "issue": "3", 18 | "publication venue": { 19 | "id": [ "1367-5931" ], 20 | "name": "Current Opinion in Chemical Biology", 21 | "type": "journal" 22 | }, 23 | "publication year": 2002, 24 | "title": "In vitro selection as a powerful tool for the applied evolution of proteins and 
peptides", 25 | "type": "journal article", 26 | "volume": "6" 27 | }, 28 | { 29 | "doi": "10.1002/9780470291092.ch20", 30 | "publication venue": { 31 | "id": [ "9780470291092" ], 32 | "name": "Proceedings of the 5th Annual Conference on Composites and Advanced Ceramic Materials: Ceramic Engineering and Science Proceedings", 33 | "type": "book" 34 | }, 35 | "publication year": 1981, 36 | "title": "Mechanisms of Toughening in Ceramic Matrix Composites", 37 | "type": "book chapter" 38 | } 39 | ] -------------------------------------------------------------------------------- /docs/handson/02/myclasses.py: -------------------------------------------------------------------------------- 1 | class Publication(object): 2 | def __init__(self, doi, publicationYear, title, publicationVenue): 3 | self.doi = doi 4 | self.publicationYear = publicationYear 5 | self.title = title 6 | self.publicationVenue = publicationVenue 7 | 8 | def getDOI(self): 9 | return self.doi 10 | 11 | def getPublicationYear(self): 12 | return self.publicationYear 13 | 14 | def getTitle(self): 15 | return self.title 16 | 17 | def getPublicationVenue(self): 18 | return self.publicationVenue 19 | 20 | 21 | class Venue(object): 22 | def __init__(self, identifiers, name): 23 | self.id = set() 24 | for identifier in identifiers: 25 | self.id.add(identifier) 26 | 27 | self.name = name 28 | 29 | def getIds(self): 30 | result = [] 31 | for identifier in self.id: 32 | result.append(identifier) 33 | result.sort() 34 | return result 35 | 36 | def getName(self): 37 | return self.name 38 | 39 | def addId(self, identifier): 40 | result = True 41 | if identifier not in self.id: 42 | self.id.add(identifier) 43 | else: 44 | result = False 45 | return result 46 | 47 | def removeId(self, identifier): 48 | result = True 49 | if identifier in self.id: 50 | self.id.remove(identifier) 51 | else: 52 | result = False 53 | return result 54 | 55 | 56 | class JournalArticle(Publication): 57 | def __init__(self, doi, publicationYear, title, publicationVenue, issue, volume): 58 | self.issue = issue 59 | self.volume = volume 60 | 61 | # Here is where the constructor of the superclass is explicitly recalled, so as 62 | # to handle the input parameters as done in the superclass 63 | super().__init__(doi, publicationYear, title, publicationVenue) 64 | 65 | def getIssue(self): 66 | return self.issue 67 | 68 | def getVolume(self): 69 | return self.volume 70 | 71 | 72 | class BookChapter(Publication): 73 | pass 74 | 75 | 76 | class Journal(Venue): 77 | pass 78 | 79 | 80 | class Book(Venue): 81 | pass -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science 2 | 3 | This space contains all the material related to the [Data Science course](https://www.unibo.it/en/teaching/course-unit-catalogue/course-unit/2021/467046) of the [Digital Humanities and Digital Knowledge degree](https://corsi.unibo.it/2cycle/DigitalHumanitiesKnowledge) at the [University of Bologna](http://www.unibo.it/en). 4 | 5 | ## Academic year 2021/2022 6 | 7 | ### Table of content 8 | 9 | - [Data Science](#data-science) 10 | - [Academic year 2021/2022](#academic-year-20212022) 11 | - [Table of content](#table-of-content) 12 | - [Material](#material) 13 | - [Schedule](#schedule) 14 | - [Exam sessions](#exam-sessions) 15 | - [Links](#links) 16 | 17 | ### Material 18 | 19 | **Keys:** 20 | 21 | - _the_ = theoretical lecture 22 | - _hon_ = hands-on session 23 | 24 | 1. 
[31/01/22, *the*] Introduction to the course and final project specifications 25 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/00/00.pdf) 26 |
27 | 28 | 2. [02/02/22, *the*] What is a datum and how it can be represented computationally 29 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/01/01.pdf) 30 |
31 | 32 | 3. [04/02/22, *hon*] Data formats and methods for storing data in Python 33 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/01) 34 |
35 | 36 | 4. [07/02/22, *the*] Introduction to data modelling 37 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/02/02.pdf) 38 |
39 | 40 | 5. [09/02/22, *hon*] Implementation of data models via Python classes 41 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/02) 42 |
43 | 44 | 6. [11/02/22, *the*] Processing and querying the data 45 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/03/03.pdf) 46 |
47 | 48 | 7. [14/02/22, *hon*] Introduction to Pandas 49 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/03) 50 |
51 | 52 | 8. [16/02/22, *the*] Database Management Systems 53 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/04/04.pdf) 54 |
55 | 56 | 9. [18/02/22, *hon*] Configuring and populating a relational database 57 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/04) 58 |
59 | 60 | 10. [21/02/22, *the*] SQL, a query language for relational databases 61 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/05/05.pdf) 62 |
63 | 64 | 11. [23/02/22, *hon*] Configuring and populating a graph database 65 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/05) 66 |
67 | 68 | 12. [25/02/22, *the*] SPARQL, a query language for RDF databases 69 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/06/06.pdf) 70 |
71 | 72 | 13. [28/02/22, *hon*] Interacting with databases using Pandas 73 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/06) 74 |
75 | 76 | 14. [02/03/22, *the*] Describing and visualising data 77 | - slides: [PDF](https://comp-data.github.io/2021-2022/lecture/07/07.pdf) 78 |
79 | 80 | 15. [04/03/22, *hon*] Descriptive statistics and graphs about data using Pandas 81 | - material: [GitHub](https://github.com/comp-data/2021-2022/tree/main/docs/handson/07) 82 |
83 | 84 | ### Schedule 85 | 86 | <table> 87 | <tbody> 
<tr><td>31/01/2022</td><td>12:30-14:30</td><td>Introduction to the course and final project specifications</td></tr>
<tr><td>02/02/2022</td><td>12:30-14:30</td><td>What is a datum and how it can be represented computationally</td></tr>
<tr><td>04/02/2022</td><td>12:30-14:30</td><td>Data formats and methods for storing data in Python</td></tr>
<tr><td>07/02/2022</td><td>12:30-14:30</td><td>Introduction to data modelling</td></tr>
<tr><td>09/02/2022</td><td>12:30-14:30</td><td>Implementation of data models via Python classes</td></tr>
<tr><td>11/02/2022</td><td>12:30-14:30</td><td>Processing and querying the data</td></tr>
<tr><td>14/02/2022</td><td>12:30-14:30</td><td>Introduction to Pandas</td></tr>
<tr><td>16/02/2022</td><td>12:30-14:30</td><td>Database Management Systems</td></tr>
<tr><td>18/02/2022</td><td>12:30-14:30</td><td>Configuring and populating a relational database</td></tr>
<tr><td>21/02/2022</td><td>12:30-14:30</td><td>SQL, a query language for relational databases</td></tr>
<tr><td>23/02/2022</td><td>12:30-14:30</td><td>Configuring and populating a graph database</td></tr>
<tr><td>25/02/2022</td><td>12:30-14:30</td><td>SPARQL, a query language for RDF databases</td></tr>
<tr><td>28/02/2022</td><td>12:30-14:30</td><td>Interacting with databases using Pandas</td></tr>
<tr><td>02/03/2022</td><td>12:30-14:30</td><td>Describing and visualising data</td></tr>
<tr><td>04/03/2022</td><td>12:30-14:30</td><td>Descriptive statistics and graphs about data using Pandas</td></tr>
103 | </tbody></table> 104 | ### Exam sessions 105 | 106 | - 16 May 2022 107 | - 20 June 2022 108 | - 15 July 2022 109 | - 5 September 2022 110 | 111 | ### Links 112 | 113 | - [Project information](https://github.com/comp-data/2021-2022/tree/main/docs/project) 114 | -------------------------------------------------------------------------------- /docs/project/README.md: -------------------------------------------------------------------------------- 1 | # Data Science: project 2 | 3 | The goal of the project is to develop a software application that enables one to process data stored in different formats and to upload them into two distinct databases, 4 | in order to query these databases simultaneously according to predefined operations. The software must be accompanied by a document (i.e. a Jupyter notebook) describing the data to process (their main characteristics and possible issues) and how the software has been organised (the names of the files, where the various Python classes have been defined, etc.). 5 | 6 | ## Data 7 | 8 | Exemplar data for testing the project have been made available. In particular: 9 | 10 | * for creating the relational database, there are two files, a [CSV file](data/relational_publications.csv) containing data about publications and a [JSON file](data/relational_other_data.json) containing additional information including the authors of each publication, the identifiers of the venue of each publication, and the identifier and name of each publisher publishing the venues; 11 | 12 | * for creating the RDF triplestore, there are two files, a [CSV file](data/graph_publications.csv) containing data about publications and a [JSON file](data/graph_other_data.json) containing additional information including the authors of each publication, the identifiers of the venue of each publication, and the identifier and name of each publisher publishing the venues. 13 | 14 | ## Workflow 15 | 16 | ![Workflow of the project](workflow.png) 17 | 18 | ## Data model 19 | 20 | ![Data model](datamodel.png) 21 | 22 | ## UML of data model classes 23 | 24 | ![Data model classes](uml.png) 25 | 26 | All the methods of each class must return the appropriate values that have been specified for the object of that class when it was created. It is up to the implementer to decide how to enable someone to add this information to the object of each class, e.g. by defining a specific constructor. While one can add additional methods to each class if needed, it is crucial that the *get* methods introduced in the UML diagram are all defined. 27 | 28 | ## UML of additional classes 29 | 30 | ![Data model classes](uml2.png) 31 | 32 | All the attributes and methods of each class are defined as follows. The constructors of the classes introduced in the UML diagram do not take any parameter in input. While one can add additional methods to each class if needed, it is crucial that all the methods introduced in the UML diagram are defined. 33 | 34 | ### Class `RelationalProcessor` 35 | 36 | #### Attributes 37 | `dbPath`: the variable containing the path of the database, initially set to an empty string, which will be updated with the method `setDbPath`. 38 | 39 | #### Methods 40 | `getDbPath`: it returns the path of the database. 41 | 42 | `setDbPath`: it enables one to set a new path for the database to handle. 
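For illustration, a minimal sketch of how this class could be implemented is shown below. Only the attribute and method names come from the specification above; everything else (e.g. returning a boolean from `setDbPath` to signal success) is an assumption left to the implementer.

```
# A possible sketch of RelationalProcessor: the attribute and method
# names follow the specification above; the boolean returned by
# setDbPath is only an assumption, not a requirement
class RelationalProcessor(object):
    def __init__(self):
        # As specified, the path is initially set to an empty string
        self.dbPath = ""

    def getDbPath(self):
        return self.dbPath

    def setDbPath(self, path):
        self.dbPath = path
        return True  # assumption: confirm that the path has been set
```

The `TriplestoreProcessor` class described next can follow the same pattern, with `endpointUrl` and its getter/setter in place of `dbPath`.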
43 | 44 | ### Class `TriplestoreProcessor` 45 | 46 | #### Attributes 47 | `endpointUrl`: the variable containing the URL of the SPARQL endpoint of the triplestore, initially set to an empty string, which will be updated with the method `setEndpointUrl`. 48 | 49 | #### Methods 50 | `getEndpointUrl`: it returns the URL of the SPARQL endpoint of the triplestore. 51 | 52 | `setEndpointUrl`: it enables one to set a new URL for the SPARQL endpoint of the triplestore. 53 | 54 | ### Classes `RelationalDataProcessor` and `TriplestoreDataProcessor` 55 | 56 | #### Methods 57 | `uploadData`: it enables one to upload the collection of data specified in the input file path (either in CSV or JSON, according to the [formats specified above](#data)) into the database. 58 | 59 | ### Classes `RelationalQueryProcessor` and `TriplestoreQueryProcessor` 60 | 61 | #### Methods 62 | `getPublicationsPublishedInYear`: It returns a data frame with all the publications (i.e. the rows) that have been published in the input year (e.g. `2020`). 63 | 64 | `getPublicationsByAuthorId`: It returns a data frame with all the publications (i.e. the rows) that have been authored by the person having the identifier specified as input (e.g. `"0000-0001-9857-1511"`). 65 | 66 | `getMostCitedPublication`: It returns a data frame with all the publications (i.e. the rows) that have received the highest number of citations from other publications. 67 | 68 | `getMostCitedVenue`: It returns a data frame with all the venues (i.e. the rows) containing the publications that, overall, have received the highest number of citations from other publications. 69 | 70 | `getVenuesByPublisherId`: It returns a data frame with all the venues (i.e. the rows) that have been published by the organisation having the identifier specified as input (e.g. `"crossref:78"`). 71 | 72 | `getPublicationInVenue`: It returns a data frame with all the publications (i.e. the rows) that have been included in the venue having the identifier specified as input (e.g. `"issn:0944-1344"`). 73 | 74 | `getJournalArticlesInIssue`: It returns a data frame with all the journal articles (i.e. the rows) that have been included in the input issue (e.g. `"9"`) of the input volume (e.g. `"17"`) of the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 75 | 76 | `getJournalArticlesInVolume`: It returns a data frame with all the journal articles (i.e. the rows) that have been included, independently of the issue, in the input volume (e.g. `"17"`) of the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 77 | 78 | `getJournalArticlesInJournal`: It returns a data frame with all the journal articles (i.e. the rows) that have been included, independently of the issue and the volume, in the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 79 | 80 | `getProceedingsByEvent`: It returns a data frame with all the proceedings (i.e. the rows) that refer to events whose names match (in lowercase), even partially, the name specified as input (e.g. `"web"`). 81 | 82 | `getPublicationAuthors`: It returns a data frame with all the authors (i.e. the rows) of the publication with the identifier specified as input (e.g. `"doi:10.1080/21645515.2021.1910000"`). 83 | 84 | `getPublicationsByAuthorName`: It returns a data frame with all the publications (i.e. the rows) that have been authored by people whose names match (in lowercase), even partially, the name specified as input (e.g. `"doe"`). 
85 | 86 | `getDistinctPublisherOfPublications`: It returns a data frame with all the distinct publishers (i.e. the rows) that have published the venues of the publications whose identifiers are those specified as input (e.g. `[ "doi:10.1080/21645515.2021.1910000", "doi:10.3390/ijfs9030035" ]`). 87 | 88 | ### Class `GenericQueryProcessor` 89 | 90 | #### Attributes 91 | `queryProcessor`: the variable containing the list of `QueryProcessor` objects to involve when one of the *get* methods below is executed. In practice, every time a *get* method is executed, the method will call the related method on all the `QueryProcessor` objects included in the variable `queryProcessor`, before combining the results and returning the requested object. 92 | 93 | #### Methods 94 | `cleanQueryProcessors`: It cleans the list `queryProcessor`, removing all the `QueryProcessor` objects it includes. 95 | 96 | `addQueryProcessor`: It appends the input `QueryProcessor` object to the list `queryProcessor`. 97 | 98 | `getPublicationsPublishedInYear`: It returns a list of `Publication` objects referring to all the publications that have been published in the input year (e.g. `2020`). 101 | 102 | `getPublicationsByAuthorId`: It returns a list of `Publication` objects referring to all the publications that have been authored by the person having the identifier specified as input (e.g. `"0000-0001-9857-1511"`). 103 | 104 | `getMostCitedPublication`: It returns the `Publication` object that has received the highest number of citations from other publications. 105 | 106 | `getMostCitedVenue`: It returns the `Venue` object containing the publications that, overall, have received the highest number of citations from other publications. 107 | 108 | `getVenuesByPublisherId`: It returns a list of `Venue` objects referring to all the venues that have been published by the organisation having the identifier specified as input (e.g. `"crossref:78"`). 109 | 110 | `getPublicationInVenue`: It returns a list of `Publication` objects referring to all the publications that have been included in the venue having the identifier specified as input (e.g. `"issn:0944-1344"`). 111 | 112 | `getJournalArticlesInIssue`: It returns a list of `JournalArticle` objects referring to all the journal articles that have been included in the input issue (e.g. `"9"`) of the input volume (e.g. `"17"`) of the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 113 | 114 | `getJournalArticlesInVolume`: It returns a list of `JournalArticle` objects referring to all the journal articles that have been included, independently of the issue, in the input volume (e.g. `"17"`) of the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 115 | 116 | `getJournalArticlesInJournal`: It returns a list of `JournalArticle` objects referring to all the journal articles that have been included, independently of the issue and the volume, in the journal having the identifier specified as input (e.g. `"issn:2164-5515"`). 117 | 118 | `getProceedingsByEvent`: It returns a list of `Proceedings` objects referring to all the proceedings that refer to events whose names match (in lowercase), even partially, the name specified as input (e.g. `"web"`). 
119 | 120 | `getPublicationAuthors`: It returns a list of `Person` objects referring to all the authors of the publication with the identifier specified as input (e.g. `"doi:10.1080/21645515.2021.1910000"`). 121 | 122 | `getPublicationsByAuthorName`: It returns a list of `Publication` objects referring to all the publications that have been authored by people whose names match (in lowercase), even partially, the name specified as input (e.g. `"doe"`). 123 | 124 | `getDistinctPublisherOfPublications`: It returns a list of `Organization` objects referring to all the distinct publishers that have published the venues of the publications whose identifiers are those specified as input (e.g. `[ "doi:10.1080/21645515.2021.1910000", "doi:10.3390/ijfs9030035" ]`). 125 | 126 | ## Uses of the classes 127 | 128 | ``` 129 | # Supposing that all the classes developed for the project 130 | # are contained in the file 'impl.py', then: 131 | 132 | # 1) Importing all the classes for handling the relational database 133 | from impl import RelationalDataProcessor, RelationalQueryProcessor 134 | 135 | # 2) Importing all the classes for handling the RDF database 136 | from impl import TriplestoreDataProcessor, TriplestoreQueryProcessor 137 | 138 | # 3) Importing the class for dealing with generic queries 139 | from impl import GenericQueryProcessor 140 | 141 | # Once all the classes are imported, first create the relational 142 | # database using the related source data 143 | rel_path = "relational.db" 144 | rel_dp = RelationalDataProcessor() 145 | rel_dp.setDbPath(rel_path) 146 | rel_dp.uploadData("data/relational_publications.csv") 147 | rel_dp.uploadData("data/relational_other_data.json") 148 | 149 | # Then, create the RDF triplestore (remember first to run the 150 | # Blazegraph instance) using the related source data 151 | grp_endpoint = "http://127.0.0.1:9999/blazegraph/sparql" 152 | grp_dp = TriplestoreDataProcessor() 153 | grp_dp.setEndpointUrl(grp_endpoint) 154 | grp_dp.uploadData("data/graph_publications.csv") 155 | grp_dp.uploadData("data/graph_other_data.json") 156 | 157 | # Next, create the query processors for both 158 | # the databases, using the related classes 159 | rel_qp = RelationalQueryProcessor() 160 | rel_qp.setDbPath(rel_path) 161 | 162 | grp_qp = TriplestoreQueryProcessor() 163 | grp_qp.setEndpointUrl(grp_endpoint) 164 | 165 | # Finally, create a generic query processor for querying 166 | # the data 167 | generic = GenericQueryProcessor() 168 | generic.addQueryProcessor(rel_qp) 169 | generic.addQueryProcessor(grp_qp) 170 | 171 | result_q1 = generic.getPublicationsPublishedInYear(2020) 172 | result_q2 = generic.getPublicationsByAuthorId("0000-0001-9857-1511") 173 | # etc... 174 | ``` 175 | 176 | ## Submission of the project 177 | 178 | You have to provide all Python files implementing your project by sharing them in some way (e.g. via OneDrive). You have to send all the files **one week before** the exam session you want to take. 179 | -------------------------------------------------------------------------------- /docs/handson/05/05-Configuring_and_populating_a_graph_database.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "compatible-disabled", 6 | "metadata": {}, 7 | "source": [ 8 | "# Configuring and populating a graph database\n", 9 | "\n", 10 | "In this tutorial, we show how to use RDF and Blazegraph to create a graph database using Python." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "unavailable-torture", 16 | "metadata": {}, 17 | "source": [ 18 | "## What is RDF\n", 19 | "\n", 20 | "The [Resource Description Framework (RDF)](https://en.wikipedia.org/wiki/Resource_Description_Framework) is a high-level data model (sometimes it is improperly called a \"language\") based on *subject-predicate-object* triples called statements. For instance, a simple natural language sentence such as *Umberto Eco is author of The name of the rose* can be expressed through an RDF statement assigning to:\n", 21 | "\n", 22 | "* *Umberto Eco* the role of subject;\n", 23 | "* *is author of* the role of predicate;\n", 24 | "* *The name of the rose* the role of object.\n", 25 | "\n", 26 | "The main entities comprising RDF are listed as follows.\n", 27 | "\n", 28 | "\n", 29 | "### Resources\n", 30 | "\n", 31 | "A *resource* is an object we want to talk about, and it is identified by an [IRI](https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier). IRIs are the most generic class of Internet identifiers for resources, but often [HTTP URLs](https://en.wikipedia.org/wiki/URL) are used instead, which may be considered a subclass of IRIs (e.g. the [URL `http://www.wikidata.org/entity/Q12807`](http://www.wikidata.org/entity/Q12807) identifies Umberto Eco in [Wikidata](https://wikidata.org)).\n", 32 | "\n", 33 | "\n", 34 | "### Properties\n", 35 | "\n", 36 | "A *property* is a special type of resource since it is used to describe relations between resources, and it is identified by an IRI (e.g. the [URL `http://www.wikidata.org/prop/direct/P800`](http://www.wikidata.org/entity/P800) identifies the property *has notable work* - which mimics the *is author of* predicate of the statement above).\n", 37 | "\n", 38 | "\n", 39 | "### Statements\n", 40 | "\n", 41 | "*Statements* enable one to assert properties between resources. Each statement is a subject-predicate-object triple, where the subject is a resource, the predicate is a property, and the object is either a resource or a literal (i.e. a string). \n", 42 | "\n", 43 | "There are different notations that can be used to represent statements in RDF in plain text files. The simplest (and most verbose) one is called [N-Triples](https://en.wikipedia.org/wiki/N-Triples). It allows one to define statements according to the following syntax:\n", 44 | "\n", 45 | "```\n", 46 | "# 1) statement with a resource as an object\n", 47 | "<subject-IRI> <predicate-IRI> <object-IRI> .\n", 48 | "\n", 49 | "# 2) statement with a literal as an object\n", 50 | "<subject-IRI> <predicate-IRI> \"literal value\"^^<datatype-IRI> .\n", 51 | "```\n", 52 | "\n", 53 | "Type (1) statements must be used to state relationships between resources, while type (2) statements are generally used to associate attributes with a specific resource (the IRI defining the type of value is not specified for generic literals, i.e. strings). For instance, in Wikidata, the exemplar sentence above (*Umberto Eco is author of The name of the rose*) is defined by three distinct RDF statements:\n", 54 | "\n", 55 | "```\n", 56 | "<http://www.wikidata.org/entity/Q12807> <http://www.w3.org/2000/01/rdf-schema#label> \"Umberto Eco\" .\n", 57 | "\n", 58 | "<IRI-of-the-novel> <http://www.w3.org/2000/01/rdf-schema#label> \"The Name of the Rose\" .\n", 59 | "\n", 60 | "<http://www.wikidata.org/entity/Q12807> <http://www.wikidata.org/prop/direct/P800> <IRI-of-the-novel> .\n", 61 | "```\n", 62 | "\n", 63 | "Actually, the relation described by the natural language sentence is defined by the third RDF statement above. However, two additional statements have been added to associate the strings representing the names of the resources referring to *Umberto Eco* and *The name of the rose*. Be aware: literals (i.e. 
simple values) cannot be subjects in any statement.\n", 64 | "\n", 65 | "\n", 66 | "### A special property\n", 67 | "\n", 68 | "While all the properties you can use in your statements as predicates can be defined in several distinct vocabularies (the [Wikidata data model](https://www.wikidata.org/wiki/Wikidata:List_of_properties), [schema.org data model](https://schema.org/docs/datamodel.html), etc.), RDF defines a special property that is used to associate a resource with its intended type (e.g. another resource representing a class of resources). The IRI of this property is `http://www.w3.org/1999/02/22-rdf-syntax-ns#type`. For instance, we can use this property to assign the appropriate type of object to the two entities defined in the excerpt above, i.e. those referring to *Umberto Eco* and *The name of the rose*, as follows:\n", 69 | "\n", 70 | "```\n", 71 | "<http://www.wikidata.org/entity/Q12807> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Person> .\n", 72 | "\n", 73 | "<IRI-of-the-novel> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Book> .\n", 74 | "```\n", 75 | "\n", 76 | "In the example above, we reuse two existing classes of resources included in [schema.org](https://schema.org) for people and books. It is worth mentioning that an existing resource can be associated via `http://www.w3.org/1999/02/22-rdf-syntax-ns#type` with one or more types, if they apply.\n", 77 | "\n", 78 | "\n", 79 | "### RDF Graphs\n", 80 | "\n", 81 | "An *RDF Graph* is a set of RDF statements. For instance, a file that contains RDF statements represents an RDF graph, and the same IRI contained in different graphs actually refers to the same resource. \n", 82 | "\n", 83 | "We talk about graphs in this context because all the RDF statements, and the resources they include, actually define a directed graph structure, where the directed edges are labelled with the predicates of the statements and the subjects and objects are nodes linked through such edges. For instance, the diagram below represents all the RDF statements introduced above using a visual graph.\n", 84 | "\n", 85 | "![An image of the RDF graph presented in the RDF statements above](rdfgraph.png)\n", 86 | "\n", 87 | "\n", 88 | "### Triplestores\n", 89 | "\n", 90 | "A *triplestore* is a database built for storing and retrieving RDF statements, and it can contain one or more RDF graphs." 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "drawn-identifier", 96 | "metadata": {}, 97 | "source": [ 98 | "## Blazegraph, a database for RDF data\n", 99 | "\n", 100 | "[Blazegraph DB](https://blazegraph.com/) is an ultra-high-performance graph database supporting RDF/SPARQL APIs (thus, it is a triplestore). It supports up to 50 billion edges on a single machine. Its code is entirely [open source and available on GitHub](https://github.com/blazegraph/database). \n", 101 | "\n", 102 | "Running this database as a server application is very simple. One just has to [download the .jar application](https://github.com/blazegraph/database/releases/download/BLAZEGRAPH_2_1_6_RC/blazegraph.jar), put it in a directory, and [run it](https://github.com/blazegraph/database/wiki/Quick_Start) from a shell as follows:\n", 103 | "\n", 104 | "```\n", 105 | "java -server -Xmx1g -jar blazegraph.jar\n", 106 | "```\n", 107 | "\n", 108 | "You need at least Java 9 installed on your system. If you do not have it, you can easily download and install it from the [Java webpage](https://www.java.com/it/download/manual.jsp). 
As you can see from the output of the command above, the database will be exposed via HTTP at a specific IP address:\n", 109 | "\n", 110 | "![Screenshot of the execution of Blazegraph](blazegraph.png)\n", 111 | "\n", 112 | "However, from your local machine, you can always contact it at the following URL:\n", 113 | "\n", 114 | "```\n", 115 | "http://127.0.0.1:9999/blazegraph/\n", 116 | "```" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "affiliated-enough", 122 | "metadata": {}, 123 | "source": [ 124 | "## From a diagram to a graph\n", 125 | "\n", 126 | "As you can see, the UML diagram introduced in the previous lecture, which I recall below, is already organised as a (directed) graph. Thus, translating such a data model into an RDF graph database is rather straightforward.\n", 127 | "\n", 128 | "![UML diagram of a data model](../02/uml.png)\n", 129 | "\n", 130 | "The important decision, in this context, concerns the names (i.e. the URLs) of the classes and properties to use to represent the data compliant with the data model. In particular:\n", 131 | "\n", 132 | "* supposing that each resource will be assigned to at least one of the types defined in the data model, we need to identify the names of all the most concrete classes (e.g. `JournalArticle`, `BookChapter`, `Journal`, `Book`);\n", 133 | "\n", 134 | "* each attribute of each UML class will be represented by a distinct RDF property which will be involved in statements where the subjects are always resources of the class in consideration and the objects are simple literals (i.e. values). Of course, we have to identify the names of these properties (i.e. the URLs);\n", 135 | "\n", 136 | "* each relation starting from a UML class and ending in another UML class will be represented by a distinct RDF property which will be involved in statements where the subjects are always resources of the source class while the objects are resources of the target class. Of course, we have to identify the names of these properties (i.e. the URLs);\n", 137 | "\n", 138 | "* please bear in mind that all attributes and relations defined in a class are inherited by (i.e. can be used by) all its subclasses.\n", 139 | "\n", 140 | "You can choose to reuse existing classes and properties (e.g. as defined in [schema.org](https://schema.org)) or create your own. In the latter case, you have to remember to use a URL you are in control of (e.g. your website or GitHub repository). For instance, a possible pattern for defining your own name for the class `Book` could be `https://<your-URL>/Book` (e.g. `https://essepuntato.it/Book`). Of course, there are strategies and guidelines that should be used to appropriately implement data models in RDF-compliant languages. 
However, these are out of the scope of the present course (and will be clarified in other courses).\n", 141 | "\n", 142 | "The names of all the classes and properties I will use in the examples in this tutorial are as follows:\n", 143 | "\n", 144 | "* UML class `JournalArticle`: `https://schema.org/ScholarlyArticle`;\n", 145 | "* UML class `BookChapter`: `https://schema.org/Chapter`;\n", 146 | "* UML class `Journal`: `https://schema.org/Periodical`;\n", 147 | "* UML class `Book`: `https://schema.org/Book`;\n", 148 | "* UML attribute `doi` of class `Publication`: `https://schema.org/identifier`;\n", 149 | "* UML attribute `publicationYear` of class `Publication`: `https://schema.org/datePublished`;\n", 150 | "* UML attribute `title` of class `Publication`: `https://schema.org/name`;\n", 151 | "* UML attribute `issue` of class `JournalArticle`: `https://schema.org/issueNumber`;\n", 152 | "* UML attribute `volume` of class `JournalArticle`: `https://schema.org/volumeNumber`;\n", 153 | "* UML attribute `id` of class `Venue`: `https://schema.org/identifier`;\n", 154 | "* UML attribute `name` of class `Venue`: `https://schema.org/name`;\n", 155 | "* UML relation `publicationVenue` of class `Publication`: `https://schema.org/isPartOf`." 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "personal-values", 161 | "metadata": {}, 162 | "source": [ 163 | "## Using RDF in Python\n", 164 | "\n", 165 | "The [library `rdflib`](https://rdflib.readthedocs.io/en/stable/) provides classes and methods that allow one to create RDF graphs and populate them with RDF statements. It can be installed using the `pip` command as follows: \n", 166 | "\n", 167 | "```\n", 168 | "pip install rdflib\n", 169 | "```\n", 170 | "\n", 171 | "The [class `Graph`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.graph.Graph) is used to create an (initially empty) RDF graph, as follows:" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 13, 177 | "id": "historic-transcript", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from rdflib import Graph\n", 182 | "\n", 183 | "my_graph = Graph()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "environmental-marker", 189 | "metadata": {}, 190 | "source": [ 191 | "All the resources (including the properties) are defined using the [class `URIRef`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.term.URIRef). The constructor of this class takes as input a string representing the IRI (or URL) of the resource in consideration. For instance, the code below shows all the resources mentioned above, i.e. 
those referring to classes, attributes and relations:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 14, 197 | "id": "scientific-norfolk", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "from rdflib import URIRef\n", 202 | "\n", 203 | "# classes of resources\n", 204 | "JournalArticle = URIRef(\"https://schema.org/ScholarlyArticle\")\n", 205 | "BookChapter = URIRef(\"https://schema.org/Chapter\")\n", 206 | "Journal = URIRef(\"https://schema.org/Periodical\")\n", 207 | "Book = URIRef(\"https://schema.org/Book\")\n", 208 | "\n", 209 | "# attributes related to classes\n", 210 | "doi = URIRef(\"https://schema.org/identifier\")\n", 211 | "publicationYear = URIRef(\"https://schema.org/datePublished\")\n", 212 | "title = URIRef(\"https://schema.org/name\")\n", 213 | "issue = URIRef(\"https://schema.org/issueNumber\")\n", 214 | "volume = URIRef(\"https://schema.org/volumeNumber\")\n", 215 | "identifier = URIRef(\"https://schema.org/identifier\")\n", 216 | "name = URIRef(\"https://schema.org/name\")\n", 217 | "\n", 218 | "# relations among classes\n", 219 | "publicationVenue = URIRef(\"https://schema.org/isPartOf\")" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "competitive-louis", 225 | "metadata": {}, 226 | "source": [ 227 | "Instead, literals (i.e. values to specify as objects of RDF statements) can be created using the [class `Literal`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.term.Literal). The constructor of this class takes as input a value (of any basic type: it can be a string, an integer, etc.) and creates the related literal object in RDF, as shown in the next excerpt:" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 15, 233 | "id": "functioning-lemon", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from rdflib import Literal\n", 238 | "\n", 239 | "a_string = Literal(\"a string with this value\")\n", 240 | "a_number = Literal(42)\n", 241 | "a_boolean = Literal(True)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "trying-chaos", 247 | "metadata": {}, 248 | "source": [ 249 | "Using these classes it is possible to create all the Python objects necessary to create statements describing all the data to be pushed into an RDF graph. We need to use the [method `add`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.Graph.add) to add a new RDF statement to a graph. Such a method takes as input a tuple of three elements defining the subject (a `URIRef`), the predicate (another `URIRef`) and the object (either a `URIRef` or a `Literal`) of the statement.\n", 250 | "\n", 251 | "The following code shows how to populate the RDF graph using the data obtained by processing the two CSV documents presented in previous tutorials, i.e. [that of the publications](../01/01-publications.csv) and [that of the venues](../01/01-venues.csv). 
For instance, all the venues are created using the following code:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 16, 257 | "id": "ultimate-circle", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "from pandas import read_csv, Series\n", 262 | "from rdflib import RDF\n", 263 | "\n", 264 | "# This is the string defining the base URL used to define\n", 265 | "# the URLs of all the resources created from the data\n", 266 | "base_url = \"https://comp-data.github.io/res/\"\n", 267 | "\n", 268 | "venues = read_csv(\"../01/01-venues.csv\", \n", 269 | " keep_default_na=False,\n", 270 | " dtype={\n", 271 | " \"id\": \"string\",\n", 272 | " \"name\": \"string\",\n", 273 | " \"type\": \"string\"\n", 274 | " })\n", 275 | "\n", 276 | "venue_internal_id = {}\n", 277 | "for idx, row in venues.iterrows():\n", 278 | " local_id = \"venue-\" + str(idx)\n", 279 | " \n", 280 | " # The shape of the new resources that are venues is\n", 281 | " # 'https://comp-data.github.io/res/venue-<number>'\n", 282 | " subj = URIRef(base_url + local_id)\n", 283 | " \n", 284 | " # We put the new venue resources created here, to use them\n", 285 | " # when creating publications\n", 286 | " venue_internal_id[row[\"id\"]] = subj\n", 287 | " \n", 288 | " if row[\"type\"] == \"journal\":\n", 289 | " # RDF.type is the URIRef already provided by rdflib of the property \n", 290 | " # 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'\n", 291 | " my_graph.add((subj, RDF.type, Journal))\n", 292 | " else:\n", 293 | " my_graph.add((subj, RDF.type, Book))\n", 294 | " \n", 295 | " my_graph.add((subj, name, Literal(row[\"name\"])))\n", 296 | " my_graph.add((subj, identifier, Literal(row[\"id\"])))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "increasing-leisure", 302 | "metadata": {}, 303 | "source": [ 304 | "As you can see, all the RDF triples have been added to the graph, which currently contains the following number of distinct triples (coinciding with the number of data cells in the original table):" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 17, 310 | "id": "attractive-genius", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "-- Number of triples added to the graph after processing the venues\n", 318 | "12\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "print(\"-- Number of triples added to the graph after processing the venues\")\n", 324 | "print(len(my_graph))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "psychological-marketplace", 330 | "metadata": {}, 331 | "source": [ 332 | "The same approach can be used to add information about the publications, as follows:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 18, 338 | "id": "flexible-affiliate", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "publications = read_csv(\"../01/01-publications.csv\", \n", 343 | " keep_default_na=False,\n", 344 | " dtype={\n", 345 | " \"doi\": \"string\",\n", 346 | " \"title\": \"string\",\n", 347 | " \"publication year\": \"int\",\n", 348 | " \"publication venue\": \"string\",\n", 349 | " \"type\": \"string\",\n", 350 | " \"issue\": \"string\",\n", 351 | " \"volume\": \"string\"\n", 352 | " })\n", 353 | "\n", 354 | "for idx, row in publications.iterrows():\n", 355 | " local_id = \"publication-\" + str(idx)\n", 356 | " \n", 357 | " # The shape of the new resources that are publications is\n", 
358 | " # 'https://comp-data.github.io/res/publication-'\n", 359 | " subj = URIRef(base_url + local_id)\n", 360 | " \n", 361 | " if row[\"type\"] == \"journal article\":\n", 362 | " my_graph.add((subj, RDF.type, JournalArticle))\n", 363 | "\n", 364 | " # These two statements applies only to journal articles\n", 365 | " my_graph.add((subj, issue, Literal(row[\"issue\"])))\n", 366 | " my_graph.add((subj, volume, Literal(row[\"volume\"])))\n", 367 | " else:\n", 368 | " my_graph.add((subj, RDF.type, BookChapter))\n", 369 | " \n", 370 | " my_graph.add((subj, name, Literal(row[\"title\"])))\n", 371 | " my_graph.add((subj, identifier, Literal(row[\"doi\"])))\n", 372 | " \n", 373 | " # The original value here has been casted to string since the Date type\n", 374 | " # in schema.org ('https://schema.org/Date') is actually a string-like value\n", 375 | " my_graph.add((subj, publicationYear, Literal(str(row[\"publication year\"]))))\n", 376 | " \n", 377 | " # The URL of the related publication venue is taken from the previous\n", 378 | " # dictionary defined when processing the venues\n", 379 | " my_graph.add((subj, publicationVenue, venue_internal_id[row[\"publication venue\"]]))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "id": "taken-trace", 385 | "metadata": {}, 386 | "source": [ 387 | "After the addition of this new statements, the number of total RDF triples added to the graph is equal to all the cells in the venue CSV plus all the non-empty cells in the publication CSV:" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 19, 393 | "id": "identified-transfer", 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "-- Number of triples added to the graph after processing venues and publications\n", 401 | "31\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "print(\"-- Number of triples added to the graph after processing venues and publications\")\n", 407 | "print(len(my_graph))" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "id": "operational-sacrifice", 413 | "metadata": {}, 414 | "source": [ 415 | "It is worth mentioning that we should not map in RDF cells in the original table that do not contain any value. Thus, if for instance there is an `issue` cell in the publication CSV which is empty (i.e. no information about the issue have been specified), you should not create any RDF statement mapping such a non-information." 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "partial-communication", 421 | "metadata": {}, 422 | "source": [ 423 | "## How to create and populate a graph database with Python\n", 424 | "\n", 425 | "Once we have created our graph with all the triples we need, we can upload persistently the graph on our triplestore. In order to do that, we have to create an instance of the [class `SPARQLUpdateStore`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.plugins.stores.html#rdflib.plugins.stores.sparqlstore.SPARQLUpdateStore), which acts as a proxy to interact with the triplestore. 
 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "partial-communication", 421 | "metadata": {}, 422 | "source": [ 423 | "## How to create and populate a graph database with Python\n", 424 | "\n", 425 | "Once we have created our graph with all the triples we need, we can persistently upload the graph to our triplestore. In order to do that, we have to create an instance of the [class `SPARQLUpdateStore`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.plugins.stores.html#rdflib.plugins.stores.sparqlstore.SPARQLUpdateStore), which acts as a proxy to interact with the triplestore. The important thing is to open the connection with the store by passing, as input, a tuple of two strings (the query endpoint and the update endpoint which, in our case, are the same URL) defining the SPARQL endpoint of the triplestore where to upload the data.\n", 426 | "\n", 427 | "Then, we can upload the triples one by one, iterating over the list of RDF statements obtained by using the [method `triples`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.graph.Graph.triples) of the class `Graph` and passing as input a tuple with three `None` values (a pattern that matches all the triples in the graph), as follows:" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 20, 433 | "id": "saved-prescription", 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [
 437 | "from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore\n",
 438 | "\n",
 439 | "store = SPARQLUpdateStore()\n",
 440 | "\n",
 441 | "# The URL of the SPARQL endpoint is the same URL of the Blazegraph\n",
 442 | "# instance + '/sparql'\n",
 443 | "endpoint = 'http://127.0.0.1:9999/blazegraph/sparql'\n",
 444 | "\n",
 445 | "# It opens the connection with the SPARQL endpoint instance\n",
 446 | "store.open((endpoint, endpoint))\n",
 447 | "\n",
 448 | "for triple in my_graph.triples((None, None, None)):\n",
 449 | "    store.add(triple)\n",
 450 | "    \n",
 451 | "# Once finished, remember to close the connection\n",
 452 | "store.close()" 453 | ] 454 | }
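, { "cell_type": "markdown", "id": "robust-upload-note", "metadata": {}, "source": [ "As a side note, if anything goes wrong during the upload (e.g. the triplestore becomes unreachable), the code above terminates without closing the connection. A minimal, more defensive sketch of the same upload – reusing only the `store`, `endpoint` and `my_graph` objects defined above – wraps the iteration in a `try`/`finally` block:" ] }, { "cell_type": "code", "execution_count": null, "id": "robust-upload-code", "metadata": {}, "outputs": [], "source": [
"store.open((endpoint, endpoint))\n",
"\n",
"try:\n",
"    # Upload the triples one by one, as before\n",
"    for triple in my_graph.triples((None, None, None)):\n",
"        store.add(triple)\n",
"finally:\n",
"    # The connection is closed even if an exception was raised above\n",
"    store.close()" ] }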
 455 | ], 456 | "metadata": { 457 | "kernelspec": { 458 | "display_name": "Python 3", 459 | "language": "python", 460 | "name": "python3" 461 | }, 462 | "language_info": { 463 | "codemirror_mode": { 464 | "name": "ipython", 465 | "version": 3 466 | }, 467 | "file_extension": ".py", 468 | "mimetype": "text/x-python", 469 | "name": "python", 470 | "nbconvert_exporter": "python", 471 | "pygments_lexer": "ipython3", 472 | "version": "3.9.0" 473 | } 474 | }, 475 | "nbformat": 4, 476 | "nbformat_minor": 5 477 | } 478 | -------------------------------------------------------------------------------- /docs/handson/02/02-Implementation_of_data_models_via_Python_classes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "modified-haven", 6 | "metadata": {}, 7 | "source": [ 8 | "# Implementation of data models via Python classes\n", 9 | "\n", 10 | "In this tutorial, we see how to create Python classes to implement a model for the representation of data." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "universal-extraction", 16 | "metadata": {}, 17 | "source": [ 18 | "## What is a class in Python\n", 19 | "\n", 20 | "In Python, as in other [object-oriented programming languages](https://en.wikipedia.org/wiki/Object-oriented_programming), a class is an extensible template for creating objects having a specific type. All the basic types of values (e.g. strings, integers, booleans) and the other data structures (e.g. lists, sets, dictionaries) are defined by means of particular classes. \n", 21 | "\n", 22 | "In addition, each class makes available a set of [methods](https://en.wikipedia.org/wiki/Method_(computer_programming)) that allow one to interact with the objects (i.e. the instances) of such a class. A method is a particular function that can be run only if directly called via an object. For instance, the instruction `\"this is a string\".split(\" \")` executes the method `split` passing `\" \"` as the input parameter on the particular string object on which the method is called, i.e. the string `\"this is a string\"` (defined by the [class `str`](https://docs.python.org/3/library/stdtypes.html#str) in Python)." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "shaped-elevation", 28 | "metadata": {}, 29 | "source": [ 30 | "## Defining a data model using Python classes\n", 31 | "\n", 32 | "[Python classes](https://docs.python.org/3/tutorial/classes.html), as the name may recall, can be used to implement a particular data model such as that introduced in the following diagram using the [Unified Modelling Language (UML)](https://en.wikipedia.org/wiki/Unified_Modeling_Language). We will use this example to understand how to implement classes in Python, and to show how they work.\n", 33 | "\n", 34 | "![UML diagram of a data model](uml.png)\n", 35 | "\n", 36 | "As you can see from the diagram above, we defined six distinct classes which are, somehow, related to each other. Let us see how to define this structure in Python." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "blessed-hardwood", 42 | "metadata": {}, 43 | "source": [ 44 | "### Defining our first class\n", 45 | "\n", 46 | "For defining classes in Python, one has to use the following signature:\n", 47 | "\n", 48 | "```\n", 49 | "class <class name>(<superclass 1>, <superclass 2>, ...):\n", 50 | "    def __init__(self, <param 1>, <param 2>, ...):\n", 51 | "        ...\n", 52 | "```\n", 53 | "\n", 54 | "In the excerpt above, `<class name>` is the name one wants to assign to a class, while `<superclass 1>`, `<superclass 2>`, etc., indicate the superclasses from which this class is derived. In Python, all new classes must be subclasses of the generic class `object`. Instead, the indented `def __init__` is a special method defining the constructor of an object of that class, and it will be called every time one wants to create a new object (instance) of this type. For instance, when we create a new set in Python using `set()`, we are calling the constructor of the [class `set`](https://docs.python.org/3/library/stdtypes.html#set), defined as shown above.\n", 55 | "\n", 56 | "It is worth mentioning that all the methods of a class, including its constructor, must specify `self` as the first parameter. This special parameter represents the instance of the class in consideration. In practice, every time we instantiate a new object of that class, `self` will be assigned to that object and provides access to its attributes (i.e. variables assigned with particular values for that object) and methods as defined in the related class. In particular, it is used to access all object-related information within the class itself.\n", 57 | "\n", 58 | "For instance, by using such a `self` parameter, it is possible to create variables and associated values that are local to a particular object of that class. 
In the following excerpt, we use it to define the constructor of the class `Venue` in the data model shown above as a UML diagram:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "id": "processed-physiology", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [
 68 | "class Venue(object):\n",
 69 | "    def __init__(self, identifiers, name):\n",
 70 | "        self.id = set()\n",
 71 | "        for identifier in identifiers:\n",
 72 | "            self.id.add(identifier)\n",
 73 | "        \n",
 74 | "        self.name = name" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "affecting-collectible", 80 | "metadata": {}, 81 | "source": [ 82 | "As shown in the code above, the class `Venue` is defined as a subclass of the top class `object`, and its constructor takes in input three parameters: `self` (as explained above), `identifiers` and `name`. \n", 83 | "\n", 84 | "The parameter `identifiers` is used to take in input a collection of strings that contains all the identifiers of such an object. In the above code, I decided to handle all the items included in the collection using a set to comply with the declaration in the data model class, which requires a collection of one or more string identifiers (`id : string [1..*]`). Indeed, I have created a new variable `id` related to the particular object of the class `self` (i.e. `self.id`) and I have assigned a new set to it. Then, I added all the identifiers in the input collection to the set using the [set method `add`](https://docs.python.org/3/library/stdtypes.html#frozenset.add) (i.e. via the instruction `self.id.add(identifier)`).\n", 85 | "\n", 86 | "Instead, the parameter `name` is used to specify the name of a particular venue. Thus, I have just assigned it to the variable `name` of the object `self` (i.e. `self.name`) to mimic the data model attribute `name : str [1]`. Of course, I could also use a different structure to store this information - for instance, I could use a set containing only one value. The important thing here, while trying to map the data model into a Python class, is to be compliant with the data model declaration. I chose to assign it directly to a variable, supposing that the input will be a simple string.\n", 87 | "\n", 88 | "In practice, thanks to the `self` keyword, I can create new independent variables for each new object created using this class." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "authorized-weekend", 94 | "metadata": {}, 95 | "source": [ 96 | "### Representing relations in Python\n", 97 | "\n", 98 | "The Python class defined above represents (by means of its constructor) all the attributes associated to the related data model class. However, in data models, there are also relations that may exist between different kinds of objects, such as the relation `publicationVenue` between the data model classes `Publication` and `Venue`. In Python, such relations can be represented like the other attributes, i.e. 
by assigning some specific values to `self`-declared variables, as shown in the following excerpt:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 2, 104 | "id": "alpha-reading", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [
 108 | "class Publication(object):\n",
 109 | "    def __init__(self, doi, publicationYear, title, publicationVenue):\n",
 110 | "        self.doi = doi\n",
 111 | "        self.publicationYear = publicationYear\n",
 112 | "        self.title = title\n",
 113 | "        self.publicationVenue = publicationVenue" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "considerable-lancaster", 119 | "metadata": {}, 120 | "source": [ 121 | "As shown in the excerpt above, the constructor of the class `Publication` takes in input not only the attributes of the related data model class but also its relations (i.e. the relations from which the class is the starting point), and considers them as additional parameters of the constructor. Then, they will be handled as the others. Of course, the object specified in the parameter `publicationVenue` should be of class `Venue`, defined above." 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "raising-salem", 127 | "metadata": {}, 128 | "source": [ 129 | "### Instantiating a class\n", 130 | "\n", 131 | "Once classes are defined, we can use them to instantiate objects of that kind. To do so, we call their constructor (using the name of the class) passing the parameters it requires **except** `self`, which will be implicitly considered. In practice, for creating a new object of class `Venue`, we need to specify only two parameters, i.e. those for `identifiers` (i.e. a collection of strings) and `name` (i.e. a string). As an example, let us consider again the first two items of the [venues CSV file](../01/01-venues.csv) we have introduced in the previous tutorial, i.e.:\n", 132 | "\n", 133 | "| id | name | type |\n", 134 | "|---|---|---|\n", 135 | "| 1531-6912 | Comparative and Functional Genomics | journal |\n", 136 | "| 1367-5931 | Current Opinion in Chemical Biology | journal |\n", 137 | "\n", 138 | "These two entities (i.e. venues) can be defined using the Python class `Venue` as follows:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "id": "dated-contamination", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "venue_1 = Venue([\"1531-6912\"], \"Comparative and Functional Genomics\")\n", 149 | "venue_2 = Venue([\"1367-5931\"], \"Current Opinion in Chemical Biology\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "solid-example", 155 | "metadata": {}, 156 | "source": [ 157 | "As shown in the above excerpt, I have created two new objects, assigned to two distinct variables, one for each venue. All the values specified as input of the constructor have been assigned to the `self` variables of each object, which are distinct while sharing the same structure. 
Indeed, using the Python built-in [function `id`](https://docs.python.org/3/library/functions.html#id) (that takes in input an object and returns the unique integer identifying it) and [function `type`](https://docs.python.org/3/library/functions.html#type) (that takes in input an object and returns its related type), it is possible to see that `venue_1` and `venue_2` are different objects of the same class:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "id": "reported-sixth", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "The objects in 'venue_1' and 'venue_2' share the same class --> True\n", 171 | "Indeed, the types of the two objects are both <class '__main__.Venue'>\n", 172 | "\n", 173 | "The objects in 'venue_1' and 'venue_2' are the same object --> False\n", 174 | "Indeed, the integers identifying the two objects are 140587130074496 and 140587130073872 respectively\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "print(\"The objects in 'venue_1' and 'venue_2' share the same class -->\", type(venue_1) == type(venue_2))\n", 180 | "print(\"Indeed, the types of the two objects are both\", type(venue_1))\n", 181 | "\n", 182 | "print(\"\\nThe objects in 'venue_1' and 'venue_2' are the same object -->\", id(venue_1) == id(venue_2))\n", 183 | "print(\"Indeed, the integers identifying the two objects are\", id(venue_1), \"and\", id(venue_2), \"respectively\")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "cognitive-property", 189 | "metadata": {}, 190 | "source": [ 191 | "Similarly, we can also create new objects of other classes, such as `Publication`. In this case, the last parameter of the constructor of `Publication` (i.e. `publicationVenue`) should take in input an object having class `Venue` as defined above. 
As another example, let us consider again the first two items of the [publications CSV file](../01/01-publications.csv) we have introduced in the previous tutorial, i.e.:\n", 192 | "\n", 193 | "| doi | title | publication year | publication venue | type | issue | volume |\n", 194 | "|---|---|---|---|---|---|---|\n", 195 | "| 10.1002/cfg.304 | Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information | 2003 | 1531-6912 | journal article | 4 | 4 |\n", 196 | "| 10.1016/s1367-5931(02)00332-0 | In vitro selection as a powerful tool for the applied evolution of proteins and peptides | 2002 | 1367-5931 | journal article | 3 | 6 |\n", 197 | "\n", 198 | "These two publications can be defined using the Python class `Publication` as follows:" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 5, 204 | "id": "outer-steal", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [
 208 | "publication_1 = Publication(\"10.1002/cfg.304\", \n",
 209 | "                            2003, \n",
 210 | "                            \"Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information\", \n",
 211 | "                            venue_1)\n",
 212 | "\n",
 213 | "publication_2 = Publication(\"10.1016/s1367-5931(02)00332-0\", \n",
 214 | "                            2002, \n",
 215 | "                            \"In vitro selection as a powerful tool for the applied evolution of proteins and peptides\", \n",
 216 | "                            venue_2)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "id": "hybrid-vertical", 222 | "metadata": {}, 223 | "source": [ 224 | "It is worth mentioning that, as shown in the excerpt above, we have not specified the identifier of a particular venue as input, but rather we have provided the `Venue` object representing such a venue, as also defined by the relation `publicationVenue` specified in the data model." 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "miniature-welsh", 230 | "metadata": {}, 231 | "source": [ 232 | "### Creating subclasses of a given class\n", 233 | "\n", 234 | "As you may have noticed, we did not map all the columns of the CSV documents introduced above in the classes we have defined. Indeed, the data model above actually specifies some of this information (for instance the concept of publication type and the fields `issue` and `volume`) in subclasses of `Publication` and `Venue`. Python makes available a mechanism to create new classes as subclasses of existing ones, thus inheriting all the attributes and methods that the superclasses already implement, similar to what a data model enables. \n", 235 | "\n", 236 | "We can use the same signature adopted for classes for creating subclasses by specifying the classes to extend in the definition of the class, as we already did specifying the class `object` as top class of `Publication` and `Venue`, as follows:" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 6, 242 | "id": "charming-minister", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [
 246 | "class BookChapter(Publication):\n",
 247 | "    pass\n",
 248 | "\n",
 249 | "class Journal(Venue):\n",
 250 | "    pass\n",
 251 | "\n",
 252 | "class Book(Venue):\n",
 253 | "    pass" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "unable-module", 259 | "metadata": {}, 260 | "source": [ 261 | "In the code above, the body of each class extending the classes `Publication` and `Venue` is left unspecified. 
This means that the new subclasses inherit (and can access via `self`) all the attributes and methods (including the constructor) from the superclass. Thus, the only thing they really add in this case is the specification of a new characterising type, which mimics the `type` field of the CSV file presented above.\n", 262 | "\n", 263 | "However, adding such new information is enough for classifying them as distinct classes, even if one (e.g. `Journal`) is a subclass of another (e.g. `Venue`). Indeed, in the following code, I create a new instance of the class `Journal` using the same input values of `venue_1`, specified above. As you can see, the types returned for these two objects are indeed different:" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 7, 269 | "id": "alert-conducting", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "The objects in 'journal_1' and 'venue_1' share the same class --> False\n", 277 | "Indeed, the types of the two objects are <class '__main__.Journal'> and <class '__main__.Venue'> respectively\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "# An object of class 'Journal' is instantiated using the same parameters\n", 283 | "# of the constructor of its parent class 'Venue' since 'Journal' does not\n", 284 | "# define any explicit constructor\n", 285 | "journal_1 = Journal([\"1531-6912\"], \"Comparative and Functional Genomics\")\n", 286 | "\n", 287 | "print(\"The objects in 'journal_1' and 'venue_1' share the same class -->\", type(journal_1) == type(venue_1))\n", 288 | "print(\"Indeed, the types of the two objects are\", type(journal_1), \"and\", type(venue_1), \"respectively\")" 289 | ] 290 | },
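 { "cell_type": "markdown", "id": "isinstance-note", "metadata": {}, "source": [ "Note that, even though the types of `journal_1` and `venue_1` are different, an object of a subclass is still an instance of its superclass. A minimal sketch of this fact uses the built-in [function `isinstance`](https://docs.python.org/3/library/functions.html#isinstance), which takes in input an object and a class and returns `True` if the object is an instance of that class or of any of its subclasses:" ] }, { "cell_type": "code", "execution_count": null, "id": "isinstance-code", "metadata": {}, "outputs": [], "source": [
"# 'journal_1' is not only a 'Journal': since 'Journal' is a subclass\n",
"# of 'Venue', it is also an instance of 'Venue'\n",
"print(isinstance(journal_1, Journal))  # True\n",
"print(isinstance(journal_1, Venue))    # True\n",
"\n",
"# The opposite does not hold: a generic venue is not a journal\n",
"print(isinstance(venue_1, Journal))    # False" ] },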
 291 | { 292 | "cell_type": "markdown", 293 | "id": "social-attack", 294 | "metadata": {}, 295 | "source": [ 296 | "Of course, in some cases, the new subclass may take in input additional information compared to its superclass. In these cases, e.g. for mapping in Python the data model class `JournalArticle` that introduces also the attributes `issue` and `volume`, it would be necessary to define an appropriate constructor extending that of the parent superclass. An implementation of the Python class `JournalArticle` is shown as follows:" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "id": "affiliated-ridge", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [
 306 | "class JournalArticle(Publication):\n",
 307 | "    def __init__(self, doi, publicationYear, title, publicationVenue, issue, volume):\n",
 308 | "        self.issue = issue\n",
 309 | "        self.volume = volume\n",
 310 | "        \n",
 311 | "        # Here is where the constructor of the superclass is explicitly recalled, so as\n",
 312 | "        # to handle the input parameters as done in the superclass\n",
 313 | "        super().__init__(doi, publicationYear, title, publicationVenue)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "id": "wanted-character", 319 | "metadata": {}, 320 | "source": [ 321 | "In the code above, the additional parameters `issue` and `volume` are handled as before, while all the others are transferred to the constructor of the superclass, accessed by using the [function `super`](https://docs.python.org/3.5/library/functions.html#super) (which returns a proxy object that delegates method calls to the parent class) and then calling the `__init__` constructor with all the expected parameters **except** `self`. In this case, to instantiate an object of class `JournalArticle`, all the input parameters must be specified:" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "id": "accurate-supply", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [
 331 | "journal_article_1 = JournalArticle(\"10.1002/cfg.304\", \n",
 332 | "                                   2003, \n",
 333 | "                                   \"Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information\", \n",
 334 | "                                   journal_1, \n",
 335 | "                                   \"4\", \n",
 336 | "                                   \"4\")" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "weird-tragedy", 342 | "metadata": {}, 343 | "source": [ 344 | "## Extending classes with methods\n", 345 | "\n", 346 | "Once an object of a certain class is created, one can access all its attributes (i.e. those assigned to `self` variables) directly by their name using the following syntax: `<object>.<attribute name>`. For instance, if we want to print on screen the title of the journal article we have just created, we can run the following code:" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 10, 352 | "id": "driven-governor", 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(journal_article_1.title)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "id": "incorrect-shareware", 370 | "metadata": {}, 371 | "source": [ 372 | "In principle, such a way of referring to the specific attributes of an object also allows one to modify their values directly, by assigning a new value to them as we do for any variable, for instance:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 11, 378 | "id": "amateur-burner", 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "My new title!\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "journal_article_1.title = \"My new title!\"\n", 391 | "print(journal_article_1.title)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "id": "bright-firmware", 397 | "metadata": {}, 398 | "source": [ 399 | "However, this way of modifying object attributes is not safe and may have undesired outcomes if someone does not know how to deal with it properly. Thus, in order to facilitate the interaction with an object's attributes and to provide operations to work with and manipulate them, Python (and all the other object-oriented programming languages) allows one to create [methods](https://en.wikipedia.org/wiki/Method_(computer_programming)). \n", 400 | "\n", 401 | "A method of a class encapsulates an operation that can be run on an object of that class and that can, in principle, be responsible for acting upon the attributes related to that object. In practice, methods are just functions tied to specific classes, and can also provide a mechanism to read (safely) the values assigned to object attributes without accessing them directly.\n", 402 | "\n", 403 | "We can define methods visually by using UML, the same language we have initially adopted for defining our exemplar data model. 
Indeed, UML was originally developed as a general-purpose modelling language in the field of software engineering, and provides constructs that permit the description of a software system including classes and their methods – even if it is also useful for defining a generic data model. The following diagram shows an extension of the data model presented above with the inclusion of new methods for accessing and, in some cases, modifying the status of particular object attributes.\n", 404 | "\n", 405 | "![An extended UML diagram with additional methods](uml2.png)\n", 406 | "\n", 407 | "In UML, the methods are listed just after the attributes of a given class, following the signature:\n", 408 | "\n", 409 | "```\n", 410 | "<method name>(<param 1> : <type 1>, <param 2> : <type 2>, ...) : <return type>\n", 411 | "``` \n", 412 | "\n", 413 | "For instance, the method `getDOI()` (no input needed here) of the class `Publication` returns a string, i.e. the DOI assigned to the particular publication; instead, the method `addId(identifier : string)` returns a boolean value that states if the operation of adding the string `identifier` to the set of identifiers of the class `Venue` went well (i.e. returned `True`) or not (i.e. returned `False`). Of course, this precise specification of the meaning of the return value of each method is not defined in the diagram itself, but accompanies the diagram as a natural language description of what each method should do. However, the diagram already specifies the kinds of input and related output each method must take and provide, respectively." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "id": "massive-alexander", 419 | "metadata": {}, 420 | "source": [ 421 | "### Defining a method in Python\n", 422 | "\n", 423 | "Python uses the same structure seen for the constructor for defining all the other methods:\n", 424 | "\n", 425 | "```\n", 426 | "def <method name>(self, <param 1>, <param 2>, ...):\n", 427 | "    ...\n", 428 | "```\n", 429 | "\n", 430 | "The only thing that changes here is that one can specify the name of the method. For instance, let us define all the methods of the class `Publication` as defined in the diagram - the rationale behind each method should be self-explanatory:" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 12, 436 | "id": "verified-essex", 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [
 440 | "class Publication(object):\n",
 441 | "    def __init__(self, doi, publicationYear, title, publicationVenue):\n",
 442 | "        self.doi = doi\n",
 443 | "        self.publicationYear = publicationYear\n",
 444 | "        self.title = title\n",
 445 | "        self.publicationVenue = publicationVenue\n",
 446 | "    \n",
 447 | "    def getDOI(self):\n",
 448 | "        return self.doi\n",
 449 | "    \n",
 450 | "    def getPublicationYear(self):\n",
 451 | "        return self.publicationYear\n",
 452 | "    \n",
 453 | "    def getTitle(self):\n",
 454 | "        return self.title\n",
 455 | "    \n",
 456 | "    def getPublicationVenue(self):\n",
 457 | "        return self.publicationVenue" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "id": "designing-registrar", 463 | "metadata": {}, 464 | "source": [ 465 | "As shown in the code above, the methods defined add a few hooks to access the value of all the attributes of the class. Then, one can call these methods as done for the other built-in classes, i.e. 
using the signature `<object>.<method name>(<param 1>, <param 2>, ...)` (as for the constructor, the `self` parameter must not be specified when calling a method), as follows:" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 13, 471 | "id": "racial-detective", 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "-- The title of this publication is:\n", 479 | "In vitro selection as a powerful tool for the applied evolution of proteins and peptides\n" 480 | ] 481 | } 482 | ], 483 | "source": [
 484 | "# It uses the most recent definition of the class 'Publication', i.e. that with\n",
 485 | "# the new methods implemented\n",
 486 | "publication_2 = Publication(\"10.1016/s1367-5931(02)00332-0\", \n",
 487 | "                            2002, \n",
 488 | "                            \"In vitro selection as a powerful tool for the applied evolution of proteins and peptides\", \n",
 489 | "                            venue_2)\n",
 490 | "\n",
 491 | "print(\"-- The title of this publication is:\")\n",
 492 | "print(publication_2.getTitle())" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "id": "effective-disaster", 498 | "metadata": {}, 499 | "source": [ 500 | "Using methods permits one to detach the rationale used to store information about the attributes from the particular contract-like commitment defined by the UML diagram, which is what the user expects from running a method. For instance, let us see the methods of the class `Venue`:" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 14, 506 | "id": "floppy-radical", 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [
 510 | "class Venue(object):\n",
 511 | "    def __init__(self, identifiers, name):\n",
 512 | "        self.id = set()\n",
 513 | "        for identifier in identifiers:\n",
 514 | "            self.id.add(identifier)\n",
 515 | "        \n",
 516 | "        self.name = name\n",
 517 | "    \n",
 518 | "    def getIds(self):\n",
 519 | "        result = []\n",
 520 | "        for identifier in self.id:\n",
 521 | "            result.append(identifier)\n",
 522 | "        result.sort()\n",
 523 | "        return result\n",
 524 | "    \n",
 525 | "    def getName(self):\n",
 526 | "        return self.name\n",
 527 | "    \n",
 528 | "    def addId(self, identifier):\n",
 529 | "        result = True\n",
 530 | "        if identifier not in self.id:\n",
 531 | "            self.id.add(identifier)\n",
 532 | "        else:\n",
 533 | "            result = False\n",
 534 | "        return result\n",
 535 | "    \n",
 536 | "    def removeId(self, identifier):\n",
 537 | "        result = True\n",
 538 | "        if identifier in self.id:\n",
 539 | "            self.id.remove(identifier)\n",
 540 | "        else:\n",
 541 | "            result = False\n",
 542 | "        return result" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "id": "packed-taiwan", 548 | "metadata": {}, 549 | "source": [ 550 | "As you can see from the new UML diagram with methods, the method `getIds` must return a list of strings, even if we have originally defined the attribute `self.id` as a set. Thus, it is up to the method to implement the request as defined in the diagram. In particular, in the implementation above, a new list has been created which contains the same identifiers in the attribute set `self.id`, but ordered alphabetically. 
The list returned by the method and the set in `self.id` are two different objects (containing the same items), as shown in the following excerpt:" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 15, 556 | "id": "painted-management", 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "name": "stdout", 561 | "output_type": "stream", 562 | "text": [ 563 | "The value in 'self.id' and that returned by the method 'getIds' are two different objects -->\n", 564 | "True\n", 565 | "\n", 566 | "However, they both contain the same collection of elements -->\n", 567 | "True\n" 568 | ] 569 | } 570 | ], 571 | "source": [
 572 | "venue_1 = Venue([\"1531-6912\"], \"Comparative and Functional Genomics\")\n",
 573 | "\n",
 574 | "print(\"The value in 'self.id' and that returned by the method 'getIds' are two different objects -->\")\n",
 575 | "print(id(venue_1.id) != id(venue_1.getIds())) \n",
 576 | "\n",
 577 | "print(\"\\nHowever, they both contain the same collection of elements -->\")\n",
 578 | "print(len(venue_1.id.difference(venue_1.getIds())) == 0)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "id": "superb-concord", 584 | "metadata": {}, 585 | "source": [ 586 | "This way of handling the interaction with class attributes may also prevent some undesired effects on mutable values – as a reminder, please see the section \"Clarification: immutable and mutable values\" in the [chapter \"Divide and conquer\" of the *Computational Thinking and Programming Book*](https://comp-think.github.io/book/09.pdf). For instance:\n", 587 | "\n", 588 | "1. What would happen if the method `getIds` directly returned the set in `self.id`? \n", 589 | "2. What would happen if such a set, retrieved by using the method mentioned in the previous question, were then directly modified by a user via the `add` method of the `set` class?\n", 590 | "3. How can the structure of the implementation of `getIds` in the code above prevent these issues? (A sketch illustrating the first two questions is shown below.)" 591 | ] 592 | },
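 { "cell_type": "markdown", "id": "mutable-sketch-note", "metadata": {}, "source": [ "To make the issue behind these questions concrete, here is a minimal sketch – the class `UnsafeVenue` and its method `getIdsUnsafe` are introduced purely for illustration – showing what can go wrong when a method returns the internal mutable set directly:" ] }, { "cell_type": "code", "execution_count": null, "id": "mutable-sketch-code", "metadata": {}, "outputs": [], "source": [
"# A hypothetical unsafe variant of 'Venue' returning 'self.id' directly\n",
"class UnsafeVenue(Venue):\n",
"    def getIdsUnsafe(self):\n",
"        return self.id  # the caller receives the internal mutable set\n",
"\n",
"unsafe_venue = UnsafeVenue([\"1531-6912\"], \"Comparative and Functional Genomics\")\n",
"\n",
"ids = unsafe_venue.getIdsUnsafe()\n",
"ids.add(\"spurious-id\")  # this silently modifies the internal state!\n",
"\n",
"# The spurious identifier is now part of 'self.id', while the safe\n",
"# 'getIds' returns a new list, leaving 'self.id' untouched\n",
"print(unsafe_venue.getIds())" ] },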
 593 | { 594 | "cell_type": "markdown", 595 | "id": "literary-expense", 596 | "metadata": {}, 597 | "source": [ 598 | "### What about methods and inheritance\n", 599 | "\n", 600 | "Superclass inheritance also applies to methods, not only to attributes. For instance, let us introduce the extended implementation of the class `JournalArticle` shown above, where we also add the implementation of the two additional methods `getIssue` and `getVolume` as defined in the last UML diagram:" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 16, 606 | "id": "legitimate-creator", 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [
 610 | "class JournalArticle(Publication):\n",
 611 | "    def __init__(self, doi, publicationYear, title, publicationVenue, issue, volume):\n",
 612 | "        self.issue = issue\n",
 613 | "        self.volume = volume\n",
 614 | "        \n",
 615 | "        # Here is where the constructor of the superclass is explicitly recalled, so as\n",
 616 | "        # to handle the input parameters as done in the superclass\n",
 617 | "        super().__init__(doi, publicationYear, title, publicationVenue)\n",
 618 | "    \n",
 619 | "    def getIssue(self):\n",
 620 | "        return self.issue\n",
 621 | "    \n",
 622 | "    def getVolume(self):\n",
 623 | "        return self.volume" 624 | ] 625 | }, 626 | { 627 | "cell_type": "markdown", 628 | "id": "anticipated-discount", 629 | "metadata": {}, 630 | "source": [ 631 | "In practice, when we create a new `JournalArticle` object, it will have available the methods that the class `JournalArticle` defines plus all those defined by its ancestor superclasses, at any level of the hierarchy (since one can create a non-circular tree of superclass-subclass relations among a chain of different classes). The following code shows how both the methods of the subclass and those of the superclass work as expected on objects having class `JournalArticle`:" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 17, 637 | "id": "caroline-trunk", 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "name": "stdout", 642 | "output_type": "stream", 643 | "text": [ 644 | "-- The title of the journal article (method defined in the superclass 'Publication')\n", 645 | "Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information\n", 646 | "\n", 647 | "-- The issue of the journal article (method defined in the class 'JournalArticle')\n", 648 | "4\n" 649 | ] 650 | } 651 | ], 652 | "source": [
 653 | "# It uses the most recent definition of the class 'JournalArticle', i.e. that with\n",
 654 | "# the new methods implemented\n",
 655 | "journal_article_1 = JournalArticle(\"10.1002/cfg.304\", \n",
 656 | "                                   2003, \n",
 657 | "                                   \"Development of Computational Tools for the Inference of Protein Interaction Specificity Rules and Functional Annotation Using Structural Information\", \n",
 658 | "                                   journal_1, \n",
 659 | "                                   \"4\", \n",
 660 | "                                   \"4\")\n",
 661 | "\n",
 662 | "print(\"-- The title of the journal article (method defined in the superclass 'Publication')\")\n",
 663 | "print(journal_article_1.getTitle())\n",
 664 | "\n",
 665 | "print(\"\\n-- The issue of the journal article (method defined in the class 'JournalArticle')\")\n",
 666 | "print(journal_article_1.getIssue())" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "id": "subsequent-spread", 672 | "metadata": {}, 673 | "source": [ 674 | "More information about the dynamics of class inheritance is introduced and detailed in the [chapter \"Understanding Inheritance\" of *How To Code in Python*](https://www.digitalocean.com/community/books/digitalocean-ebook-how-to-code-in-python)."
675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "id": "nervous-emergency", 680 | "metadata": {}, 681 | "source": [ 682 | "### Full UML diagram implementation\n", 683 | "\n", 684 | "I have implemented in a [single Python file](myclasses.py) all the classes introduced in the last UML diagram. They can be imported and reused in other files as shown in the [classuse.py](classuse.py) file using the following import notation:\n", 685 | "\n", 686 | "```\n", 687 | "from <module> import <class 1>, <class 2>, ...\n", 688 | "```\n", 689 | "\n", 690 | "You can simply run all the instructions in the latter file by running the following command:\n", 691 | "\n", 692 | "```\n", 693 | "python classuse.py\n", 694 | "```\n",
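"\n",
"For instance, assuming that `myclasses.py` defines the same classes (and methods) shown above, a minimal usage sketch could be:\n",
"\n",
"```\n",
"from myclasses import Venue, Journal, JournalArticle\n",
"\n",
"venue = Venue([\"1531-6912\"], \"Comparative and Functional Genomics\")\n",
"print(venue.getName())\n",
"```"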
 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "id": "5c404ccb-c6eb-4903-99f5-bc9a6e591a94", 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [] 704 | } 705 | ], 706 | "metadata": { 707 | "kernelspec": { 708 | "display_name": "Python 3", 709 | "language": "python", 710 | "name": "python3" 711 | }, 712 | "language_info": { 713 | "codemirror_mode": { 714 | "name": "ipython", 715 | "version": 3 716 | }, 717 | "file_extension": ".py", 718 | "mimetype": "text/x-python", 719 | "name": "python", 720 | "nbconvert_exporter": "python", 721 | "pygments_lexer": "ipython3", 722 | "version": "3.9.0" 723 | } 724 | }, 725 | "nbformat": 4, 726 | "nbformat_minor": 5 727 | } 728 | -------------------------------------------------------------------------------- /docs/handson/04/04-Configuring_and_populating_a_relational_database.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "applicable-float", 6 | "metadata": {}, 7 | "source": [ 8 | "# Configuring and populating a relational database\n", 9 | "\n", 10 | "In this tutorial we introduce how to use [SQLite](https://en.wikipedia.org/wiki/SQLite) in Python." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "legal-stable", 16 | "metadata": {}, 17 | "source": [ 18 | "## SQLite\n", 19 | "\n", 20 | "[SQLite](https://www.sqlite.org) is a relational database management system (RDBMS) which can be embedded into the end program and does not follow a classic client–server architecture, where the server database is independent and is actually accessed by client programs.\n", 21 | "\n", 22 | "Python includes [SQLite within its standard library](https://docs.python.org/3/library/sqlite3.html) - this means that you can create and modify SQLite databases directly from Python code. This greatly simplifies a first approach to DBMSs, as we will see in the following sections. In addition, there are already several [documents on how to use SQLite in Python](https://www.digitalocean.com/community/tutorials/how-to-use-the-sqlite3-module-in-python-3) that are worth reading to get more details about the features it provides. In this tutorial, we will see the main constructs used to create a database and populate it with tables." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "ready-integer", 28 | "metadata": {}, 29 | "source": [ 30 | "## Create a new database\n", 31 | "\n", 32 | "First of all, we have to import functions defined in the [`sqlite3` package](https://docs.python.org/3/library/sqlite3.html) in order to use its classes to create and execute operations on a database. The first thing we need, in particular, is to use the [class `Connection`](https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection). This class is responsible for connecting to a particular database defined as a file. In SQLite, any database is actually stored in one single `.db` file. A new object having class `Connection` is created by calling the [function `connect`](https://docs.python.org/3/library/sqlite3.html#sqlite3.connect), as follows:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "id": "technical-collect", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [
 42 | "from sqlite3 import connect\n",
 43 | "\n",
 44 | "with connect(\"publications.db\") as con:\n",
 45 | "    # do some operation with the new connection\n",
 46 | "    \n",
 47 | "    con.commit() # commit the current transaction to the database\n",
 48 | "    \n",
 49 | "    # when you finish, 'with' commits or rolls back the pending transaction\n",
 50 | "    # automatically (note that, unlike files, the connection itself is not closed)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "bright-filling", 55 | "metadata": {}, 56 | "source": [ 57 | "As you can see, the way we use a new connection to a database is similar to what we have seen with files in previous tutorials, using the instruction `with`. Note that, strictly speaking, the `with` block of a `sqlite3` connection takes care of the current [transaction](https://en.wikipedia.org/wiki/Database_transaction) (i.e. the set of operations that may have changed the status of a database), committing it on success and rolling it back on failure, rather than closing the connection itself. In any case, it is good practice to run the [method `commit`](https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.commit) explicitly in order to commit the current transaction to the database itself. Of course, it is possible to call the method `commit` more than once during the lifespan of a connection. In practice, you can call it every time you execute an operation that modifies the status of the database, in order to record such modifications into the file system.\n", 58 | "\n", 59 | "It is worth mentioning that the `.db` file specified as input of the function `connect` is created if no database is available with that name in the path specified. However, if such a file already exists, it will be loaded by the connection with the information it already stores." 60 | ] 61 | },
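 { "cell_type": "markdown", "id": "sqlite-execute-note", "metadata": {}, "source": [ "Before moving to our data model, the following minimal sketch shows how plain SQL statements can be executed on such a connection through the standard `sqlite3` API. The table and column names used here are purely illustrative, and the special path `:memory:` creates a temporary database living only in RAM, so that no `.db` file is created:" ] }, { "cell_type": "code", "execution_count": null, "id": "sqlite-execute-code", "metadata": {}, "outputs": [], "source": [
"from sqlite3 import connect\n",
"\n",
"with connect(\":memory:\") as con:\n",
"    cur = con.cursor()  # a cursor is used to execute SQL statements\n",
"    \n",
"    # Purely illustrative table, unrelated to the data model below\n",
"    cur.execute(\"CREATE TABLE Example (internalId TEXT, name TEXT)\")\n",
"    cur.execute(\"INSERT INTO Example VALUES (?, ?)\", (\"venue-0\", \"A name\"))\n",
"    con.commit()  # record the modifications\n",
"    \n",
"    # Read the rows back to check the insertion\n",
"    print(cur.execute(\"SELECT * FROM Example\").fetchall())" ] },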
 62 | { 63 | "cell_type": "markdown", 64 | "id": "paperback-senator", 65 | "metadata": {}, 66 | "source": [ 67 | "## From a diagram to tables\n", 68 | "\n", 69 | "Before starting to populate the database, it is necessary to create the appropriate tables that enable the description of all the entities (and related data) we need. In order to understand what to do, it is important to look at the UML diagram describing the data model introduced in a previous tutorial.\n", 70 | "\n", 71 | "![UML diagram of a data model](../02/uml.png)\n", 72 | "\n", 73 | "There are different strategies that we can follow to create the tables describing a data model such as that described above. For instance, we can approach such a translation as follows:\n", 74 | "\n", 75 | "* Create a table for each class which does not have any subclass (i.e. the most concrete classes), by using as columns all single-valued attributes and relations defined in such a class and all its superclasses. Concretely, we create four tables, i.e. for `JournalArticle` and `BookChapter` (which inherit all the attributes and relations of `Publication`), and `Journal` and `Book` (which inherit all the attributes and relations of `Venue`).\n", 76 | "\n", 77 | "* For each of the tables above, add also a new column that enables us to specify an internal identifier we use to clearly identify all the entities of the various types. In this case, it is enough to add an additional column in each table, e.g. `internalId`. Suggestion: having an internal identifier which is globally unique in the database is the way to go.\n", 78 | "\n", 79 | "* Keep in mind that the value of all the columns related to relations must point to an internal identifier defined in some of the tables. For instance, the column `publicationVenue` in the table `JournalArticle` will contain an internal identifier of a journal as defined in the column `internalId` of the table `Journal`.\n", 80 | "\n", 81 | "* For each multivalued attribute or relation in the diagram, create a two-column table, where the first column contains the internal identifier of an entity specified in the other tables and the second column contains one of the related values. In the example, only the attribute `id` of the class `Venue` is multivalued, and thus an additional table `VenueId` is created to link a `Venue` entity with one or more identifiers characterising it.\n", 82 | "\n", 83 | "A possible translation of the UML diagram above following the rules just mentioned is the following:\n", 84 | "\n", 85 | "![tables to represent the data model mentioned above](../04/tables.png)\n", 86 | "\n", 87 | "It is worth mentioning that this is not the only possible way to translate the original UML data model, and other paths can be followed in this respect." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "varied-virtue", 93 | "metadata": {}, 94 | "source": [ 95 | "## How to create and populate a table with Pandas\n", 96 | "\n", 97 | "Pandas makes available specific methods that simplify the creation and population of database tables via `DataFrame`, and that also take care of running some database-related operations such as the `commit` shown above. The main tool to use to push a table into a SQLite database is the [method `to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html) of the class `DataFrame`. However, before seeing how to use it to populate a database, let us reorganise the original data provided in CSV about publications and venues in a series of Pandas data frames recalling the tabular structures introduced in the previous section." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "brilliant-aurora", 103 | "metadata": {}, 104 | "source": [ 105 | "### Venue-Id table\n", 106 | "\n", 107 | "Let us start by creating the `VenueId` table. This can be done using one of the columns of the [CSV document describing venues](../01/01-venues.csv), i.e. the column `id`. Thus, we create a new sub-data frame containing only that column and we add an additional column defining the strings referring to the internal identifiers of each venue. The following code shows how to perform all these operations:" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 2, 113 | "id": "hired-anaheim", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/html": [ 119 | "
" 165 | ], 166 | "text/plain": [ 167 | " venueId id\n", 168 | "0 venue-0 1531-6912\n", 169 | "1 venue-1 1367-5931\n", 170 | "2 venue-2 9780470291092\n", 171 | "3 venue-3 1027-3662" 172 | ] 173 | }, 174 | "execution_count": 2, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "from pandas import read_csv, Series\n", 181 | "\n", 182 | "venues = read_csv(\"../01/01-venues.csv\", \n", 183 | " keep_default_na=False,\n", 184 | " dtype={\n", 185 | " \"id\": \"string\",\n", 186 | " \"name\": \"string\",\n", 187 | " \"type\": \"string\"\n", 188 | " })\n", 189 | "\n", 190 | "# This will create a new data frame starting from 'venues' one,\n", 191 | "# and it will include only the column \"id\"\n", 192 | "venues_ids = venues[[\"id\"]]\n", 193 | "\n", 194 | "# Generate a list of internal identifiers for the venues\n", 195 | "venue_internal_id = []\n", 196 | "for idx, row in venues_ids.iterrows():\n", 197 | " venue_internal_id.append(\"venue-\" + str(idx))\n", 198 | "\n", 199 | "# Add the list of venues internal identifiers as a new column\n", 200 | "# of the data frame via the class 'Series'\n", 201 | "venues_ids.insert(0, \"venueId\", Series(venue_internal_id, dtype=\"string\"))\n", 202 | "\n", 203 | "# Show the new data frame on screen\n", 204 | "venues_ids" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "acquired-finish", 210 | "metadata": {}, 211 | "source": [ 212 | "In practice, after reading the CSV, we create a new data frame with only the column `id` by using the command `[[, , ...]]`. In practice, this command will create a new sub-data frame using only the values in the columns specified.\n", 213 | "\n", 214 | "Then, we have to define the internal identifiers for all the venues. To this end, we iterate over the rows of the new data frame and we compose a list of internal identifiers by concatenating the string `\"venue-\"` with the string of the values specified in the index of each row. Thus, we add that list (mediated via a `Series` of string values) into the data frame using the [method `insert`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.insert.html), which takes in input the position where to put the new column (`0` is the first position, `1` is the second position, etc.), the column name, and the values to associate to that column." 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "moderate-edmonton", 220 | "metadata": {}, 221 | "source": [ 222 | "### Tables for journals and books\n", 223 | "\n", 224 | "With this new table, we can start to create the two additional tables for journals and books. First of all, we create two new data frames containing only entities (i.e. rows) of the same type (e.g. journals) by using the method `query`:" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 3, 230 | "id": "dress-chapter", 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/html": [ 236 | "
" 281 | ], 282 | "text/plain": [ 283 | " id name type\n", 284 | "0 1531-6912 Comparative and Functional Genomics journal\n", 285 | "1 1367-5931 Current Opinion in Chemical Biology journal\n", 286 | "3 1027-3662 Journal of Theoretical Medicine journal" 287 | ] 288 | }, 289 | "execution_count": 3, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "# Data frame of journals\n", 296 | "journals = venues.query(\"type == 'journal'\")\n", 297 | "journals # Showing the data frame" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "impossible-persian", 303 | "metadata": {}, 304 | "source": [ 305 | "Then, for each row in the new data frame, we retrieve the associated `internalId` of each journal by looking at the table `venueId` created above. In this case, we can use the method `merge` to accomplish the task:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 4, 311 | "id": "skilled-excellence", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/html": [ 317 | "
" 366 | ], 367 | "text/plain": [ 368 | " id name type venueId\n", 369 | "0 1531-6912 Comparative and Functional Genomics journal venue-0\n", 370 | "1 1367-5931 Current Opinion in Chemical Biology journal venue-1\n", 371 | "2 1027-3662 Journal of Theoretical Medicine journal venue-3" 372 | ] 373 | }, 374 | "execution_count": 4, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "from pandas import merge\n", 381 | "\n", 382 | "df_joined = merge(journals, venues_ids, left_on=\"id\", right_on=\"id\")\n", 383 | "df_joined" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "id": "interracial-soccer", 389 | "metadata": {}, 390 | "source": [ 391 | "Finally, the final journal table can be defined by selecting only two columns of the last merged data frame and by modifying the column name `venueId` in `internalId`, as shown as follows:" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 5, 397 | "id": "starting-anchor", 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/html": [ 403 | "
" 444 | ], 445 | "text/plain": [ 446 | " internalId name\n", 447 | "0 venue-0 Comparative and Functional Genomics\n", 448 | "1 venue-1 Current Opinion in Chemical Biology\n", 449 | "2 venue-3 Journal of Theoretical Medicine" 450 | ] 451 | }, 452 | "execution_count": 5, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "journals = df_joined[[\"venueId\", \"name\"]]\n", 459 | "journals = journals.rename(columns={\"venueId\": \"internalId\"})\n", 460 | "journals" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "id": "logical-advocate", 466 | "metadata": {}, 467 | "source": [ 468 | "The rename of a column is performed with the [method `rename`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html), that takes in input a dictonary with the named paramenter `columns`, where each key represent the old column name while the value is the new column name. The method returns a new data frame where the columns are renamed as specified." 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "local-affair", 474 | "metadata": {}, 475 | "source": [ 476 | "A similar organisation can be provided also for books, by using the code specified above, customising it for handling books:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 6, 482 | "id": "willing-springfield", 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/html": [ 488 | "
" 519 | ], 520 | "text/plain": [ 521 | " internalId name\n", 522 | "0 venue-2 Proceedings of the 5th Annual Conference on Co..." 523 | ] 524 | }, 525 | "execution_count": 6, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "# Data frame of books\n", 532 | "books = venues.query(\"type == 'book'\")\n", 533 | "df_joined = merge(books, venues_ids, left_on=\"id\", right_on=\"id\")\n", 534 | "books = df_joined[[\"venueId\", \"name\"]]\n", 535 | "books = books.rename(columns={\"venueId\": \"internalId\"})\n", 536 | "books" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "id": "infinite-planet", 542 | "metadata": {}, 543 | "source": [ 544 | "### Tables for publications\n", 545 | "\n", 546 | "Similarly, all the other tables (i.e. `JournalArticles` and `BookChapters`) can be created using the same set of operations, but starting from the [CSV document containing publications](\"../p1/01-publications.csv\"). First, we create a new column with all the internal identifiers for all publications, as shown as follows:" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 7, 552 | "id": "essential-japanese", 553 | "metadata": {}, 554 | "outputs": [ 555 | { 556 | "data": { 557 | "text/html": [ 558 | "
\n", 559 | "\n", 572 | "\n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | "
internalIddoititlepublication yearpublication venuetypeissuevolume
0publication-010.1002/cfg.304Development of Computational Tools for the Inf...20031531-6912journal article44
1publication-110.1016/s1367-5931(02)00332-0In vitro selection as a powerful tool for the ...20021367-5931journal article36
2publication-210.1002/9780470291092.ch20Mechanisms of Toughening in Ceramic Matrix Com...19819780470291092book chapter
\n", 622 | "
" 623 | ], 624 | "text/plain": [ 625 | " internalId doi \\\n", 626 | "0 publication-0 10.1002/cfg.304 \n", 627 | "1 publication-1 10.1016/s1367-5931(02)00332-0 \n", 628 | "2 publication-2 10.1002/9780470291092.ch20 \n", 629 | "\n", 630 | " title publication year \\\n", 631 | "0 Development of Computational Tools for the Inf... 2003 \n", 632 | "1 In vitro selection as a powerful tool for the ... 2002 \n", 633 | "2 Mechanisms of Toughening in Ceramic Matrix Com... 1981 \n", 634 | "\n", 635 | " publication venue type issue volume \n", 636 | "0 1531-6912 journal article 4 4 \n", 637 | "1 1367-5931 journal article 3 6 \n", 638 | "2 9780470291092 book chapter " 639 | ] 640 | }, 641 | "execution_count": 7, 642 | "metadata": {}, 643 | "output_type": "execute_result" 644 | } 645 | ], 646 | "source": [ 647 | "publications = read_csv(\"../01/01-publications.csv\", \n", 648 | " keep_default_na=False,\n", 649 | " dtype={\n", 650 | " \"doi\": \"string\",\n", 651 | " \"title\": \"string\",\n", 652 | " \"publication year\": \"int\",\n", 653 | " \"publication venue\": \"string\",\n", 654 | " \"type\": \"string\",\n", 655 | " \"issue\": \"string\",\n", 656 | " \"volume\": \"string\"\n", 657 | " })\n", 658 | "\n", 659 | "# Create a new column with internal identifiers for each publication\n", 660 | "publication_internal_id = []\n", 661 | "for idx, row in publications.iterrows():\n", 662 | " publication_internal_id.append(\"publication-\" + str(idx))\n", 663 | "publications.insert(0, \"internalId\", Series(publication_internal_id, dtype=\"string\"))\n", 664 | "publications" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "id": "executed-lottery", 670 | "metadata": {}, 671 | "source": [ 672 | "Then, we create the table for journal articles similarly to what we have done before:" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 8, 678 | "id": "herbal-manchester", 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/html": [ 684 | "
\n", 685 | "\n", 698 | "\n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44venue-0
1publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36venue-1
\n", 734 | "
" 735 | ], 736 | "text/plain": [ 737 | " internalId doi publicationYear \\\n", 738 | "0 publication-0 10.1002/cfg.304 2003 \n", 739 | "1 publication-1 10.1016/s1367-5931(02)00332-0 2002 \n", 740 | "\n", 741 | " title issue volume \\\n", 742 | "0 Development of Computational Tools for the Inf... 4 4 \n", 743 | "1 In vitro selection as a powerful tool for the ... 3 6 \n", 744 | "\n", 745 | " publicationVenue \n", 746 | "0 venue-0 \n", 747 | "1 venue-1 " 748 | ] 749 | }, 750 | "execution_count": 8, 751 | "metadata": {}, 752 | "output_type": "execute_result" 753 | } 754 | ], 755 | "source": [ 756 | "# Data frame of journal articles\n", 757 | "journal_articles = publications.query(\"type == 'journal article'\")\n", 758 | "df_joined = merge(journal_articles, venues_ids, left_on=\"publication venue\", right_on=\"id\")\n", 759 | "journal_articles = df_joined[\n", 760 | " [\"internalId\", \"doi\", \"publication year\", \"title\", \"issue\", \"volume\", \"venueId\"]]\n", 761 | "journal_articles = journal_articles.rename(columns={\n", 762 | " \"publication year\": \"publicationYear\",\n", 763 | " \"venueId\": \"publicationVenue\"})\n", 764 | "journal_articles" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "id": "chubby-marsh", 770 | "metadata": {}, 771 | "source": [ 772 | "Similarly, we create the table for book chapters:" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 9, 778 | "id": "adequate-rachel", 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "data": { 783 | "text/html": [ 784 | "
\n", 785 | "\n", 798 | "\n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | "
internalIddoipublicationYeartitlepublicationVenue
0publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...venue-2
\n", 820 | "
" 821 | ], 822 | "text/plain": [ 823 | " internalId doi publicationYear \\\n", 824 | "0 publication-2 10.1002/9780470291092.ch20 1981 \n", 825 | "\n", 826 | " title publicationVenue \n", 827 | "0 Mechanisms of Toughening in Ceramic Matrix Com... venue-2 " 828 | ] 829 | }, 830 | "execution_count": 9, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "# Data frame of book chapters\n", 837 | "book_chapters = publications.query(\"type == 'book chapter'\")\n", 838 | "df_joined = merge(book_chapters, venues_ids, left_on=\"publication venue\", right_on=\"id\")\n", 839 | "book_chapters = df_joined[\n", 840 | " [\"internalId\", \"doi\", \"publication year\", \"title\", \"venueId\"]]\n", 841 | "book_chapters = book_chapters.rename(columns={\n", 842 | " \"publication year\": \"publicationYear\",\n", 843 | " \"venueId\": \"publicationVenue\"})\n", 844 | "book_chapters" 845 | ] 846 | }, 847 | { 848 | "cell_type": "markdown", 849 | "id": "square-special", 850 | "metadata": {}, 851 | "source": [ 852 | "### Adding the tables to the database\n", 853 | "\n", 854 | "As anticipated before, adding a table to a database is done, in Pandas, using the `DataFrame` method `to_sql`. This method takes in input two mandatory parameters (identifying the name of the table in the database and the database connection) plus a series of optional named parameters, among which the parameter `if_exists` that, when set to `\"replace\"`, replaces the values in an existing database table having the same name with the new data, and the parameter `index` that, when set to `False`, does not add the data frame index in the database. Thus, adding the five tables to the SQLite database created at the very beginning can be done running the following commands:" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": 10, 860 | "id": "suited-staff", 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "with connect(\"publications.db\") as con:\n", 865 | " venues_ids.to_sql(\"VenueId\", con, if_exists=\"replace\", index=False)\n", 866 | " journals.to_sql(\"Journal\", con, if_exists=\"replace\", index=False)\n", 867 | " books.to_sql(\"Book\", con, if_exists=\"replace\", index=False)\n", 868 | " journal_articles.to_sql(\"JournalArticle\", con, if_exists=\"replace\", index=False)\n", 869 | " book_chapters.to_sql(\"BookChapter\", con, if_exists=\"replace\", index=False)" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "id": "multiple-budapest", 876 | "metadata": {}, 877 | "outputs": [], 878 | "source": [] 879 | } 880 | ], 881 | "metadata": { 882 | "kernelspec": { 883 | "display_name": "Python 3", 884 | "language": "python", 885 | "name": "python3" 886 | }, 887 | "language_info": { 888 | "codemirror_mode": { 889 | "name": "ipython", 890 | "version": 3 891 | }, 892 | "file_extension": ".py", 893 | "mimetype": "text/x-python", 894 | "name": "python", 895 | "nbconvert_exporter": "python", 896 | "pygments_lexer": "ipython3", 897 | "version": "3.9.0" 898 | } 899 | }, 900 | "nbformat": 4, 901 | "nbformat_minor": 5 902 | } 903 | -------------------------------------------------------------------------------- /docs/handson/06/06-Interacting_with_databases_using_Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ruled-premium", 6 | "metadata": {}, 7 | "source": [ 8 | "# Interacting with databases using Pandas\n", 9 | 
"\n", 10 | "In this tutorial, we show how to use Pandas data frames to interact with SQL-based and graph-based databases." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "forty-training", 16 | "metadata": {}, 17 | "source": [ 18 | "## Data available in different sources\n", 19 | "\n", 20 | "Often, when you have to deal with and reuse existing data, the answer to a query can be possible only by combining data available in different databases. In addition, such databases can expose their data using different technologies (e.g. an SQLite database and an RDF triplestore). Thus, it is important to have a smooth method that allows one to take data from different sources, to expose these data according to a similar interface, and finally to make some additional operation on these data that, in principle, can be seen as coming from a unique abstract source.\n", 21 | "\n", 22 | "Pandas, thanks to its standard library and additional plugins developed for it, enables us to use it as a proxy model for getting and comparing data coming from different sources (and even different formats). A few tutorials ago, indeed, we have seen how to read data stored as CSV documents using Pandas. We can use similar functions to read a result of a query sent to a database as it is a source of information. In this tutorial, we see how to do it with SQLite and Blazegraph, i.e. the two databases used in the previous tutorials." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "laughing-lease", 28 | "metadata": {}, 29 | "source": [ 30 | "## Reading data from SQLite\n", 31 | "\n", 32 | "Pandas makes available the [method `read_sql`](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html) which enables us, among the other things, to query an SQL-based database using an SQL query and to expose the answer returned as a classic Pandas data frame. This function takes in input two mandatory parameters that are the SQL query to execute on the database and the connection to it, and returns a data frame built on the data and the parameter specified in the SQL query. For instance, the following code takes the title of all the journal articles included in the table `JournalArticle`:" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "id": "steady-beginning", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | "
title
0Development of Computational Tools for the Inf...
1In vitro selection as a powerful tool for the ...
\n", 76 | "
" 77 | ], 78 | "text/plain": [ 79 | " title\n", 80 | "0 Development of Computational Tools for the Inf...\n", 81 | "1 In vitro selection as a powerful tool for the ..." 82 | ] 83 | }, 84 | "execution_count": 1, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "from sqlite3 import connect\n", 91 | "from pandas import read_sql\n", 92 | "\n", 93 | "with connect(\"../04/publications.db\") as con:\n", 94 | " query = \"SELECT title FROM JournalArticle\"\n", 95 | " df_sql = read_sql(query, con)\n", 96 | " \n", 97 | "df_sql # show the content of the result of the query" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "right-ranch", 103 | "metadata": {}, 104 | "source": [ 105 | "It is worth mentioning that, to enable the correct definition of the results of the query into a data frame, it is always better first to create all the necessary data frames within the `with` clause, and then start to work on them \"offline\", once the connection to the database has been closed. Otherwise, you could observe some unexpected behaviours.\n", 106 | "\n", 107 | "Finally, it is worth mentioning that the data type used in the database are converted into the appropriate data type in Pandas. Thus, if a column has been defined as containing integers in the database, we get back the same data type for the column in the data frame. This is clear when we try to retrieve, for instance, an entire table from the SQLite database:" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 3, 113 | "id": "focused-sailing", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "0 2003\n", 120 | "1 2002\n", 121 | "Name: publicationYear, dtype: int64" 122 | ] 123 | }, 124 | "execution_count": 3, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "with connect(\"../04/publications.db\") as con:\n", 131 | " query = \"SELECT * FROM JournalArticle\"\n", 132 | " df_journal_article_sql = read_sql(query, con)\n", 133 | "\n", 134 | "# Show the series of the column 'publicationYear', which as 'dtype'\n", 135 | "# specifies 'int64', as expected\n", 136 | "df_journal_article_sql[\"publicationYear\"]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "rational-finder", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "source": [ 146 | "## Reading data from Blazegraph\n", 147 | "\n", 148 | "Even if Pandas does not make available any reading method to interact with RDF triplestores, some developers has implemented a facility that permits us to interact directly with a SPARQL endpoint provided by an RDF triplestore such as Blazegraph, i.e. the [library `sparql_dataframe`](https://github.com/lawlesst/sparql-dataframe). This library is a wrapper for a SPARQL query and shows the answer to such a query as a Pandas data frame. We can install the library using the usual command:\n", 149 | "\n", 150 | "```\n", 151 | "pip install sparql_dataframe\n", 152 | "```\n", 153 | "\n", 154 | "The function `get` is called to perform such an operation, and it takes in input three parameters: the URL of the SPARQL endpoint to contact, the query to execute, and a boolean specifying if to contact the SPARQL endpoint using the [POST HTTP method](https://en.wikipedia.org/wiki/POST_(HTTP)) (strongly suggested, otherwise it could not work correctly). 
An example of execution of such a function is shown in the following excerpt:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 4, 160 | "id": "colored-rings", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/html": [ 166 | "
\n", 167 | "\n", 180 | "\n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
journal_articletitle
0https://comp-data.github.io/res/publication-0Development of Computational Tools for the Inf...
1https://comp-data.github.io/res/publication-1In vitro selection as a powerful tool for the ...
\n", 201 | "
" 202 | ], 203 | "text/plain": [ 204 | " journal_article \\\n", 205 | "0 https://comp-data.github.io/res/publication-0 \n", 206 | "1 https://comp-data.github.io/res/publication-1 \n", 207 | "\n", 208 | " title \n", 209 | "0 Development of Computational Tools for the Inf... \n", 210 | "1 In vitro selection as a powerful tool for the ... " 211 | ] 212 | }, 213 | "execution_count": 4, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "from sparql_dataframe import get\n", 220 | "\n", 221 | "endpoint = \"http://127.0.0.1:9999/blazegraph/sparql\"\n", 222 | "query = \"\"\"\n", 223 | "PREFIX rdf: \n", 224 | "PREFIX schema: \n", 225 | "\n", 226 | "SELECT ?journal_article ?title\n", 227 | "WHERE {\n", 228 | " ?journal_article rdf:type schema:ScholarlyArticle .\n", 229 | " ?journal_article schema:name ?title .\n", 230 | "}\n", 231 | "\"\"\"\n", 232 | "df_sparql = get(endpoint, query, True)\n", 233 | "df_sparql" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "d21f2e46-a3b7-425e-8fcb-d0e25369f7ce", 239 | "metadata": {}, 240 | "source": [ 241 | "Due to the implementation of the `get` function in the `sparql_dataframe` package, though, the values returned by running the SPARQL query will be inferred automatically by looking at all the values of a certain column. Thus, if one wants to change the data type of the values associated to a particular column, one has to cast the column on purpose and the reassigning the column to the data frame. For instance, let us build a query that takes information of all the publications available in the triplestore:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 7, 247 | "id": "68a57bdc-d1fe-4878-a3f2-e9a46f223a69", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 267 | "\n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0https://comp-data.github.io/res/publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...4.04.0https://comp-data.github.io/res/venue-0
1https://comp-data.github.io/res/publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...3.06.0https://comp-data.github.io/res/venue-1
2https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...NaNNaNhttps://comp-data.github.io/res/venue-2
\n", 313 | "
" 314 | ], 315 | "text/plain": [ 316 | " internalId \\\n", 317 | "0 https://comp-data.github.io/res/publication-0 \n", 318 | "1 https://comp-data.github.io/res/publication-1 \n", 319 | "2 https://comp-data.github.io/res/publication-2 \n", 320 | "\n", 321 | " doi publicationYear \\\n", 322 | "0 10.1002/cfg.304 2003 \n", 323 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 324 | "2 10.1002/9780470291092.ch20 1981 \n", 325 | "\n", 326 | " title issue volume \\\n", 327 | "0 Development of Computational Tools for the Inf... 4.0 4.0 \n", 328 | "1 In vitro selection as a powerful tool for the ... 3.0 6.0 \n", 329 | "2 Mechanisms of Toughening in Ceramic Matrix Com... NaN NaN \n", 330 | "\n", 331 | " publicationVenue \n", 332 | "0 https://comp-data.github.io/res/venue-0 \n", 333 | "1 https://comp-data.github.io/res/venue-1 \n", 334 | "2 https://comp-data.github.io/res/venue-2 " 335 | ] 336 | }, 337 | "execution_count": 7, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "publication_query = \"\"\"\n", 344 | "PREFIX rdf: \n", 345 | "PREFIX schema: \n", 346 | "\n", 347 | "SELECT ?internalId ?doi ?publicationYear ?title ?issue ?volume ?publicationVenue\n", 348 | "WHERE {\n", 349 | " VALUES ?type {\n", 350 | " schema:ScholarlyArticle\n", 351 | " schema:Chapter\n", 352 | " }\n", 353 | " \n", 354 | " ?internalId rdf:type ?type .\n", 355 | " ?internalId schema:identifier ?doi .\n", 356 | " ?internalId schema:datePublished ?publicationYear .\n", 357 | " ?internalId schema:name ?title .\n", 358 | " ?internalId schema:isPartOf ?publicationVenue .\n", 359 | " \n", 360 | " OPTIONAL {\n", 361 | " ?internalId schema:issueNumber ?issue .\n", 362 | " ?internalId schema:volumeNumber ?volume .\n", 363 | " }\n", 364 | "}\n", 365 | "\"\"\"\n", 366 | "\n", 367 | "df_publications_sparql = get(endpoint, publication_query, True)\n", 368 | "df_publications_sparql" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "724cf7f4-38b0-4b75-8a0f-16c5f9e95bac", 374 | "metadata": {}, 375 | "source": [ 376 | "It is worth mentioning that the optional group in the SPARQL query (`OPTIONAL { ... }`) is used to allow information to be added to the solution if it is available, otherwise the related variables will be left empty." 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "id": "f8a633f2-645a-4250-ae6a-df4b47684475", 382 | "metadata": {}, 383 | "source": [ 384 | "## Fixing some issues\n", 385 | "\n", 386 | "As you can observed from the result of the previous query, the data frame created contains some basic information depicted by the variable names chosen, that are specified in the query itself for being equal to those returned in the last SQL query done above. \n", 387 | "\n", 388 | "However, one unexpected behaviour is the way the columns `issue` and `volume` is handled. 
To see this, we use the [attribute `dtypes`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dtypes.html) of our data frame to see how things are handled:" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 8, 394 | "id": "8ad5aa98-74c5-4553-bd74-6ecef86e80a7", 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "internalId object\n", 401 | "doi object\n", 402 | "publicationYear int64\n", 403 | "title object\n", 404 | "issue float64\n", 405 | "volume float64\n", 406 | "publicationVenue object\n", 407 | "dtype: object" 408 | ] 409 | }, 410 | "execution_count": 8, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "df_publications_sparql.dtypes" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "05fdf934-3a22-4e9d-a606-d578bcafa294", 422 | "metadata": {}, 423 | "source": [ 424 | "As you can see, the two columns mentioned above have been assigned with a float data type, which has been inferred by Pandas by looking at the values of these two columns. In order to change it into an appropriate kind of value, e.g. a string, we have to overwrite the data type of the entire data frame (using the [method `astype`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html) that takes in input the new data type) and/or the data type of specific columns. For doing the last operation, we have to reassign the columns with the new types to the data frame using the following syntax:\n", 425 | "\n", 426 | "```\n", 427 | "[] = [].astype()\n", 428 | "```\n", 429 | "\n", 430 | "For instance, to reassign the columns `issue` and `volume` to the type `\"string\"`, we can run the following commands:" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 9, 436 | "id": "3fb5b55e-8e3e-49ea-9659-9b10d6957239", 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "internalId object\n", 443 | "doi object\n", 444 | "publicationYear int64\n", 445 | "title object\n", 446 | "issue string\n", 447 | "volume string\n", 448 | "publicationVenue object\n", 449 | "dtype: object" 450 | ] 451 | }, 452 | "execution_count": 9, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "df_publications_sparql[\"issue\"] = df_publications_sparql[\"issue\"].astype(\"string\")\n", 459 | "df_publications_sparql[\"volume\"] = df_publications_sparql[\"volume\"].astype(\"string\")\n", 460 | "\n", 461 | "df_publications_sparql.dtypes" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "c435b381-8303-4ed2-b41d-7e3321617a3b", 467 | "metadata": {}, 468 | "source": [ 469 | "Similarly, if you want to replace the `NaN` values associated to the same two columns when no value is available, you can use the data frame [method `fillna`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html), which enables one to replace all `NaN` in the data frame with a value of your choice passed as input:" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 10, 475 | "id": "7c1c7490-5425-46d7-86a6-77fa707ebe57", 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/html": [ 481 | "
\n", 482 | "\n", 495 | "\n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0https://comp-data.github.io/res/publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...4.04.0https://comp-data.github.io/res/venue-0
1https://comp-data.github.io/res/publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...3.06.0https://comp-data.github.io/res/venue-1
2https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...https://comp-data.github.io/res/venue-2
\n", 541 | "
" 542 | ], 543 | "text/plain": [ 544 | " internalId \\\n", 545 | "0 https://comp-data.github.io/res/publication-0 \n", 546 | "1 https://comp-data.github.io/res/publication-1 \n", 547 | "2 https://comp-data.github.io/res/publication-2 \n", 548 | "\n", 549 | " doi publicationYear \\\n", 550 | "0 10.1002/cfg.304 2003 \n", 551 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 552 | "2 10.1002/9780470291092.ch20 1981 \n", 553 | "\n", 554 | " title issue volume \\\n", 555 | "0 Development of Computational Tools for the Inf... 4.0 4.0 \n", 556 | "1 In vitro selection as a powerful tool for the ... 3.0 6.0 \n", 557 | "2 Mechanisms of Toughening in Ceramic Matrix Com... \n", 558 | "\n", 559 | " publicationVenue \n", 560 | "0 https://comp-data.github.io/res/venue-0 \n", 561 | "1 https://comp-data.github.io/res/venue-1 \n", 562 | "2 https://comp-data.github.io/res/venue-2 " 563 | ] 564 | }, 565 | "execution_count": 10, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "df_publications_sparql = df_publications_sparql.fillna(\"\")\n", 572 | "\n", 573 | "df_publications_sparql" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "id": "c77f7dc9-b0b3-47c8-9969-be81b79df97c", 579 | "metadata": {}, 580 | "source": [ 581 | "Of course, this allowed us to remove all `NaN` values. However, if you look at the table and in particular to the columns `issue` and `volume`, you can see something that is still a bit in these two columns. \n", 582 | "\n", 583 | "Indeed, the two strings defining issues and volumes associated with an article are, actually, the mere cast of the floating value into a string and, as such, they contain the `.0` part of the float that we need to remove. Since the same pattern is repeated in all the values of these two columns, we could apply a similar operation to all their values to clean them up. For doing that, we use the [method `apply`](https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html) of the class `Series`, which allows us to apply an input function to all the values of a column and to store, in each value, what such a function returns.\n", 584 | "\n", 585 | "A function that would allow us perform such an operation is the following one:" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 11, 591 | "id": "f2866072-8522-4fd3-abd6-d3d4c0e96452", 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "def remove_dotzero(s):\n", 596 | " return s.replace(\".0\", \"\")" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "id": "bdae741f-ff5f-44b2-b362-1094da1c070a", 602 | "metadata": {}, 603 | "source": [ 604 | "The function above takes in input a string (i.e. the value of a cell) and remove the string `\".0\"` from there, if present. Thus, passing this function to the method `apply` of each column and then to assign the modified column back to the data frame will fix the issue, as shown as follows:" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 12, 610 | "id": "85d785d5-ffb8-4a97-8e9b-d7a51f01e5bb", 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "data": { 615 | "text/html": [ 616 | "
\n", 617 | "\n", 630 | "\n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0https://comp-data.github.io/res/publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44https://comp-data.github.io/res/venue-0
1https://comp-data.github.io/res/publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36https://comp-data.github.io/res/venue-1
2https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...https://comp-data.github.io/res/venue-2
\n", 676 | "
" 677 | ], 678 | "text/plain": [ 679 | " internalId \\\n", 680 | "0 https://comp-data.github.io/res/publication-0 \n", 681 | "1 https://comp-data.github.io/res/publication-1 \n", 682 | "2 https://comp-data.github.io/res/publication-2 \n", 683 | "\n", 684 | " doi publicationYear \\\n", 685 | "0 10.1002/cfg.304 2003 \n", 686 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 687 | "2 10.1002/9780470291092.ch20 1981 \n", 688 | "\n", 689 | " title issue volume \\\n", 690 | "0 Development of Computational Tools for the Inf... 4 4 \n", 691 | "1 In vitro selection as a powerful tool for the ... 3 6 \n", 692 | "2 Mechanisms of Toughening in Ceramic Matrix Com... \n", 693 | "\n", 694 | " publicationVenue \n", 695 | "0 https://comp-data.github.io/res/venue-0 \n", 696 | "1 https://comp-data.github.io/res/venue-1 \n", 697 | "2 https://comp-data.github.io/res/venue-2 " 698 | ] 699 | }, 700 | "execution_count": 12, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "df_publications_sparql[\"issue\"] = df_publications_sparql[\"issue\"].apply(remove_dotzero)\n", 707 | "df_publications_sparql[\"volume\"] = df_publications_sparql[\"volume\"].apply(remove_dotzero)\n", 708 | "\n", 709 | "df_publications_sparql" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "id": "b4a0f155-da4c-46f0-b37c-14933a96287a", 715 | "metadata": {}, 716 | "source": [ 717 | "## Combining data\n", 718 | "\n", 719 | "In the previous section, we have introced how to obtain data from existing databases and how to manipulate them using Pandas. However, in real case scenarios, an answer to a certain query can arrive only from mixing partial data from two distinct databases. Thus, it is important to implement some mechanisms to mash data up together, clean them if needed (e.g. removing duplicates), and to return them in a certain order (e.g. alphabetically). Of course, Pandas can be used to perform all these operations.\n", 720 | "\n", 721 | "Suppose that we want to find, by querying all the databases, all the titles and year of publication of all publications they contain (independently from their type), ordered from the oldest one to the newest one. To simplify the job for this tutorial, we could consider the two data frames computed before, i.e. `df_journal_article_sql` and `df_publications_sparql`, as the two coming from two different databases.\n", 722 | "\n", 723 | "First of all, we need something that allows us to concat two or more data frames together. However, in order to do that, it is important that, first of all, all the data frames to contact share the same columns. Thus, if necessary, it is important to rename the columns as we have seen in a previous tutorial. In this case, instead, we have already created the data frames with the same column names and, as such, we can proceed with the concat operation, i.e. obtaining a new data frame by concatenating the rows contained in both the data frames.\n", 724 | "\n", 725 | "This operation is implemented by the [function `concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html), that takes in input a list of data frames and return a new data frame with all the rows concatenated. 
In addition, it can also take in input the named parameter `ignore_index` that, if set to `True`, will reindex all the rows from the beginning in the new data frame, as shown in the following code:" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 20, 731 | "id": "d4c99f9a-40d4-47a3-8117-6fe29a121997", 732 | "metadata": {}, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/html": [ 737 | "
\n", 738 | "\n", 751 | "\n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44venue-0
1publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36venue-1
2https://comp-data.github.io/res/publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44https://comp-data.github.io/res/venue-0
3https://comp-data.github.io/res/publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36https://comp-data.github.io/res/venue-1
4https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...https://comp-data.github.io/res/venue-2
\n", 817 | "
" 818 | ], 819 | "text/plain": [ 820 | " internalId \\\n", 821 | "0 publication-0 \n", 822 | "1 publication-1 \n", 823 | "2 https://comp-data.github.io/res/publication-0 \n", 824 | "3 https://comp-data.github.io/res/publication-1 \n", 825 | "4 https://comp-data.github.io/res/publication-2 \n", 826 | "\n", 827 | " doi publicationYear \\\n", 828 | "0 10.1002/cfg.304 2003 \n", 829 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 830 | "2 10.1002/cfg.304 2003 \n", 831 | "3 10.1016/s1367-5931(02)00332-0 2002 \n", 832 | "4 10.1002/9780470291092.ch20 1981 \n", 833 | "\n", 834 | " title issue volume \\\n", 835 | "0 Development of Computational Tools for the Inf... 4 4 \n", 836 | "1 In vitro selection as a powerful tool for the ... 3 6 \n", 837 | "2 Development of Computational Tools for the Inf... 4 4 \n", 838 | "3 In vitro selection as a powerful tool for the ... 3 6 \n", 839 | "4 Mechanisms of Toughening in Ceramic Matrix Com... \n", 840 | "\n", 841 | " publicationVenue \n", 842 | "0 venue-0 \n", 843 | "1 venue-1 \n", 844 | "2 https://comp-data.github.io/res/venue-0 \n", 845 | "3 https://comp-data.github.io/res/venue-1 \n", 846 | "4 https://comp-data.github.io/res/venue-2 " 847 | ] 848 | }, 849 | "execution_count": 20, 850 | "metadata": {}, 851 | "output_type": "execute_result" 852 | } 853 | ], 854 | "source": [ 855 | "from pandas import concat\n", 856 | "\n", 857 | "df_union = concat([df_journal_article_sql, df_publications_sparql], ignore_index=True)\n", 858 | "df_union" 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "id": "2c1f0100-d150-4dc5-9d14-d0aeea943131", 864 | "metadata": {}, 865 | "source": [ 866 | "After having obtained a new data frame concatenating the other two, we need to filter out duplicates. Once can follow different approaches for doing so. In this context, we will use the DOIs of the publications to perform the filtering. \n", 867 | "\n", 868 | "A [DOI (Digital Object Identifier)](https://en.wikipedia.org/wiki/Digital_object_identifier) is a persistent identifier used to identify publications uniquely worldwide. Thus, if a publication is included in two distinct databases, it should have the same DOI despite the local identifiers the databases may use.\n", 869 | "\n", 870 | "Once this aspect is clear, we can perform a removal of rows using the [method `drop_duplicates`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html) of the class `DataFrame`. This method allows one to specify the optional named parameter `subset` with the list of columns names to use to identify similar rows. If such a named parameter is not specified, only identical rows (those having all the values in full match) are removed from data frame. Thus, we can perform the removal of duplicates as follows:" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 21, 876 | "id": "699a8aca-a807-4f02-8b7d-3ed251fb3ca8", 877 | "metadata": {}, 878 | "outputs": [ 879 | { 880 | "data": { 881 | "text/html": [ 882 | "
\n", 883 | "\n", 896 | "\n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
0publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44venue-0
1publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36venue-1
4https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...https://comp-data.github.io/res/venue-2
\n", 942 | "
" 943 | ], 944 | "text/plain": [ 945 | " internalId \\\n", 946 | "0 publication-0 \n", 947 | "1 publication-1 \n", 948 | "4 https://comp-data.github.io/res/publication-2 \n", 949 | "\n", 950 | " doi publicationYear \\\n", 951 | "0 10.1002/cfg.304 2003 \n", 952 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 953 | "4 10.1002/9780470291092.ch20 1981 \n", 954 | "\n", 955 | " title issue volume \\\n", 956 | "0 Development of Computational Tools for the Inf... 4 4 \n", 957 | "1 In vitro selection as a powerful tool for the ... 3 6 \n", 958 | "4 Mechanisms of Toughening in Ceramic Matrix Com... \n", 959 | "\n", 960 | " publicationVenue \n", 961 | "0 venue-0 \n", 962 | "1 venue-1 \n", 963 | "4 https://comp-data.github.io/res/venue-2 " 964 | ] 965 | }, 966 | "execution_count": 21, 967 | "metadata": {}, 968 | "output_type": "execute_result" 969 | } 970 | ], 971 | "source": [ 972 | "df_union_no_duplicates = df_union.drop_duplicates(subset=[\"doi\"])\n", 973 | "df_union_no_duplicates" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "id": "cdd10775-4660-483b-ab60-5a5e84eb476b", 979 | "metadata": {}, 980 | "source": [ 981 | "Then, we have finally to sort rows in ascending order considering the publication year, and then to return just the columns publication year and title and year of publication of each row. In Pandas, the sorting can be performed using the [method `sort_values`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html) of the class `DataFrame`, that takes in input the name of the column to use to perform the sorting, as shown as follows:" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 22, 987 | "id": "25b8d455-4a93-4966-abfc-94d70501e3d6", 988 | "metadata": {}, 989 | "outputs": [ 990 | { 991 | "data": { 992 | "text/html": [ 993 | "
\n", 994 | "\n", 1007 | "\n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | "
internalIddoipublicationYeartitleissuevolumepublicationVenue
4https://comp-data.github.io/res/publication-210.1002/9780470291092.ch201981Mechanisms of Toughening in Ceramic Matrix Com...https://comp-data.github.io/res/venue-2
1publication-110.1016/s1367-5931(02)00332-02002In vitro selection as a powerful tool for the ...36venue-1
0publication-010.1002/cfg.3042003Development of Computational Tools for the Inf...44venue-0
\n", 1053 | "
" 1054 | ], 1055 | "text/plain": [ 1056 | " internalId \\\n", 1057 | "4 https://comp-data.github.io/res/publication-2 \n", 1058 | "1 publication-1 \n", 1059 | "0 publication-0 \n", 1060 | "\n", 1061 | " doi publicationYear \\\n", 1062 | "4 10.1002/9780470291092.ch20 1981 \n", 1063 | "1 10.1016/s1367-5931(02)00332-0 2002 \n", 1064 | "0 10.1002/cfg.304 2003 \n", 1065 | "\n", 1066 | " title issue volume \\\n", 1067 | "4 Mechanisms of Toughening in Ceramic Matrix Com... \n", 1068 | "1 In vitro selection as a powerful tool for the ... 3 6 \n", 1069 | "0 Development of Computational Tools for the Inf... 4 4 \n", 1070 | "\n", 1071 | " publicationVenue \n", 1072 | "4 https://comp-data.github.io/res/venue-2 \n", 1073 | "1 venue-1 \n", 1074 | "0 venue-0 " 1075 | ] 1076 | }, 1077 | "execution_count": 22, 1078 | "metadata": {}, 1079 | "output_type": "execute_result" 1080 | } 1081 | ], 1082 | "source": [ 1083 | "df_union_no_duplicates_sorted = df_union_no_duplicates.sort_values(\"publicationYear\")\n", 1084 | "df_union_no_duplicates_sorted" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "markdown", 1089 | "id": "12d244dc-9d78-44bd-89f2-f913d5388c2c", 1090 | "metadata": {}, 1091 | "source": [ 1092 | "Finally, to select a sub-data frame, we use the approach adopted in past tutorial, by creating a new data frame selecting only some of the columns of another one:" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": 23, 1098 | "id": "94ee9442-c74c-4c3e-8700-84181c53dddb", 1099 | "metadata": {}, 1100 | "outputs": [ 1101 | { 1102 | "data": { 1103 | "text/html": [ 1104 | "
\n", 1105 | "\n", 1118 | "\n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | "
titlepublicationYear
4Mechanisms of Toughening in Ceramic Matrix Com...1981
1In vitro selection as a powerful tool for the ...2002
0Development of Computational Tools for the Inf...2003
\n", 1144 | "
" 1145 | ], 1146 | "text/plain": [ 1147 | " title publicationYear\n", 1148 | "4 Mechanisms of Toughening in Ceramic Matrix Com... 1981\n", 1149 | "1 In vitro selection as a powerful tool for the ... 2002\n", 1150 | "0 Development of Computational Tools for the Inf... 2003" 1151 | ] 1152 | }, 1153 | "execution_count": 23, 1154 | "metadata": {}, 1155 | "output_type": "execute_result" 1156 | } 1157 | ], 1158 | "source": [ 1159 | "df_final = df_union_no_duplicates_sorted[[\"title\", \"publicationYear\"]]\n", 1160 | "df_final" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": null, 1166 | "id": "f02dbcc3-e4f4-428b-a516-4da7de80f0d5", 1167 | "metadata": {}, 1168 | "outputs": [], 1169 | "source": [] 1170 | } 1171 | ], 1172 | "metadata": { 1173 | "kernelspec": { 1174 | "display_name": "Python 3", 1175 | "language": "python", 1176 | "name": "python3" 1177 | }, 1178 | "language_info": { 1179 | "codemirror_mode": { 1180 | "name": "ipython", 1181 | "version": 3 1182 | }, 1183 | "file_extension": ".py", 1184 | "mimetype": "text/x-python", 1185 | "name": "python", 1186 | "nbconvert_exporter": "python", 1187 | "pygments_lexer": "ipython3", 1188 | "version": "3.9.0" 1189 | } 1190 | }, 1191 | "nbformat": 4, 1192 | "nbformat_minor": 5 1193 | } 1194 | --------------------------------------------------------------------------------