├── crebas.sql
└── script_insert_github_data.ipynb


/crebas.sql:
--------------------------------------------------------------------------------
/*==============================================================*/
/* DBMS name:      Microsoft SQL Server 2012                    */
/* Created on:     19/07/2023 17:48:21                          */
/*==============================================================*/

/*
   Schema for the GitHub repository data set.

   The script is re-runnable: the database is created only when it is
   missing, and every foreign key and table is dropped (if present)
   before being created again. All existing data in these tables is lost
   on a re-run.

   Column names such as idContrubutor / contrubtor are misspelled but are
   referenced by the loader notebook, so they are kept as they are.
*/

if db_id('GITHUB_DATA') is null
   create database GITHUB_DATA
go

use GITHUB_DATA
go

/*==============================================================*/
/* Drop foreign keys first so tables can be dropped in any order */
/*==============================================================*/
if object_id('FK_CONTRIBU_ASSOCIATI_CONTRIBU', 'F') is not null
   alter table Contribution
      drop constraint FK_CONTRIBU_ASSOCIATI_CONTRIBU
go

if object_id('FK_CONTRIBU_ASSOCIATI_REPOSITO', 'F') is not null
   alter table Contribution
      drop constraint FK_CONTRIBU_ASSOCIATI_REPOSITO
go

if object_id('FK_REPO_TOP_ASSOCIATI_REPOSITO', 'F') is not null
   alter table Repo_Topic
      drop constraint FK_REPO_TOP_ASSOCIATI_REPOSITO
go

if object_id('FK_REPO_TOP_ASSOCIATI_TOPIC', 'F') is not null
   alter table Repo_Topic
      drop constraint FK_REPO_TOP_ASSOCIATI_TOPIC
go

if object_id('FK_REPOSITO_ASSOCIATI_LANGUAGE', 'F') is not null
   alter table Repository
      drop constraint FK_REPOSITO_ASSOCIATI_LANGUAGE
go

if object_id('FK_REPOSITO_ASSOCIATI_TYPEOWNE', 'F') is not null
   alter table Repository
      drop constraint FK_REPOSITO_ASSOCIATI_TYPEOWNE
go

/*==============================================================*/
/* Drop tables. Dropping a table also drops its indexes, so the  */
/* indexes need no separate drop statements.                     */
/*==============================================================*/
if object_id('Contribution', 'U') is not null
   drop table Contribution
go

if object_id('Contributor', 'U') is not null
   drop table Contributor
go

if object_id('Language', 'U') is not null
   drop table Language
go

if object_id('Repo_Topic', 'U') is not null
   drop table Repo_Topic
go

if object_id('Repository', 'U') is not null
   drop table Repository
go

if object_id('Topic', 'U') is not null
   drop table Topic
go

if object_id('TypeOwner', 'U') is not null
   drop table TypeOwner
go

/*==============================================================*/
/* Table: Contribution                                          */
/* Junction table: one row per (contributor, repository) pair.  */
/*==============================================================*/
create table Contribution (
   idContrubutor        int                  not null,
   idRepo               bigint               not null,
   number_Contributions int                  null,
   constraint PK_CONTRIBUTION primary key (idContrubutor, idRepo)
)
go

/*==============================================================*/
/* Index: CONTRIBUTION_FK                                       */
/* NOTE(review): PK_CONTRIBUTION already leads with             */
/* idContrubutor, so this index looks redundant - confirm       */
/* before removing.                                             */
/*==============================================================*/
create index CONTRIBUTION_FK on Contribution (
   idContrubutor ASC
)
go

/*==============================================================*/
/* Index: CONTRIBUTION_FK2                                      */
/*==============================================================*/
create index CONTRIBUTION_FK2 on Contribution (
   idRepo ASC
)
go

/*==============================================================*/
/* Table: Contributor                                           */
/* The primary key already enforces uniqueness of the id, so no */
/* additional unique constraint is declared on it.              */
/*==============================================================*/
create table Contributor (
   idContrubutor        int                  identity(1,1) not null,
   contrubtor           varchar(max)         null,
   constraint PK_CONTRIBUTOR primary key nonclustered (idContrubutor)
)
go

/*==============================================================*/
/* Table: Language                                              */
/*==============================================================*/
create table Language (
   idLanguage           int                  identity(1,1) not null,
   language             varchar(max)         null,
   constraint PK_LANGUAGE primary key nonclustered (idLanguage)
)
go

/*==============================================================*/
/* Table: Repo_Topic                                            */
/* Junction table: one row per (topic, repository) pair.        */
/*==============================================================*/
create table Repo_Topic (
   idTopic              int                  not null,
   idRepo               bigint               not null,
   constraint PK_REPO_TOPIC primary key (idTopic, idRepo)
)
go

/*==============================================================*/
/* Index: REPO_TOPIC_FK                                         */
/* NOTE(review): PK_REPO_TOPIC already leads with idTopic, so   */
/* this index looks redundant - confirm before removing.        */
/*==============================================================*/
create index REPO_TOPIC_FK on Repo_Topic (
   idTopic ASC
)
go

/*==============================================================*/
/* Index: REPO_TOPIC_FK2                                        */
/*==============================================================*/
create index REPO_TOPIC_FK2 on Repo_Topic (
   idRepo ASC
)
go

/*==============================================================*/
/* Table: Repository                                            */
/* idRepo is generated by the database (identity); it is NOT    */
/* the GitHub repository id. Long text columns use varchar(max) */
/* because the text data type is deprecated in SQL Server.      */
/*==============================================================*/
create table Repository (
   idRepo               bigint               identity(1,1) not null,
   idTypeOwner          int                  null,
   idLanguage           int                  not null,
   fullName             varchar(max)         null,
   createdAt            datetime             null,
   startCount           int                  null,
   description          varchar(max)         null,
   htmlUrl              varchar(max)         null,
   contributorsUrl      varchar(max)         null,
   openIssuesCount_     int                  null,
   forks                int                  null,
   constraint PK_REPOSITORY primary key nonclustered (idRepo)
)
go

/*==============================================================*/
/* Index: ASSOCIATION5_FK                                       */
/*==============================================================*/
create index ASSOCIATION5_FK on Repository (
   idLanguage ASC
)
go

/*==============================================================*/
/* Index: ASSOCIATION6_FK                                       */
/*==============================================================*/
create index ASSOCIATION6_FK on Repository (
   idTypeOwner ASC
)
go

/*==============================================================*/
/* Table: Topic                                                 */
/*==============================================================*/
create table Topic (
   idTopic              int                  identity(1,1) not null,
   topic                varchar(100)         null,
   constraint PK_TOPIC primary key nonclustered (idTopic)
)
go

/*==============================================================*/
/* Table: TypeOwner                                             */
/*==============================================================*/
create table TypeOwner (
   idTypeOwner          int                  identity(1,1) not null,
   typeOwner            varchar(100)         null,
   constraint PK_TYPEOWNER primary key nonclustered (idTypeOwner)
)
go

/*==============================================================*/
/* Foreign keys                                                 */
/*==============================================================*/
alter table Contribution
   add constraint FK_CONTRIBU_ASSOCIATI_CONTRIBU foreign key (idContrubutor)
      references Contributor (idContrubutor)
go

alter table Contribution
   add constraint FK_CONTRIBU_ASSOCIATI_REPOSITO foreign key (idRepo)
      references Repository (idRepo)
go

alter table Repo_Topic
   add constraint FK_REPO_TOP_ASSOCIATI_REPOSITO foreign key (idRepo)
      references Repository (idRepo)
go

alter table Repo_Topic
   add constraint FK_REPO_TOP_ASSOCIATI_TOPIC foreign key (idTopic)
      references Topic (idTopic)
go

alter table Repository
   add constraint FK_REPOSITO_ASSOCIATI_LANGUAGE foreign key (idLanguage)
      references Language (idLanguage)
go

alter table Repository
   add constraint FK_REPOSITO_ASSOCIATI_TYPEOWNE foreign key (idTypeOwner)
      references TypeOwner (idTypeOwner)
go

/* Ad-hoc inspection queries; the two junction tables are left disabled. */
/*
select * from Contribution
select * from Repo_Topic
*/
select * from Contributor
select * from TypeOwner
select * from Language
select * from Repository
select * from Topic

--------------------------------------------------------------------------------
/script_insert_github_data.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import pyodbc\n",
    "\n",
    "# df: one row per repository; df_con: one row per (contributor, repository) pair.\n",
    "# NOTE(review): column layout of both CSV files is assumed from how later cells use them - confirm.\n",
    "df = pd.read_csv(\"clean_data_github.csv\")\n",
    "df_con = pd.read_csv(\"updated_data.csv\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Language and TypeOwner tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw string so the backslash before the instance name is taken literally\n",
    "# instead of relying on the invalid escape sequence '\\S'.\n",
    "connection_string = r'Driver={SQL Server};Server=LAPTOP-5VUQJT8J\\SQLEXPRESS;Database=GITHUB_DATA;UID=;PWD=;'\n",
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "try:\n",
    "    # Language: one row per distinct language value\n",
    "    df_language_unique = df.drop_duplicates(subset=['language'])\n",
    "    for _, row in df_language_unique.iterrows():\n",
    "        cursor.execute(\"INSERT INTO Language (language) VALUES (?)\", (row['language'],))\n",
    "\n",
    "    # TypeOwner: one row per distinct owner type\n",
    "    df_type_owner_unique = df.drop_duplicates(subset=['type_owner'])\n",
    "    for _, row in df_type_owner_unique.iterrows():\n",
    "        cursor.execute(\"INSERT INTO TypeOwner (typeOwner) VALUES (?)\", (row['type_owner'],))\n",
    "\n",
    "    connection.commit()\n",
    "finally:\n",
    "    # Always release the connection, even when an insert fails.\n",
    "    cursor.close()\n",
    "    connection.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Repository table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "def _sql_value(value):\n",
    "    \"\"\"Return None for pandas missing values (NaN/NaT) so pyodbc binds SQL NULL.\"\"\"\n",
    "    return None if pd.isna(value) else value\n",
    "\n",
    "def get_id_type_owner(type_owner):\n",
    "    \"\"\"Return TypeOwner.idTypeOwner for this owner type, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idTypeOwner FROM TypeOwner WHERE typeOwner = ?\", type_owner)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "def get_id_language(language):\n",
    "    \"\"\"Return Language.idLanguage for this language, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idLanguage FROM Language WHERE language = ?\", language)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "# DataFrame columns that follow idTypeOwner and idLanguage in the INSERT below, in the same order.\n",
    "repository_source_columns = [\n",
    "    'full_name', 'created_at', 'stargazers_count', 'description',\n",
    "    'html_url', 'contributors_url', 'open_issues_count', 'forks',\n",
    "]\n",
    "\n",
    "insert_query = '''\n",
    "    INSERT INTO Repository (\n",
    "        idTypeOwner, idLanguage, fullName, createdAt, startCount, description, htmlUrl, contributorsUrl,\n",
    "        openIssuesCount_, forks\n",
    "    )\n",
    "    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''\n",
    "\n",
    "try:\n",
    "    # Iterate through the DataFrame rows and insert data into the \"Repository\" table\n",
    "    for _, row in df.iterrows():\n",
    "        # Get idTypeOwner and idLanguage from the corresponding tables\n",
    "        id_type_owner = get_id_type_owner(row['type_owner'])\n",
    "        id_language = get_id_language(row['language'])\n",
    "\n",
    "        # idRepo is an IDENTITY column, so the GitHub id in row['id'] is never stored in it.\n",
    "        # The duplicate check therefore uses fullName, the same key later cells look repositories up by.\n",
    "        cursor.execute(\"SELECT 1 FROM Repository WHERE fullName = ?\", row['full_name'])\n",
    "        if cursor.fetchone() is None:\n",
    "            values = (id_type_owner, id_language) + tuple(\n",
    "                _sql_value(row[column]) for column in repository_source_columns\n",
    "            )\n",
    "            cursor.execute(insert_query, values)\n",
    "\n",
    "    connection.commit()\n",
    "finally:\n",
    "    cursor.close()\n",
    "    connection.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Topic table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "\n",
    "# The CSV stores each topic list as its string representation. Only strings are parsed,\n",
    "# so re-running this cell on already-parsed lists does not raise.\n",
    "df['topics'] = df['topics'].apply(\n",
    "    lambda topics: ast.literal_eval(topics) if isinstance(topics, str) else topics\n",
    ")\n",
    "\n",
    "# explode() emits NaN for repositories with an empty topic list; those rows carry no topic.\n",
    "df_topics = df.explode('topics').dropna(subset=['topics'])\n",
    "df_topics_unique = df_topics.drop_duplicates(subset=['topics'])\n",
    "\n",
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "try:\n",
    "    for _, row in df_topics_unique.iterrows():\n",
    "        cursor.execute(\"INSERT INTO Topic (topic) VALUES (?)\", (row['topics'],))\n",
    "    connection.commit()\n",
    "finally:\n",
    "    cursor.close()\n",
    "    connection.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Repo_Topic table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "def get_id_topic(topic):\n",
    "    \"\"\"Return Topic.idTopic for this topic name, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idTopic FROM Topic WHERE topic = ?\", topic)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "def get_id_repo(fullName):\n",
    "    \"\"\"Return Repository.idRepo for this full name, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idRepo FROM Repository WHERE fullName = ?\", fullName)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "try:\n",
    "    connection.autocommit = False\n",
    "\n",
    "    # A repeated (topic, repository) pair would violate PK_REPO_TOPIC and roll back the whole batch.\n",
    "    df_repo_topic_pairs = df_topics.drop_duplicates(subset=['topics', 'full_name'])\n",
    "\n",
    "    for _, row in df_repo_topic_pairs.iterrows():\n",
    "        id_topic = get_id_topic(row['topics'])\n",
    "        id_repo = get_id_repo(row['full_name'])\n",
    "\n",
    "        if id_topic is not None and id_repo is not None:\n",
    "            query = \"INSERT INTO Repo_Topic (idTopic, idRepo) VALUES (?, ?)\"\n",
    "            values = (id_topic, id_repo)\n",
    "\n",
    "            cursor.execute(query, values)\n",
    "\n",
    "    connection.commit()\n",
    "except Exception as e:\n",
    "    connection.rollback()\n",
    "    print(\"Error occurred during data insertion:\", str(e))\n",
    "\n",
    "finally:\n",
    "    cursor.close()\n",
    "    connection.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Contributor table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "# The original cell read from `new_df_con`, which is never defined in this notebook;\n",
    "# df_con is the contributor frame loaded in the first cell.\n",
    "# NOTE(review): assumes updated_data.csv has the 'login' column - confirm.\n",
    "df_sql_cont_df_unique = df_con.drop_duplicates(subset=['login'])\n",
    "\n",
    "try:\n",
    "    for _, row in df_sql_cont_df_unique.iterrows():\n",
    "        cursor.execute(\"INSERT INTO Contributor (contrubtor) VALUES (?)\", (row['login'],))\n",
    "    connection.commit()\n",
    "finally:\n",
    "    cursor.close()\n",
    "    connection.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inserting data into the Contribution table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "connection = pyodbc.connect(connection_string)\n",
    "cursor = connection.cursor()\n",
    "\n",
    "def get_id_Contributor(Contributor):\n",
    "    \"\"\"Return Contributor.idContrubutor for this login, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idContrubutor FROM Contributor WHERE contrubtor = ?\", Contributor)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "def get_id(id_fl):\n",
    "    \"\"\"Return Repository.idRepo for this repository full name, or None when no row matches.\"\"\"\n",
    "    cursor.execute(\"SELECT idRepo FROM Repository WHERE fullName = ?\", id_fl)\n",
    "    result = cursor.fetchone()\n",
    "    return result[0] if result else None\n",
    "\n",
    "try:\n",
    "    connection.autocommit = False\n",
    "\n",
    "    # One row per (contributor, repository) pair. Iterating the frame de-duplicated by\n",
    "    # login alone would keep only a single repository per contributor.\n",
    "    # NOTE(review): assumes df_con has 'login', 'full_name' and 'contributions' columns - confirm.\n",
    "    df_contributions = df_con.drop_duplicates(subset=['login', 'full_name'])\n",
    "\n",
    "    for _, row in df_contributions.iterrows():\n",
    "        # Get idContrubutor from the 'Contributor' table\n",
    "        id_contrubutor = get_id_Contributor(row['login'])\n",
    "        id_fl = get_id(row['full_name'])\n",
    "\n",
    "        # Both halves of the primary key must have been found in their parent tables\n",
    "        if id_contrubutor is not None and id_fl is not None:\n",
    "            query = \"INSERT INTO Contribution (idContrubutor, idRepo, number_Contributions) VALUES (?, ?, ?)\"\n",
    "            values = (\n",
    "                id_contrubutor,\n",
    "                id_fl,\n",
    "                row['contributions'],\n",
    "            )\n",
    "            cursor.execute(query, values)\n",
    "\n",
    "    connection.commit()\n",
    "except Exception as e:\n",
    "    connection.rollback()\n",
    "    print(\"Error occurred during data insertion:\", str(e))\n",
    "\n",
    "finally:\n",
    "    cursor.close()\n",
    "    connection.close()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------