├── .gitignore
├── Customer Contracts$.csv
├── Customer Demo.csv
├── Customer Engagements.csv
├── README.md
├── csv_import_functions.py
└── main.ipynb


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/Customer Contracts$.csv:
--------------------------------------------------------------------------------
 1 | customer_name,start_date,end_date,contract_amount_m,invoice_sent,paid
 2 | Nike,01-02-2019,12-20-2020,2.98,Yes,Yes
 3 | Reebox,06-20-2017,,3.9,No,No
 4 | Adidas,12-07-2015,6-20-2018,4.82,Yes,Yes
 5 | Google,05-25-2014,03-20-2017,5.74,Yes,No
 6 | Amazon,11-10-2012,12-20-2015,6.66,No,Yes
 7 | Facebook,04-29-2011,,7.58,Yes,No
 8 | Apple,10-15-2009,,8.5,Yes,Yes
 9 | Airbnb,04-02-2008,,9.42,No,No
10 | Nest,09-19-2006,,3.0,Yes,Yes
11 | Canon,03-07-2005,09-20-2009,11.26,Yes,No
12 | 


--------------------------------------------------------------------------------
/Customer Demo.csv:
--------------------------------------------------------------------------------
 1 | customer_id,customer_name,employee_count,office_location
 2 | 101,Nike,120000,Oregon
 3 | 102,Reebox,5000,California
 4 | 103,Adidas,8000,CA
 5 | 104,Google,500000,CA
 6 | 105,Amazon,200000,Washington
 7 | 106,Facebook,40002,CA
 8 | 107,Apple,8000,CA
 9 | 108,Airbnb,500000,CA
10 | 109,Nest,200000,CA
11 | 110,Canon,40002,NY
12 | 


--------------------------------------------------------------------------------
/Customer Engagements.csv:
--------------------------------------------------------------------------------
 1 | customer_id,num_of_users,_of_all_employees,sso,launched
 2 | 101,10000,65%,Y,Yes
 3 | 102,23423,80%,N,No
 4 | 103,34556,30%,Y,Yes
 5 | 104,123123,33%,N,No
 6 | 105,19832,36%,Y,Yes
 7 | 106,1243,75%,N,No
 8 | 107,1231,42%,Y,Yes
 9 | 108,12200,90%,N,No
10 | 109,200,48%,Y,Yes
11 | 110,1100,51%,N,No
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CSV File to Database Import Automation
 2 | A python script that automates CSV file imports to a postgres database
 3 | 
 4 | By: Nate from StrataScratch (nate@stratascratch.com)
 5 | 
 6 | Dec 3, 2020
 7 | ___
 8 | 
 9 | ## Video Links
10 | Here are the video tutorials showing you how to build the entire python script
11 | - [Part 1: Building the Functionality](https://youtu.be/wqBFgaMgFQA)
12 | - [Part 2: Automating the Script](https://youtu.be/TDwy1lSjEZo)
13 | - [Part 3: Reusing & Scaling the Functions By Applying Software Development Fundamentals](https://youtu.be/TDwy1lSjEZo)
14 | 
15 | Follow my [Youtube channel here](https://www.youtube.com/channel/UCW8Ews7tdKKkBT6GdtQaXvQ) for more data science resources. We cover a lot of coding techniques in both python and SQL.
16 | 
17 | ## Description
18 | 
19 | This script will automatically import CSV files to your postgres database. Just place the CSV files in the same directory as the notebook and run the notebook. The notebook will automatically clean the file name and column headers, create the db table, and copy the file over to the database. The table names are the same names as the file names. However, all upper case characters are changed to lower case, spaces are converted to underscores, and all symbols are removed. 
20 | 
21 | Importing CSV files to a database is a common task needed for data science and analytics and it can be done completely with python using pandas dataframes, numpy, os library, and the posgres database wrapper psycopg2.
22 | 
23 | ## Other Data Science Resources
24 | If you want more coding practice, whether to improve in analytics or to prepare for coding interviews, check out [StrataScratch](https://platform.stratascratch.com/), It's a coding platform with over 500+ coding questions from real data science companies. All questions are free and you can execute SQL and python code in the IDE.
25 | 


--------------------------------------------------------------------------------
/csv_import_functions.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import pandas as pd
  4 | import psycopg2
  5 | 
  6 | def csv_files():
  7 | 
  8 |     #get names of only csv files
  9 |     csv_files = []
 10 |     for file in os.listdir(os.getcwd()):
 11 |         if file.endswith(".csv"):
 12 |             csv_files.append(file)
 13 | 
 14 |     return csv_files
 15 | 
 16 | 
 17 | def configure_dataset_directory(csv_files, dataset_dir):
 18 |   
 19 |     #make dataset folder to process csv files
 20 |     try: 
 21 |       mkdir = 'mkdir {0}'.format(dataset_dir)
 22 |       os.system(mkdir)
 23 |     except:
 24 |       pass
 25 | 
 26 |     #move csv files to dataset folder
 27 |     for csv in csv_files:
 28 |       mv_file = "mv '{0}' {1}".format(csv, dataset_dir)
 29 |       os.system(mv_file)
 30 | 
 31 |     return
 32 | 
 33 | 
 34 | def create_df(dataset_dir, csv_files):
 35 |   
 36 |     data_path = os.getcwd()+'/'+dataset_dir+'/'
 37 | 
 38 |     #loop through the files and create the dataframe
 39 |     df = {}
 40 |     for file in csv_files:
 41 |         try:
 42 |             df[file] = pd.read_csv(data_path+file)
 43 |         except UnicodeDecodeError:
 44 |             df[file] = pd.read_csv(data_path+file, encoding="ISO-8859-1") #if utf-8 encoding error
 45 |         print(file)
 46 |     
 47 |     return df
 48 | 
 49 | 
 50 | def clean_tbl_name(filename):
 51 |   
 52 |     #rename csv, force lower case, no spaces, no dashes
 53 |     clean_tbl_name = filename.lower().replace(" ", "").replace("-","_").replace(r"/","_").replace("\\","_").replace("$","").replace("%","")
 54 |     
 55 |     tbl_name = '{0}'.format(clean_tbl_name.split('.')[0])
 56 | 
 57 |     return tbl_name
 58 | 
 59 | 
 60 | 
 61 | def clean_colname(dataframe):
 62 |   
 63 |     #force column names to be lower case, no spaces, no dashes
 64 |     dataframe.columns = [x.lower().replace(" ", "_").replace("-","_").replace(r"/","_").replace("\\","_").replace(".","_").replace("$","").replace("%","") for x in dataframe.columns]
 65 |     
 66 |     #processing data
 67 |     replacements = {
 68 |         'timedelta64[ns]': 'varchar',
 69 |         'object': 'varchar',
 70 |         'float64': 'float',
 71 |         'int64': 'int',
 72 |         'datetime64': 'timestamp'
 73 |     }
 74 | 
 75 |     col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(dataframe.columns, dataframe.dtypes.replace(replacements)))
 76 |     
 77 |     return col_str, dataframe.columns
 78 | 
 79 | 
 80 | 
 81 | def upload_to_db(host, dbname, user, password, tbl_name, col_str, file, dataframe, dataframe_columns):
 82 | 
 83 |     conn_string = "host=%s dbname=%s user=%s password=%s" % (host, dbname, user, password)
 84 |     conn = psycopg2.connect(conn_string)
 85 |     cursor = conn.cursor()
 86 |     print('opened database successfully')
 87 |     
 88 |     #drop table with same name
 89 |     cursor.execute("drop table if exists %s;" % (tbl_name))
 90 | 
 91 |     #create table
 92 |     cursor.execute("create table %s (%s);" % (tbl_name, col_str))
 93 |     print('{0} was created successfully'.format(tbl_name)) 
 94 |     
 95 |     #insert values to table
 96 | 
 97 |     #save df to csv
 98 |     dataframe.to_csv(file, header=dataframe_columns, index=False, encoding='utf-8')
 99 | 
100 |     #open the csv file, save it as an object
101 |     my_file = open(file)
102 |     print('file opened in memory')
103 |     
104 |     #upload to db
105 |     SQL_STATEMENT = """
106 |     COPY %s FROM STDIN WITH
107 |         CSV
108 |         HEADER
109 |         DELIMITER AS ','
110 |     """
111 | 
112 |     cursor.copy_expert(sql=SQL_STATEMENT % tbl_name, file=my_file)
113 |     print('file copied to db')
114 |     
115 |     cursor.execute("grant select on table %s to public" % tbl_name)
116 |     conn.commit()
117 |     cursor.close()
118 |     print('table {0} imported to db completed'.format(tbl_name))
119 | 
120 |     return
121 | 
122 | 


--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Import CSV files to a postgres database"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": null,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": [
16 |     "#import libraries\n",
17 |     "#conda install psycopg2\n",
18 |     "#pip install psycopg2\n",
19 |     "\n",
20 |     "import os\n",
21 |     "import numpy as np\n",
22 |     "import pandas as pd\n",
23 |     "import psycopg2"
24 |    ]
25 |   },
26 |   {
27 |    "cell_type": "code",
28 |    "execution_count": null,
29 |    "metadata": {},
30 |    "outputs": [],
31 |    "source": [
32 |     "#main \n",
33 |     "\n",
34 |     "from csv_import_functions import *\n",
35 |     "\n",
36 |     "#settings\n",
37 |     "dataset_dir = 'datasets'\n",
38 |     "\n",
39 |     "#db settings\n",
40 |     "host = 'add db ip'\n",
41 |     "dbname = 'add db name'\n",
42 |     "user = 'add db username'\n",
43 |     "password = 'add db pwd'\n",
44 |     "\n",
45 |     "#configure environment and create main df\n",
46 |     "csv_files = csv_files()\n",
47 |     "configure_dataset_directory(csv_files, dataset_dir)\n",
48 |     "df = create_df(dataset_dir, csv_files)\n",
49 |     "\n",
50 |     "for k in csv_files:\n",
51 |     "\n",
52 |     "    #call dataframe\n",
53 |     "    dataframe = df[k]\n",
54 |     "\n",
55 |     "    #clean table name\n",
56 |     "    tbl_name = clean_tbl_name(k)\n",
57 |     "    \n",
58 |     "    #clean column names\n",
59 |     "    col_str, dataframe.columns = clean_colname(dataframe)\n",
60 |     "    \n",
61 |     "    #upload data to db   \n",
62 |     "    upload_to_db(host, \n",
63 |     "                 dbname, \n",
64 |     "                 user, \n",
65 |     "                 password, \n",
66 |     "                 tbl_name, \n",
67 |     "                 col_str, \n",
68 |     "                 file=k, \n",
69 |     "                 dataframe=dataframe, \n",
70 |     "                 dataframe_columns=dataframe.columns)"
71 |    ]
72 |   }
73 |  ],
74 |  "metadata": {
75 |   "kernelspec": {
76 |    "display_name": "Python 3",
77 |    "language": "python",
78 |    "name": "python3"
79 |   },
80 |   "language_info": {
81 |    "codemirror_mode": {
82 |     "name": "ipython",
83 |     "version": 3
84 |    },
85 |    "file_extension": ".py",
86 |    "mimetype": "text/x-python",
87 |    "name": "python",
88 |    "nbconvert_exporter": "python",
89 |    "pygments_lexer": "ipython3",
90 |    "version": "3.8.5"
91 |   }
92 |  },
93 |  "nbformat": 4,
94 |  "nbformat_minor": 4
95 | }
96 | 


--------------------------------------------------------------------------------