├── .gitignore ├── Customer Contracts$.csv ├── Customer Demo.csv ├── Customer Engagements.csv ├── README.md ├── csv_import_functions.py └── main.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Customer Contracts$.csv: -------------------------------------------------------------------------------- 1 | customer_name,start_date,end_date,contract_amount_m,invoice_sent,paid 2 | Nike,01-02-2019,12-20-2020,2.98,Yes,Yes 3 | Reebox,06-20-2017,,3.9,No,No 4 | Adidas,12-07-2015,6-20-2018,4.82,Yes,Yes 5 | Google,05-25-2014,03-20-2017,5.74,Yes,No 6 | Amazon,11-10-2012,12-20-2015,6.66,No,Yes 7 | Facebook,04-29-2011,,7.58,Yes,No 8 | Apple,10-15-2009,,8.5,Yes,Yes 9 | Airbnb,04-02-2008,,9.42,No,No 10 | Nest,09-19-2006,,3.0,Yes,Yes 11 | Canon,03-07-2005,09-20-2009,11.26,Yes,No 12 | -------------------------------------------------------------------------------- /Customer Demo.csv: -------------------------------------------------------------------------------- 1 | customer_id,customer_name,employee_count,office_location 2 | 101,Nike,120000,Oregon 3 | 102,Reebox,5000,California 4 | 103,Adidas,8000,CA 5 | 104,Google,500000,CA 6 | 105,Amazon,200000,Washington 7 | 106,Facebook,40002,CA 8 | 107,Apple,8000,CA 9 | 108,Airbnb,500000,CA 10 | 109,Nest,200000,CA 11 | 110,Canon,40002,NY 12 | -------------------------------------------------------------------------------- /Customer Engagements.csv: -------------------------------------------------------------------------------- 1 | customer_id,num_of_users,_of_all_employees,sso,launched 2 | 101,10000,65%,Y,Yes 3 | 102,23423,80%,N,No 4 | 103,34556,30%,Y,Yes 5 | 104,123123,33%,N,No 6 | 105,19832,36%,Y,Yes 7 | 106,1243,75%,N,No 8 | 107,1231,42%,Y,Yes 9 | 108,12200,90%,N,No 10 | 109,200,48%,Y,Yes 11 | 110,1100,51%,N,No 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSV File to Database Import Automation 2 | A python script that automates CSV file imports to a postgres database 3 | 4 | By: Nate from StrataScratch (nate@stratascratch.com) 5 | 6 | Dec 3, 2020 7 | ___ 8 | 9 | ## Video Links 10 | Here are the video tutorials showing you how to build the entire python script 11 | - [Part 1: Building the Functionality](https://youtu.be/wqBFgaMgFQA) 12 | - [Part 2: Automating the Script](https://youtu.be/TDwy1lSjEZo) 13 | - [Part 3: Reusing & Scaling the Functions By Applying Software Development Fundamentals](https://youtu.be/TDwy1lSjEZo) 14 | 15 | Follow my [Youtube channel here](https://www.youtube.com/channel/UCW8Ews7tdKKkBT6GdtQaXvQ) for more data science resources. We cover a lot of coding techniques in both python and SQL. 16 | 17 | ## Description 18 | 19 | This script will automatically import CSV files to your postgres database. Just place the CSV files in the same directory as the notebook and run the notebook. The notebook will automatically clean the file name and column headers, create the db table, and copy the file over to the database. The table names are the same names as the file names. However, all upper case characters are changed to lower case, spaces are converted to underscores, and all symbols are removed. 20 | 21 | Importing CSV files to a database is a common task needed for data science and analytics and it can be done completely with python using pandas dataframes, numpy, os library, and the posgres database wrapper psycopg2. 22 | 23 | ## Other Data Science Resources 24 | If you want more coding practice, whether to improve in analytics or to prepare for coding interviews, check out [StrataScratch](https://platform.stratascratch.com/), It's a coding platform with over 500+ coding questions from real data science companies. All questions are free and you can execute SQL and python code in the IDE. 25 | -------------------------------------------------------------------------------- /csv_import_functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import psycopg2 5 | 6 | def csv_files(): 7 | 8 | #get names of only csv files 9 | csv_files = [] 10 | for file in os.listdir(os.getcwd()): 11 | if file.endswith(".csv"): 12 | csv_files.append(file) 13 | 14 | return csv_files 15 | 16 | 17 | def configure_dataset_directory(csv_files, dataset_dir): 18 | 19 | #make dataset folder to process csv files 20 | try: 21 | mkdir = 'mkdir {0}'.format(dataset_dir) 22 | os.system(mkdir) 23 | except: 24 | pass 25 | 26 | #move csv files to dataset folder 27 | for csv in csv_files: 28 | mv_file = "mv '{0}' {1}".format(csv, dataset_dir) 29 | os.system(mv_file) 30 | 31 | return 32 | 33 | 34 | def create_df(dataset_dir, csv_files): 35 | 36 | data_path = os.getcwd()+'/'+dataset_dir+'/' 37 | 38 | #loop through the files and create the dataframe 39 | df = {} 40 | for file in csv_files: 41 | try: 42 | df[file] = pd.read_csv(data_path+file) 43 | except UnicodeDecodeError: 44 | df[file] = pd.read_csv(data_path+file, encoding="ISO-8859-1") #if utf-8 encoding error 45 | print(file) 46 | 47 | return df 48 | 49 | 50 | def clean_tbl_name(filename): 51 | 52 | #rename csv, force lower case, no spaces, no dashes 53 | clean_tbl_name = filename.lower().replace(" ", "").replace("-","_").replace(r"/","_").replace("\\","_").replace("$","").replace("%","") 54 | 55 | tbl_name = '{0}'.format(clean_tbl_name.split('.')[0]) 56 | 57 | return tbl_name 58 | 59 | 60 | 61 | def clean_colname(dataframe): 62 | 63 | #force column names to be lower case, no spaces, no dashes 64 | dataframe.columns = [x.lower().replace(" ", "_").replace("-","_").replace(r"/","_").replace("\\","_").replace(".","_").replace("$","").replace("%","") for x in dataframe.columns] 65 | 66 | #processing data 67 | replacements = { 68 | 'timedelta64[ns]': 'varchar', 69 | 'object': 'varchar', 70 | 'float64': 'float', 71 | 'int64': 'int', 72 | 'datetime64': 'timestamp' 73 | } 74 | 75 | col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(dataframe.columns, dataframe.dtypes.replace(replacements))) 76 | 77 | return col_str, dataframe.columns 78 | 79 | 80 | 81 | def upload_to_db(host, dbname, user, password, tbl_name, col_str, file, dataframe, dataframe_columns): 82 | 83 | conn_string = "host=%s dbname=%s user=%s password=%s" % (host, dbname, user, password) 84 | conn = psycopg2.connect(conn_string) 85 | cursor = conn.cursor() 86 | print('opened database successfully') 87 | 88 | #drop table with same name 89 | cursor.execute("drop table if exists %s;" % (tbl_name)) 90 | 91 | #create table 92 | cursor.execute("create table %s (%s);" % (tbl_name, col_str)) 93 | print('{0} was created successfully'.format(tbl_name)) 94 | 95 | #insert values to table 96 | 97 | #save df to csv 98 | dataframe.to_csv(file, header=dataframe_columns, index=False, encoding='utf-8') 99 | 100 | #open the csv file, save it as an object 101 | my_file = open(file) 102 | print('file opened in memory') 103 | 104 | #upload to db 105 | SQL_STATEMENT = """ 106 | COPY %s FROM STDIN WITH 107 | CSV 108 | HEADER 109 | DELIMITER AS ',' 110 | """ 111 | 112 | cursor.copy_expert(sql=SQL_STATEMENT % tbl_name, file=my_file) 113 | print('file copied to db') 114 | 115 | cursor.execute("grant select on table %s to public" % tbl_name) 116 | conn.commit() 117 | cursor.close() 118 | print('table {0} imported to db completed'.format(tbl_name)) 119 | 120 | return 121 | 122 | -------------------------------------------------------------------------------- /main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import CSV files to a postgres database" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#import libraries\n", 17 | "#conda install psycopg2\n", 18 | "#pip install psycopg2\n", 19 | "\n", 20 | "import os\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import psycopg2" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "#main \n", 33 | "\n", 34 | "from csv_import_functions import *\n", 35 | "\n", 36 | "#settings\n", 37 | "dataset_dir = 'datasets'\n", 38 | "\n", 39 | "#db settings\n", 40 | "host = 'add db ip'\n", 41 | "dbname = 'add db name'\n", 42 | "user = 'add db username'\n", 43 | "password = 'add db pwd'\n", 44 | "\n", 45 | "#configure environment and create main df\n", 46 | "csv_files = csv_files()\n", 47 | "configure_dataset_directory(csv_files, dataset_dir)\n", 48 | "df = create_df(dataset_dir, csv_files)\n", 49 | "\n", 50 | "for k in csv_files:\n", 51 | "\n", 52 | " #call dataframe\n", 53 | " dataframe = df[k]\n", 54 | "\n", 55 | " #clean table name\n", 56 | " tbl_name = clean_tbl_name(k)\n", 57 | " \n", 58 | " #clean column names\n", 59 | " col_str, dataframe.columns = clean_colname(dataframe)\n", 60 | " \n", 61 | " #upload data to db \n", 62 | " upload_to_db(host, \n", 63 | " dbname, \n", 64 | " user, \n", 65 | " password, \n", 66 | " tbl_name, \n", 67 | " col_str, \n", 68 | " file=k, \n", 69 | " dataframe=dataframe, \n", 70 | " dataframe_columns=dataframe.columns)" 71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.8.5" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 4 95 | } 96 | --------------------------------------------------------------------------------