├── applications
├── .Rapp.history
├── item_numbers_to_find.csv
├── historical_files
│ ├── suppliers.xls
│ ├── suppliers.xlsx
│ └── suppliers_2012.csv
├── output_files
│ ├── 3output.csv
│ ├── 3output-clinton.csv
│ ├── 2output.csv
│ ├── 2output-clinton.csv
│ └── 1output.csv
├── 3parse_text_file.py
├── customer_category_history.csv
├── 3parse_text_file_skip_first_space.py
├── mysql_server_error_log.txt
├── 2calculate_statistic_by_category.py
└── 1search_for_items_write_found.py
├── database
├── data_for_updating.csv
├── Suppliers.db
├── data_for_updating_mysql.csv
├── output_files
│ ├── 5output.csv
│ └── 5output-clinton.csv
├── supplier_data.csv
├── supplier_data_for_mysql_database.csv
├── 5db_mysql_write_to_file.py
├── 6db_mysql_update_from_csv.py
├── 1db_count_rows.py
├── 2db_insert_rows.py
├── 4db_mysql_load_from_csv.py
└── 3db_update_rows.py
├── excel
├── sales_2013.xlsx
├── sales_2014.xlsx
├── sales_2015.xlsx
├── output_files
│ ├── 2output.xls
│ ├── 3output.xls
│ ├── 4output.xls
│ ├── 5output.xls
│ ├── 6output.xls
│ ├── 7output.xls
│ ├── 8output.xls
│ ├── 9output.xls
│ ├── 10output.xls
│ ├── 11output.xls
│ ├── 13output.xls
│ ├── 14output.xls
│ └── pandas_output.xls
├── pandas_parsing_and_write_keep_dates.py
├── 1excel_introspect_workbook.py
├── pandas_column_by_index.py
├── pandas_column_by_name.py
├── pandas_value_matches_pattern.py
├── pandas_value_meets_condition.py
├── pandas_value_in_set.py
├── pandas_column_by_name_all_worksheets.py
├── 2excel_parsing_and_write.py
├── pandas_value_meets_condition_all_worksheets.py
├── 12excel_introspect_all_workbooks.py
├── pandas_value_meets_condition_set_of_worksheets.py
├── pandas_concat_data_from_multiple_workbooks.py
├── 3excel_parsing_and_write_keep_dates.py
├── 7excel_column_by_index.py
├── 4excel_value_meets_condition.py
├── 8excel_column_by_name.py
├── 6excel_value_matches_pattern.py
├── 13excel_concat_data_from_multiple_workbooks.py
├── 10excel_column_by_name_all_worksheets.py
├── 5excel_value_in_set.py
├── 9excel_value_meets_condition_all_worksheets.py
├── 11excel_value_meets_condition_set_of_worksheets.py
├── 14excel_sum_average_multiple_workbooks.py
└── pandas_sum_average_multiple_workbooks.py
├── letters.txt
├── numbers.txt
├── csv
├── output_files
│ ├── 5output.csv
│ ├── 7output.csv
│ ├── 6output.csv
│ ├── 3output.csv
│ ├── 4output.csv
│ ├── 1output.csv
│ ├── 2output.csv
│ ├── 11output.csv
│ ├── pandas_output.csv
│ ├── 12output.csv
│ └── 9output.csv
├── pandas_parsing_and_write.py
├── pandas_column_by_index.py
├── pandas_column_by_name.py
├── pandas_add_header_row.py
├── sales_march_2014.csv
├── pandas_value_matches_pattern.py
├── sales_february_2014.csv
├── sales_january_2014.csv
├── pandas_value_in_set.py
├── pandas_select_contiguous_rows.py
├── 2csv_reader_parsing_and_write.py
├── pandas_value_meets_condition.py
├── 11csv_reader_select_contiguous_rows.py
├── pandas_concat_rows_from_multiple_files.py
├── 12csv_reader_add_header_row.py
├── 6csv_reader_column_by_index.py
├── 4csv_reader_value_in_set.py
├── supplier_data.csv
├── supplier_data_no_header_row.csv
├── 1csv_simple_parsing_and_write.py
├── 3csv_reader_value_meets_condition.py
├── 8csv_reader_counts_for_multiple_files.py
├── 5csv_reader_value_matches_pattern.py
├── 9csv_reader_concat_rows_from_multiple_files.py
├── 7csv_reader_column_by_name.py
├── supplier_data_unnecessary_header_footer.csv
├── pandas_sum_average_from_multiple_files.py
└── 10csv_reader_sum_average_from_multiple_files.py
├── plots
├── matplotlib_basic_bar.py
├── matplotlib_basic_histogram.py
├── 
matplotlib_basic_scatter.py ├── matplotlib_basic_line.py ├── matplotlib_basic_boxplot.py ├── ggplot_plots.py ├── pandas_plots.py └── seaborn_plots.py ├── README.md ├── statistics ├── wine_quality.py └── customer_churn.py └── first_script.py /applications/.Rapp.history: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /applications/item_numbers_to_find.csv: -------------------------------------------------------------------------------- 1 | 1234 2 | 2345 3 | 4567 4 | 6789 5 | 7890 6 | -------------------------------------------------------------------------------- /database/data_for_updating.csv: -------------------------------------------------------------------------------- 1 | amount,date,customer 2 | 4.25,5/11/2014,Richard Lucas 3 | 6.75,5/12/2014,Jenny Kim 4 | -------------------------------------------------------------------------------- /database/Suppliers.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/database/Suppliers.db -------------------------------------------------------------------------------- /excel/sales_2013.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/sales_2013.xlsx -------------------------------------------------------------------------------- /excel/sales_2014.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/sales_2014.xlsx -------------------------------------------------------------------------------- /excel/sales_2015.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/sales_2015.xlsx -------------------------------------------------------------------------------- /letters.txt: -------------------------------------------------------------------------------- 1 | a b 2 | c d 3 | e f 4 | g h 5 | i j 6 | k l 7 | m n 8 | o p 9 | q r 10 | s t 11 | u v 12 | w x 13 | y z -------------------------------------------------------------------------------- /database/data_for_updating_mysql.csv: -------------------------------------------------------------------------------- 1 | Cost,Purchase Date,Supplier Name 2 | 600.00,2014-01-22,Supplier X 3 | 200.00,2014-02-01,Supplier Y 4 | -------------------------------------------------------------------------------- /excel/output_files/2output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/2output.xls -------------------------------------------------------------------------------- /excel/output_files/3output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/3output.xls -------------------------------------------------------------------------------- /excel/output_files/4output.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/4output.xls -------------------------------------------------------------------------------- /excel/output_files/5output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/5output.xls -------------------------------------------------------------------------------- /excel/output_files/6output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/6output.xls -------------------------------------------------------------------------------- /excel/output_files/7output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/7output.xls -------------------------------------------------------------------------------- /excel/output_files/8output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/8output.xls -------------------------------------------------------------------------------- /excel/output_files/9output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/9output.xls -------------------------------------------------------------------------------- /excel/output_files/10output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/10output.xls -------------------------------------------------------------------------------- /excel/output_files/11output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/11output.xls -------------------------------------------------------------------------------- /excel/output_files/13output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/13output.xls -------------------------------------------------------------------------------- /excel/output_files/14output.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/14output.xls -------------------------------------------------------------------------------- /numbers.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 3 4 3 | 5 6 4 | 7 8 5 | 9 10 6 | 11 12 7 | 13 14 8 | 15 16 9 | 17 18 10 | 19 20 11 | 21 22 12 | 23 24 13 | 25 26 -------------------------------------------------------------------------------- /excel/output_files/pandas_output.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/excel/output_files/pandas_output.xls -------------------------------------------------------------------------------- /applications/historical_files/suppliers.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/applications/historical_files/suppliers.xls -------------------------------------------------------------------------------- /applications/historical_files/suppliers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bluesfer2007/foundations-for-analytics-with-python/master/applications/historical_files/suppliers.xlsx -------------------------------------------------------------------------------- /database/output_files/5output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,5467,750.0,2014-01-20 3 | Supplier X,001-1001,5467,750.0,2014-01-20 4 | -------------------------------------------------------------------------------- /database/output_files/5output-clinton.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,5467,750.0,2014-01-20 3 | Supplier X,001-1001,5467,750.0,2014-01-20 4 | -------------------------------------------------------------------------------- /csv/output_files/5output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | -------------------------------------------------------------------------------- /csv/pandas_parsing_and_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import pandas as pd 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | print(data_frame) 10 | data_frame.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /csv/output_files/7output.csv: -------------------------------------------------------------------------------- 1 | Invoice Number,Purchase Date 2 | 001-1001,1/20/14 3 | 001-1001,1/20/14 4 | 001-1001,1/20/14 5 | 001-1001,1/20/14 6 | 50-9501,1/30/14 7 | 50-9501,1/30/14 8 | 50-9505,2/3/14 9 | 50-9505,2/3/14 10 | 920-4803,2/3/14 11 | 920-4804,2/10/14 12 | 920-4805,2/17/14 13 | 920-4806,2/24/14 14 | -------------------------------------------------------------------------------- /csv/pandas_column_by_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | data_frame_column_by_index = data_frame.iloc[:, [0, 3]] 10 | 11 | data_frame_column_by_index.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /csv/output_files/6output.csv: 
-------------------------------------------------------------------------------- 1 | Supplier Name,Cost 2 | Supplier X,$500.00 3 | Supplier X,$500.00 4 | Supplier X,$750.00 5 | Supplier X,$750.00 6 | Supplier Y,$250.00 7 | Supplier Y,$250.00 8 | Supplier Y,$125.00 9 | Supplier Y,$125.00 10 | Supplier Z,$615.00 11 | Supplier Z,$615.00 12 | Supplier Z,$615.00 13 | Supplier Z,$615.00 14 | -------------------------------------------------------------------------------- /csv/output_files/3output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,5467,$750.00,1/20/14 3 | Supplier X,001-1001,5467,$750.00,1/20/14 4 | Supplier Z,920-4803,3321,$615.00,2/3/14 5 | Supplier Z,920-4804,3321,$615.00,2/10/14 6 | Supplier Z,920-4805,3321,$615.00,2/17/14 7 | Supplier Z,920-4806,3321,$615.00,2/24/14 8 | -------------------------------------------------------------------------------- /csv/output_files/4output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | Supplier Y,50-9501,7009,$250.00,1/30/14 7 | Supplier Y,50-9501,7009,$250.00,1/30/14 8 | -------------------------------------------------------------------------------- /csv/pandas_column_by_name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | data_frame_column_by_name = data_frame.loc[:, ['Invoice Number', 'Purchase Date']] 10 | 11 | data_frame_column_by_name.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /excel/pandas_parsing_and_write_keep_dates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, sheetname='january_2013') 9 | 10 | writer = pd.ExcelWriter(output_file) 11 | data_frame.to_excel(writer, sheet_name='jan_13_output', index=False) 12 | writer.save() -------------------------------------------------------------------------------- /applications/output_files/3output.csv: -------------------------------------------------------------------------------- 1 | Date,InnoDB: Compressed tables use zlib 1.2.3,InnoDB: Using atomics to ref count buffer pool pages,InnoDB: 5.6.16 started; log sequence number 1234567,/usr/local/mysql/bin/mysqld: Shutdown complete,InnoDB: Completed initialization of buffer pool,InnoDB: IPv6 is available. 
2 | 2014-10-27,0,0,1,1,2,2 3 | 2014-03-07,3,1,1,1,0,0 4 | 2014-02-03,2,2,1,1,0,0 5 | -------------------------------------------------------------------------------- /csv/pandas_add_header_row.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | header_list = ['Supplier Name', 'Invoice Number', \ 9 | 'Part Number', 'Cost', 'Purchase Date'] 10 | data_frame = pd.read_csv(input_file, header=None, names=header_list) 11 | 12 | data_frame.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /applications/output_files/3output-clinton.csv: -------------------------------------------------------------------------------- 1 | Date,InnoDB: Compressed tables use zlib 1.2.3,InnoDB: Using atomics to ref count buffer pool pages,InnoDB: 5.6.16 started; log sequence number 1234567,/usr/local/mysql/bin/mysqld: Shutdown complete,InnoDB: Completed initialization of buffer pool,InnoDB: IPv6 is available. 2 | 2014-10-27,0,0,1,1,2,2 3 | 2014-03-07,3,1,1,1,0,0 4 | 2014-02-03,2,2,1,1,0,0 5 | -------------------------------------------------------------------------------- /csv/sales_march_2014.csv: -------------------------------------------------------------------------------- 1 | Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date 2 | 1234,John Smith,100-0014,"$1,350.00",3/4/14 3 | 8765,Tony Song,100-0015,"$1,167.00",3/8/14 4 | 2345,Mary Harrison,100-0016,"$1,789.00",3/17/14 5 | 6543,Rachel Paz,100-0017,"$2,042.00",3/22/14 6 | 3456,Lucy Gomez,100-0018,"$1,511.00",3/28/14 7 | 4321,Susan Wallace,100-0019,"$2,280.00",3/30/14 8 | -------------------------------------------------------------------------------- /csv/pandas_value_matches_pattern.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | data_frame_value_matches_pattern = data_frame.ix[data_frame['Invoice Number']\ 10 | .str.startswith("001-"), :] 11 | 12 | data_frame_value_matches_pattern.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /csv/sales_february_2014.csv: -------------------------------------------------------------------------------- 1 | Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date 2 | 9876,Daniel Farber,100-0008,"$1,115.00",2/2/14 3 | 8765,Laney Stone,100-0009,"$1,367.00",2/8/14 4 | 7654,Roger Lipney,100-0010,"$2,135.00",2/15/14 5 | 6543,Thomas Haines,100-0011,"$1,346.00",2/17/14 6 | 5432,Anushka Vaz,100-0012,"$1,560.00",2/21/14 7 | 4321,Harriet Cooper,100-0013,"$1,852.00",2/25/14 8 | -------------------------------------------------------------------------------- /csv/sales_january_2014.csv: -------------------------------------------------------------------------------- 1 | Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date 2 | 1234,John Smith,100-0002,"$1,200.00",1/1/14 3 | 2345,Mary Harrison,100-0003,"$1,425.00",1/6/14 4 | 3456,Lucy Gomez,100-0004,"$1,390.00",1/11/14 5 | 4567,Rupert Jones,100-0005,"$1,257.00",1/18/14 6 | 5678,Jenny Walters,100-0006,"$1,725.00",1/24/14 7 | 6789,Samantha Donaldson,100-0007,"$1,995.00",1/31/14 8 | 
-------------------------------------------------------------------------------- /excel/1excel_introspect_workbook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from xlrd import open_workbook 4 | 5 | input_file = sys.argv[1] 6 | 7 | workbook = open_workbook(input_file) 8 | print('Number of worksheets:', workbook.nsheets) 9 | for worksheet in workbook.sheets(): 10 | print("Worksheet name:", worksheet.name, "\tRows:", \ 11 | worksheet.nrows, "\tColumns:", worksheet.ncols) 12 | -------------------------------------------------------------------------------- /csv/pandas_value_in_set.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | 10 | important_dates = ['1/20/14', '1/30/14'] 11 | data_frame_value_in_set = data_frame.loc[data_frame['Purchase Date']\ 12 | .isin(important_dates), :] 13 | 14 | data_frame_value_in_set.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /csv/pandas_select_contiguous_rows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file, header=None) 9 | 10 | data_frame = data_frame.drop([0,1,2,16,17,18]) 11 | data_frame.columns = data_frame.iloc[0] 12 | data_frame = data_frame.reindex(data_frame.index.drop(3)) 13 | 14 | data_frame.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /excel/pandas_column_by_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, 'january_2013', index_col=None) 9 | 10 | data_frame_column_by_index = data_frame.iloc[:, [1, 4]] 11 | 12 | writer = pd.ExcelWriter(output_file) 13 | data_frame_column_by_index.to_excel(writer, sheet_name='jan_13_output', index=False) 14 | writer.save() -------------------------------------------------------------------------------- /csv/2csv_reader_parsing_and_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | with open(input_file, 'r', newline='') as csv_in_file: 9 | with open(output_file, 'w', newline='') as csv_out_file: 10 | filereader = csv.reader(csv_in_file, delimiter=',') 11 | filewriter = csv.writer(csv_out_file, delimiter=',') 12 | for row_list in filereader: 13 | filewriter.writerow(row_list) -------------------------------------------------------------------------------- /excel/pandas_column_by_name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, 'january_2013', index_col=None) 9 | 10 | data_frame_column_by_name = data_frame.loc[:, ['Customer ID', 'Purchase Date']] 11 | 12 | writer = pd.ExcelWriter(output_file) 13 
| data_frame_column_by_name.to_excel(writer, sheet_name='jan_13_output', index=False) 14 | writer.save() -------------------------------------------------------------------------------- /applications/output_files/2output.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Category,Total Time (in Days) 2 | Wayne Thompson,Silver,157 3 | Wayne Thompson,Bronze,167 4 | Bruce Johnson,Gold,160 5 | Bruce Johnson,Silver,60 6 | Bruce Johnson,Bronze,77 7 | Annie Lee,Gold,192 8 | Annie Lee,Silver,44 9 | Annie Lee,Bronze,26 10 | Priya Patel,Silver,99 11 | Priya Patel,Gold,54 12 | Mary Yu,Silver,72 13 | Mary Yu,Gold,231 14 | John Smith,Gold,206 15 | John Smith,Silver,39 16 | John Smith,Bronze,70 17 | -------------------------------------------------------------------------------- /applications/output_files/2output-clinton.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Category,Total Time (in Days) 2 | Wayne Thompson,Silver,198 3 | Wayne Thompson,Bronze,167 4 | Bruce Johnson,Gold,201 5 | Bruce Johnson,Silver,60 6 | Bruce Johnson,Bronze,77 7 | Annie Lee,Gold,233 8 | Annie Lee,Silver,44 9 | Annie Lee,Bronze,26 10 | Priya Patel,Silver,99 11 | Priya Patel,Gold,54 12 | Mary Yu,Silver,72 13 | Mary Yu,Gold,272 14 | John Smith,Gold,247 15 | John Smith,Silver,39 16 | John Smith,Bronze,70 17 | -------------------------------------------------------------------------------- /csv/pandas_value_meets_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_csv(input_file) 9 | 10 | data_frame['Cost'] = data_frame['Cost'].str.strip('$').astype(float) 11 | data_frame_value_meets_condition = data_frame.loc[(data_frame['Supplier Name']\ 12 | .str.contains('Z')) | (data_frame['Cost'] > 600.0), :] 13 | 14 | data_frame_value_meets_condition.to_csv(output_file, index=False) -------------------------------------------------------------------------------- /excel/pandas_value_matches_pattern.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, 'january_2013', index_col=None) 9 | 10 | data_frame_value_matches_pattern = data_frame[data_frame['Customer Name'].str.startswith("J")] 11 | 12 | writer = pd.ExcelWriter(output_file) 13 | data_frame_value_matches_pattern.to_excel(writer, sheet_name='jan_13_output', index=False) 14 | writer.save() -------------------------------------------------------------------------------- /excel/pandas_value_meets_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, 'january_2013', index_col=None) 9 | data_frame_value_meets_condition = \ 10 | data_frame[data_frame['Sale Amount'].astype(float) > 1400.0] 11 | 12 | writer = pd.ExcelWriter(output_file) 13 | data_frame_value_meets_condition.to_excel(writer, sheet_name='jan_13_output', index=False) 14 | writer.save() -------------------------------------------------------------------------------- /excel/pandas_value_in_set.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import string 4 | import sys 5 | 6 | input_file = sys.argv[1] 7 | output_file = sys.argv[2] 8 | 9 | data_frame = pd.read_excel(input_file, 'january_2013', index_col=None) 10 | 11 | important_dates = ['01/24/2013','01/31/2013'] 12 | data_frame_value_in_set = data_frame[data_frame['Purchase Date'].isin(important_dates)] 13 | 14 | writer = pd.ExcelWriter(output_file) 15 | data_frame_value_in_set.to_excel(writer, sheet_name='jan_13_output', index=False) 16 | writer.save() -------------------------------------------------------------------------------- /csv/11csv_reader_select_contiguous_rows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | row_counter = 0 9 | with open(input_file, 'r', newline='') as csv_in_file: 10 | with open(output_file, 'w', newline='') as csv_out_file: 11 | filereader = csv.reader(csv_in_file) 12 | filewriter = csv.writer(csv_out_file) 13 | for row in filereader: 14 | if row_counter >= 3 and row_counter <= 15: 15 | filewriter.writerow([value.strip() for value in row]) 16 | row_counter += 1 -------------------------------------------------------------------------------- /csv/pandas_concat_rows_from_multiple_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | all_files = glob.glob(os.path.join(input_path,'sales_*')) 11 | 12 | all_data_frames = [] 13 | for file in all_files: 14 | data_frame = pd.read_csv(file, index_col=None) 15 | all_data_frames.append(data_frame) 16 | data_frame_concat = pd.concat(all_data_frames, axis=0, ignore_index=True) 17 | 18 | data_frame_concat.to_csv(output_file, index = False) -------------------------------------------------------------------------------- /csv/12csv_reader_add_header_row.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | with open(input_file, 'r', newline='') as csv_in_file: 9 | with open(output_file, 'w', newline='') as csv_out_file: 10 | filereader = csv.reader(csv_in_file) 11 | filewriter = csv.writer(csv_out_file) 12 | header_list = ['Supplier Name', 'Invoice Number', \ 13 | 'Part Number', 'Cost', 'Purchase Date'] 14 | filewriter.writerow(header_list) 15 | for row in filereader: 16 | filewriter.writerow (row) -------------------------------------------------------------------------------- /csv/6csv_reader_column_by_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | my_columns = [0, 3] 9 | 10 | with open(input_file, 'r', newline='') as csv_in_file: 11 | with open(output_file, 'w', newline='') as csv_out_file: 12 | filereader = csv.reader(csv_in_file) 13 | filewriter = csv.writer(csv_out_file) 14 | for row_list in filereader: 15 | row_list_output = [ ] 16 | for index_value in my_columns: 17 | row_list_output.append(row_list[index_value]) 18 | filewriter.writerow(row_list_output) 
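The script above (6csv_reader_column_by_index.py) selects columns by position. When the input has a header row, the same selection can be written against column names with csv.DictReader and csv.DictWriter. This is a minimal sketch for comparison, not a file in the repository; it assumes the supplier_data.csv header shown elsewhere in this listing.

#!/usr/bin/env python3
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Columns to keep, referenced by header name instead of numeric index
my_columns = ['Invoice Number', 'Purchase Date']

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.DictReader(csv_in_file)
        filewriter = csv.DictWriter(csv_out_file, fieldnames=my_columns)
        filewriter.writeheader()
        for row in filereader:
            # Write only the named columns; the rest of the row is ignored
            filewriter.writerow({name: row[name] for name in my_columns})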
-------------------------------------------------------------------------------- /applications/historical_files/suppliers_2012.csv: -------------------------------------------------------------------------------- 1 | Item Number,Description,Supplier,Cost,Date 2 | 1234,Widget 1,Supplier A,"$1,100.00",6/2/2012 3 | ,Widget 1 Service,Supplier A,$600.00,6/3/2012 4 | 2345,Widget 2,Supplier A,"$2,300.00",6/17/2012 5 | ,Widget 2 Maintenance,Supplier A,"$1,000.00",6/30/2012 6 | 3456,Widget 3,Supplier B,$950.00,7/3/2012 7 | 4567,Widget 4,Supplier B,"$1,300.00",7/4/2012 8 | 5678,Widget 5,Supplier B,"$1,050.00",7/11/2012 9 | ,Widget 5 Service,Supplier B,$550.00,7/15/2012 10 | 6789,Widget 6,Supplier C,"$1,175.00",7/23/2012 11 | 7890,Widget 7,Supplier C,"$1,200.00",7/27/2012 12 | -------------------------------------------------------------------------------- /csv/4csv_reader_value_in_set.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | important_dates = ['1/20/14', '1/30/14'] 9 | 10 | with open(input_file, 'r', newline='') as csv_in_file: 11 | with open(output_file, 'w', newline='') as csv_out_file: 12 | filereader = csv.reader(csv_in_file) 13 | filewriter = csv.writer(csv_out_file) 14 | header = next(filereader) 15 | filewriter.writerow(header) 16 | for row_list in filereader: 17 | a_date = row_list[4] 18 | if a_date in important_dates: 19 | filewriter.writerow(row_list) -------------------------------------------------------------------------------- /csv/supplier_data.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | Supplier Y,50-9501,7009,$250.00,1/30/14 7 | Supplier Y,50-9501,7009,$250.00,1/30/14 8 | Supplier Y,50-9505,6650,$125.00,2/3/14 9 | Supplier Y,50-9505,6650,$125.00,2/3/14 10 | Supplier Z,920-4803,3321,$615.00,2/3/14 11 | Supplier Z,920-4804,3321,$615.00,2/10/14 12 | Supplier Z,920-4805,3321,$615.00,2/17/14 13 | Supplier Z,920-4806,3321,$615.00,2/24/14 14 | -------------------------------------------------------------------------------- /csv/supplier_data_no_header_row.csv: -------------------------------------------------------------------------------- 1 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 2 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 3 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 4 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 5 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 6 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 7 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 8 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 9 | Supplier Z,920-4803,3321,$615.00 ,2/3/2014 10 | Supplier Z,920-4804,3321,$615.00 ,2/10/2014 11 | Supplier Z,920-4805,3321,$615.00 ,2/17/2014 12 | Supplier Z,920-4806,3321,$615.00 ,2/24/2014 13 | -------------------------------------------------------------------------------- /csv/1csv_simple_parsing_and_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | input_file = sys.argv[1] 5 | output_file = sys.argv[2] 6 | 7 | with open(input_file, 'r', newline='') as filereader: 8 | with open(output_file, 'w', newline='') as filewriter: 9 | header = 
filereader.readline() 10 | header = header.strip() 11 | header_list = header.split(',') 12 | print(header_list) 13 | filewriter.write(','.join(map(str,header_list))+'\n') 14 | for row in filereader: 15 | row = row.strip() 16 | row_list = row.split(',') 17 | print(row_list) 18 | filewriter.write(','.join(map(str,row_list))+'\n') -------------------------------------------------------------------------------- /csv/output_files/1output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | Supplier Y,50-9501,7009,$250.00,1/30/14 7 | Supplier Y,50-9501,7009,$250.00,1/30/14 8 | Supplier Y,50-9505,6650,$125.00,2/3/14 9 | Supplier Y,50-9505,6650,$125.00,2/3/14 10 | Supplier Z,920-4803,3321,$615.00,2/3/14 11 | Supplier Z,920-4804,3321,$615.00,2/10/14 12 | Supplier Z,920-4805,3321,$615.00,2/17/14 13 | Supplier Z,920-4806,3321,$615.00,2/24/14 14 | -------------------------------------------------------------------------------- /csv/output_files/2output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | Supplier Y,50-9501,7009,$250.00,1/30/14 7 | Supplier Y,50-9501,7009,$250.00,1/30/14 8 | Supplier Y,50-9505,6650,$125.00,2/3/14 9 | Supplier Y,50-9505,6650,$125.00,2/3/14 10 | Supplier Z,920-4803,3321,$615.00,2/3/14 11 | Supplier Z,920-4804,3321,$615.00,2/10/14 12 | Supplier Z,920-4805,3321,$615.00,2/17/14 13 | Supplier Z,920-4806,3321,$615.00,2/24/14 14 | -------------------------------------------------------------------------------- /database/supplier_data.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/14 3 | Supplier X,001-1001,2341,$500.00,1/20/14 4 | Supplier X,001-1001,5467,$750.00,1/20/14 5 | Supplier X,001-1001,5467,$750.00,1/20/14 6 | Supplier Y,50-9501,7009,$250.00,1/30/14 7 | Supplier Y,50-9501,7009,$250.00,1/30/14 8 | Supplier Y,50-9505,6650,$125.00,2/3/14 9 | Supplier Y,50-9505,6650,$125.00,2/3/14 10 | Supplier Z,920-4803,3321,$615.00,2/3/14 11 | Supplier Z,920-4804,3321,$615.00,2/10/14 12 | Supplier Z,920-4805,3321,$615.00,2/17/14 13 | Supplier Z,920-4806,3321,$615.00,2/24/14 14 | -------------------------------------------------------------------------------- /excel/pandas_column_by_name_all_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, sheetname=None, index_col=None) 9 | 10 | column_output = [] 11 | for worksheet_name, data in data_frame.items(): 12 | column_output.append(data.loc[:, ['Customer Name', 'Sale Amount']]) 13 | selected_columns = pd.concat(column_output, axis=0, ignore_index=True) 14 | 15 | writer = pd.ExcelWriter(output_file) 16 | selected_columns.to_excel(writer, sheet_name='selected_columns_all_worksheets', index=False) 17 | writer.save() 
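The pandas scripts in this listing target the pandas versions available when the book was written: read_excel's keyword was then sheetname (now sheet_name), DataFrame.ix (used in csv/pandas_value_matches_pattern.py) has since been removed in favor of .loc/.iloc, and ExcelWriter.save() has been replaced by closing the writer. The following is a minimal sketch of the same all-worksheets column selection under the newer API, assuming openpyxl is installed for .xlsx files; it is offered for comparison and is not a file in the repository.

#!/usr/bin/env python3
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# sheet_name=None returns a dict mapping worksheet name -> DataFrame
all_worksheets = pd.read_excel(input_file, sheet_name=None, index_col=None)

column_output = []
for worksheet_name, data in all_worksheets.items():
    column_output.append(data.loc[:, ['Customer Name', 'Sale Amount']])
selected_columns = pd.concat(column_output, axis=0, ignore_index=True)

# The context manager writes and closes the workbook on exit,
# replacing the older writer.save() call
with pd.ExcelWriter(output_file) as writer:
    selected_columns.to_excel(writer, sheet_name='selected_columns_all_worksheets', index=False)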
-------------------------------------------------------------------------------- /excel/2excel_parsing_and_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from xlrd import open_workbook 4 | from xlwt import Workbook 5 | 6 | input_file = sys.argv[1] 7 | output_file = sys.argv[2] 8 | 9 | output_workbook = Workbook() 10 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 11 | 12 | with open_workbook(input_file) as workbook: 13 | worksheet = workbook.sheet_by_name('january_2013') 14 | for row_index in range(worksheet.nrows): 15 | for column_index in range(worksheet.ncols): 16 | output_worksheet.write(row_index, column_index, worksheet.cell_value(row_index, column_index)) 17 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /csv/output_files/11output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00,1/20/2014 3 | Supplier X,001-1001,2341,$500.00,1/20/2014 4 | Supplier X,001-1001,5467,$750.00,1/20/2014 5 | Supplier X,001-1001,5467,$750.00,1/20/2014 6 | Supplier Y,50-9501,7009,$250.00,1/30/2014 7 | Supplier Y,50-9501,7009,$250.00,1/30/2014 8 | Supplier Y,50-9505,6650,$125.00,2/3/2014 9 | Supplier Y,50-9505,6650,$125.00,2/3/2014 10 | Supplier Z,920-4803,3321,$615.00,2/3/2014 11 | Supplier Z,920-4804,3321,$615.00,2/10/2014 12 | Supplier Z,920-4805,3321,$615.00,2/17/2014 13 | Supplier Z,920-4806,3321,$615.00,2/24/2014 14 | -------------------------------------------------------------------------------- /excel/pandas_value_meets_condition_all_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | data_frame = pd.read_excel(input_file, sheetname=None, index_col=None) 9 | 10 | row_output = [] 11 | for worksheet_name, data in data_frame.items(): 12 | row_output.append(data[data['Sale Amount'].replace('$', '').replace(',', '').astype(float) > 2000.0]) 13 | filtered_rows = pd.concat(row_output, axis=0, ignore_index=True) 14 | 15 | writer = pd.ExcelWriter(output_file) 16 | filtered_rows.to_excel(writer, sheet_name='sale_amount_gt2000', index=False) 17 | writer.save() 18 | -------------------------------------------------------------------------------- /csv/output_files/pandas_output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 3 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 4 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 5 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 6 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 7 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 8 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 9 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 10 | Supplier Z,920-4803,3321,$615.00 ,2/3/2014 11 | Supplier Z,920-4804,3321,$615.00 ,2/10/2014 12 | Supplier Z,920-4805,3321,$615.00 ,2/17/2014 13 | Supplier Z,920-4806,3321,$615.00 ,2/24/2014 14 | -------------------------------------------------------------------------------- /csv/3csv_reader_value_meets_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 
import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | with open(input_file, 'r', newline='') as csv_in_file: 9 | with open(output_file, 'w', newline='') as csv_out_file: 10 | filereader = csv.reader(csv_in_file) 11 | filewriter = csv.writer(csv_out_file) 12 | header = next(filereader) 13 | filewriter.writerow(header) 14 | for row_list in filereader: 15 | supplier = str(row_list[0]).strip() 16 | cost = str(row_list[3]).strip('$').replace(',', '') 17 | if supplier == 'Supplier Z' or float(cost) > 600.0: 18 | filewriter.writerow(row_list) -------------------------------------------------------------------------------- /csv/8csv_reader_counts_for_multiple_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | 9 | file_counter = 0 10 | for input_file in glob.glob(os.path.join(input_path,'sales_*')): 11 | row_counter = 1 12 | with open(input_file, 'r', newline='') as csv_in_file: 13 | filereader = csv.reader(csv_in_file) 14 | header = next(filereader) 15 | for row in filereader: 16 | row_counter += 1 17 | print('{0!s}: \t{1:d} rows \t{2:d} columns'.format(\ 18 | os.path.basename(input_file), row_counter, len(header))) 19 | file_counter += 1 20 | print('Number of files: {0:d}'.format(file_counter)) -------------------------------------------------------------------------------- /csv/5csv_reader_value_matches_pattern.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import re 4 | import sys 5 | 6 | input_file = sys.argv[1] 7 | output_file = sys.argv[2] 8 | 9 | pattern = re.compile(r'(?P^001-.*)', re.I) 10 | 11 | with open(input_file, 'r', newline='') as csv_in_file: 12 | with open(output_file, 'w', newline='') as csv_out_file: 13 | filereader = csv.reader(csv_in_file) 14 | filewriter = csv.writer(csv_out_file) 15 | header = next(filereader) 16 | filewriter.writerow(header) 17 | for row_list in filereader: 18 | invoice_number = row_list[1] 19 | if pattern.search(invoice_number): 20 | filewriter.writerow(row_list) -------------------------------------------------------------------------------- /csv/output_files/12output.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 3 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 4 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 5 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 6 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 7 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 8 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 9 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 10 | Supplier Z,920-4803,3321,$615.00 ,2/3/2014 11 | Supplier Z,920-4804,3321,$615.00 ,2/10/2014 12 | Supplier Z,920-4805,3321,$615.00 ,2/17/2014 13 | Supplier Z,920-4806,3321,$615.00 ,2/24/2014 14 | -------------------------------------------------------------------------------- /database/supplier_data_for_mysql_database.csv: -------------------------------------------------------------------------------- 1 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 2 | Supplier X,001-1001,2341,500.00,2014-01-20 3 | Supplier X,001-1001,2341,500.00,2014-01-20 4 | Supplier X,001-1001,5467,750.00,2014-01-20 5 | Supplier X,001-1001,5467,750.00,2014-01-20 6 | Supplier 
Y,50-9501,7009,250.00,2014-01-30 7 | Supplier Y,50-9501,7009,250.00,2014-01-30 8 | Supplier Y,50-9505,6650,125.00,2014-02-03 9 | Supplier Y,50-9505,6650,125.00,2014-02-03 10 | Supplier Z,920-4803,3321,615.00,2014-02-03 11 | Supplier Z,920-4804,3321,615.00,2014-02-10 12 | Supplier Z,920-4805,3321,615.00,2014-02-17 13 | Supplier Z,920-4806,3321,615.00,2014-02-24 14 | -------------------------------------------------------------------------------- /excel/12excel_introspect_all_workbooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import os 4 | import sys 5 | from xlrd import open_workbook 6 | 7 | input_directory = sys.argv[1] 8 | 9 | workbook_counter = 0 10 | for input_file in glob.glob(os.path.join(input_directory, '*.xls*')): 11 | workbook = open_workbook(input_file) 12 | print('Workbook: {}'.format(os.path.basename(input_file))) 13 | print('Number of worksheets: {}'.format(workbook.nsheets)) 14 | for worksheet in workbook.sheets(): 15 | print('Worksheet name:', worksheet.name, '\tRows:',\ 16 | worksheet.nrows, '\tColumns:', worksheet.ncols) 17 | workbook_counter += 1 18 | print('Number of Excel workbooks: {}'.format(workbook_counter)) -------------------------------------------------------------------------------- /excel/pandas_value_meets_condition_set_of_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | my_sheets = [0,1] 9 | threshold = 1900.0 10 | 11 | data_frame = pd.read_excel(input_file, sheetname=my_sheets, index_col=None) 12 | 13 | row_list = [] 14 | for worksheet_name, data in data_frame.items(): 15 | row_list.append(data[data['Sale Amount'].replace('$', '').replace(',', '').astype(float) > threshold]) 16 | filtered_rows = pd.concat(row_list, axis=0, ignore_index=True) 17 | 18 | writer = pd.ExcelWriter(output_file) 19 | filtered_rows.to_excel(writer, sheet_name='set_of_worksheets', index=False) 20 | writer.save() 21 | -------------------------------------------------------------------------------- /excel/pandas_concat_data_from_multiple_workbooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | all_workbooks = glob.glob(os.path.join(input_path,'*.xls*')) 11 | data_frames = [] 12 | for workbook in all_workbooks: 13 | all_worksheets = pd.read_excel(workbook, sheetname=None, index_col=None) 14 | for worksheet_name, data in all_worksheets.items(): 15 | data_frames.append(data) 16 | all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True) 17 | 18 | writer = pd.ExcelWriter(output_file) 19 | all_data_concatenated.to_excel(writer, sheet_name='all_data_all_workbooks', index=False) 20 | writer.save() -------------------------------------------------------------------------------- /plots/matplotlib_basic_bar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import matplotlib.pyplot as plt 3 | plt.style.use('ggplot') 4 | 5 | customers = ['ABC', 'DEF', 'GHI', 'JKL', 'MNO'] 6 | customers_index = range(len(customers)) 7 | sale_amounts = [127, 90, 201, 111, 232] 8 | 9 | fig = plt.figure() 10 | ax1 = fig.add_subplot(1,1,1) 11 | 
ax1.bar(customers_index, sale_amounts, align='center', color='darkblue') 12 | ax1.xaxis.set_ticks_position('bottom') 13 | ax1.yaxis.set_ticks_position('left') 14 | plt.xticks(customers_index, customers, rotation=0, fontsize='small') 15 | 16 | plt.xlabel('Customer Name') 17 | plt.ylabel('Sale Amount') 18 | plt.title('Sale Amount per Customer') 19 | 20 | plt.savefig('bar_plot.png', dpi=400, bbox_inches='tight') 21 | plt.show() -------------------------------------------------------------------------------- /csv/9csv_reader_concat_rows_from_multiple_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | first_file = True 11 | for input_file in glob.glob(os.path.join(input_path,'sales_*')): 12 | print(os.path.basename(input_file)) 13 | with open(input_file, 'r', newline='') as csv_in_file: 14 | with open(output_file, 'a', newline='') as csv_out_file: 15 | filereader = csv.reader(csv_in_file) 16 | filewriter = csv.writer(csv_out_file) 17 | if first_file: 18 | for row in filereader: 19 | filewriter.writerow(row) 20 | first_file = False 21 | else: 22 | header = next(filereader) 23 | for row in filereader: 24 | filewriter.writerow(row) -------------------------------------------------------------------------------- /plots/matplotlib_basic_histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | plt.style.use('ggplot') 5 | 6 | mu1, mu2, sigma = 100, 130, 15 7 | x1 = mu1 + sigma*np.random.randn(10000) 8 | x2 = mu2 + sigma*np.random.randn(10000) 9 | 10 | fig = plt.figure() 11 | ax1 = fig.add_subplot(1,1,1) 12 | n, bins, patches = ax1.hist(x1, bins=50, normed=False, color='darkgreen') 13 | n, bins, patches = ax1.hist(x2, bins=50, normed=False, color='orange', alpha=0.5) 14 | ax1.xaxis.set_ticks_position('bottom') 15 | ax1.yaxis.set_ticks_position('left') 16 | 17 | plt.xlabel('Bins') 18 | plt.ylabel('Number of Values in Bin') 19 | fig.suptitle('Histograms', fontsize=14, fontweight='bold') 20 | ax1.set_title('Two Frequency Distributions') 21 | 22 | plt.savefig('histogram.png', dpi=400, bbox_inches='tight') 23 | plt.show() -------------------------------------------------------------------------------- /database/5db_mysql_write_to_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import MySQLdb 4 | import sys 5 | 6 | # Path to and name of a CSV output file 7 | output_file = sys.argv[1] 8 | 9 | # Connect to a MySQL database 10 | con = MySQLdb.connect(host='localhost', port=3306, db='my_suppliers', \ 11 | user='root', passwd='my_password') 12 | c = con.cursor() 13 | 14 | # Create a file writer object and write the header row 15 | filewriter = csv.writer(open(output_file, 'w', newline=''), delimiter=',') 16 | header = ['Supplier Name','Invoice Number','Part Number','Cost','Purchase Date'] 17 | filewriter.writerow(header) 18 | 19 | # Query the Suppliers table and write the output to a CSV file 20 | c.execute("""SELECT * 21 | FROM Suppliers 22 | WHERE Cost > 700.0;""") 23 | rows = c.fetchall() 24 | for row in rows: 25 | filewriter.writerow(row) 26 | -------------------------------------------------------------------------------- /csv/7csv_reader_column_by_name.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | output_file = sys.argv[2] 7 | 8 | my_columns = ['Invoice Number', 'Purchase Date'] 9 | my_columns_index = [] 10 | 11 | with open(input_file, 'r', newline='') as csv_in_file: 12 | with open(output_file, 'w', newline='') as csv_out_file: 13 | filereader = csv.reader(csv_in_file) 14 | filewriter = csv.writer(csv_out_file) 15 | header = next(filereader) 16 | for index_value in range(len(header)): 17 | if header[index_value] in my_columns: 18 | my_columns_index.append(index_value) 19 | filewriter.writerow(my_columns) 20 | for row_list in filereader: 21 | row_list_output = [ ] 22 | for index_value in my_columns_index: 23 | row_list_output.append(row_list[index_value]) 24 | filewriter.writerow(row_list_output) -------------------------------------------------------------------------------- /csv/supplier_data_unnecessary_header_footer.csv: -------------------------------------------------------------------------------- 1 | I don't care about this row,,,, 2 | I don't care about this row,,,, 3 | I don't care about this row,,,, 4 | Supplier Name,Invoice Number,Part Number,Cost,Purchase Date 5 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 6 | Supplier X,001-1001,2341,$500.00 ,1/20/2014 7 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 8 | Supplier X,001-1001,5467,$750.00 ,1/20/2014 9 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 10 | Supplier Y,50-9501,7009,$250.00 ,1/30/2014 11 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 12 | Supplier Y,50-9505,6650,$125.00 ,2/3/2014 13 | Supplier Z,920-4803,3321,$615.00 ,2/3/2014 14 | Supplier Z,920-4804,3321,$615.00 ,2/10/2014 15 | Supplier Z,920-4805,3321,$615.00 ,2/17/2014 16 | Supplier Z,920-4806,3321,$615.00 ,2/24/2014 17 | I don't want this row either,,,, 18 | I don't want this row either,,,, 19 | I don't want this row either,,,, 20 | -------------------------------------------------------------------------------- /plots/matplotlib_basic_scatter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | plt.style.use('ggplot') 5 | 6 | x = np.arange(start=1., stop=15., step=1.) 7 | y_linear = x + 5. * np.random.randn(14.) 8 | y_quadratic = x**2 + 10. * np.random.randn(14.) 9 | 10 | fn_linear = np.poly1d(np.polyfit(x, y_linear, deg=1)) 11 | fn_quadratic = np.poly1d(np.polyfit(x, y_quadratic, deg=2)) 12 | 13 | fig = plt.figure() 14 | ax1 = fig.add_subplot(1,1,1) 15 | ax1.plot(x, y_linear, 'bo', x, y_quadratic, 'go', \ 16 | x, fn_linear(x), 'b-', x, fn_quadratic(x), 'g-', linewidth=2.) 
17 | ax1.xaxis.set_ticks_position('bottom') 18 | ax1.yaxis.set_ticks_position('left') 19 | 20 | ax1.set_title('Scatter Plots with Best Fit Lines') 21 | plt.xlabel('x') 22 | plt.ylabel('f(x)') 23 | plt.xlim((min(x)-1., max(x)+1.)) 24 | plt.ylim((min(y_quadratic)-10., max(y_quadratic)+10.)) 25 | 26 | plt.savefig('scatter_plot.png', dpi=400, bbox_inches='tight') 27 | plt.show() -------------------------------------------------------------------------------- /csv/output_files/9output.csv: -------------------------------------------------------------------------------- 1 | Customer ID,Customer Name,Invoice Number,Sale Amount,Purchase Date 2 | 9876,Daniel Farber,100-0008,"$1,115.00",2/2/14 3 | 8765,Laney Stone,100-0009,"$1,367.00",2/8/14 4 | 7654,Roger Lipney,100-0010,"$2,135.00",2/15/14 5 | 6543,Thomas Haines,100-0011,"$1,346.00",2/17/14 6 | 5432,Anushka Vaz,100-0012,"$1,560.00",2/21/14 7 | 4321,Harriet Cooper,100-0013,"$1,852.00",2/25/14 8 | 1234,John Smith,100-0002,"$1,200.00",1/1/14 9 | 2345,Mary Harrison,100-0003,"$1,425.00",1/6/14 10 | 3456,Lucy Gomez,100-0004,"$1,390.00",1/11/14 11 | 4567,Rupert Jones,100-0005,"$1,257.00",1/18/14 12 | 5678,Jenny Walters,100-0006,"$1,725.00",1/24/14 13 | 6789,Samantha Donaldson,100-0007,"$1,995.00",1/31/14 14 | 1234,John Smith,100-0014,"$1,350.00",3/4/14 15 | 8765,Tony Song,100-0015,"$1,167.00",3/8/14 16 | 2345,Mary Harrison,100-0016,"$1,789.00",3/17/14 17 | 6543,Rachel Paz,100-0017,"$2,042.00",3/22/14 18 | 3456,Lucy Gomez,100-0018,"$1,511.00",3/28/14 19 | 4321,Susan Wallace,100-0019,"$2,280.00",3/30/14 20 | -------------------------------------------------------------------------------- /plots/matplotlib_basic_line.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from numpy.random import randn 3 | import matplotlib.pyplot as plt 4 | plt.style.use('ggplot') 5 | 6 | plot_data1 = randn(50).cumsum() 7 | plot_data2 = randn(50).cumsum() 8 | plot_data3 = randn(50).cumsum() 9 | plot_data4 = randn(50).cumsum() 10 | 11 | fig = plt.figure() 12 | ax1 = fig.add_subplot(1,1,1) 13 | ax1.plot(plot_data1, marker=r'o', color=u'blue', linestyle='-', label='Blue Solid') 14 | ax1.plot(plot_data2, marker=r'+', color=u'red', linestyle='--', label='Red Dashed') 15 | ax1.plot(plot_data3, marker=r'*', color=u'green', linestyle='-.', label='Green Dash Dot') 16 | ax1.plot(plot_data4, marker=r's', color=u'orange', linestyle=':', label='Orange Dotted') 17 | ax1.xaxis.set_ticks_position('bottom') 18 | ax1.yaxis.set_ticks_position('left') 19 | 20 | ax1.set_title('Line Plots: Markers, Colors, and Linestyles') 21 | plt.xlabel('Draw') 22 | plt.ylabel('Random Number') 23 | plt.legend(loc='best') 24 | 25 | plt.savefig('line_plot.png', dpi=400, bbox_inches='tight') 26 | plt.show() -------------------------------------------------------------------------------- /plots/matplotlib_basic_boxplot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | plt.style.use('ggplot') 5 | 6 | N = 500 7 | normal = np.random.normal(loc=0.0, scale=1.0, size=N) 8 | lognormal = np.random.lognormal(mean=0.0, sigma=1.0, size=N) 9 | index_value = np.random.random_integers(low=0, high=N-1, size=N) 10 | normal_sample = normal[index_value] 11 | lognormal_sample = lognormal[index_value] 12 | box_plot_data = [normal,normal_sample,lognormal,lognormal_sample] 13 | 14 | fig = plt.figure() 15 | ax1 = fig.add_subplot(1,1,1) 16 | 17 | 
box_labels = ['normal','normal_sample','lognormal','lognormal_sample'] 18 | ax1.boxplot(box_plot_data, notch=False, sym='.', vert=True, whis=1.5, \ 19 | showmeans=True, labels=box_labels) 20 | ax1.xaxis.set_ticks_position('bottom') 21 | ax1.yaxis.set_ticks_position('left') 22 | ax1.set_title('Box Plots: Resampling of Two Distributions') 23 | ax1.set_xlabel('Distribution') 24 | ax1.set_ylabel('Value') 25 | 26 | plt.savefig('box_plot.png', dpi=400, bbox_inches='tight') 27 | plt.show() -------------------------------------------------------------------------------- /database/6db_mysql_update_from_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import MySQLdb 4 | import sys 5 | 6 | # Path to and name of a CSV input file 7 | input_file = sys.argv[1] 8 | 9 | # Connect to a MySQL database 10 | con = MySQLdb.connect(host='localhost', port=3306, db='my_suppliers', \ 11 | user='root', passwd='my_password') 12 | c = con.cursor() 13 | 14 | # Read the CSV file and update the specific rows 15 | file_reader = csv.reader(open(input_file, 'r', newline=''), delimiter=',') 16 | header = next(file_reader, None) 17 | for row in file_reader: 18 | data = [] 19 | for column_index in range(len(header)): 20 | data.append(str(row[column_index]).strip()) 21 | print(data) 22 | c.execute("""UPDATE Suppliers SET Cost=%s, Purchase_Date=%s WHERE Supplier_Name=%s;""", data) 23 | con.commit() 24 | 25 | # Query the Suppliers table 26 | c.execute("SELECT * FROM Suppliers") 27 | rows = c.fetchall() 28 | for row in rows: 29 | output = [] 30 | for column_index in range(len(row)): 31 | output.append(str(row[column_index])) 32 | print(output) 33 | -------------------------------------------------------------------------------- /csv/pandas_sum_average_from_multiple_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | all_files = glob.glob(os.path.join(input_path,'sales_*')) 11 | all_data_frames = [] 12 | for input_file in all_files: 13 | data_frame = pd.read_csv(input_file, index_col=None) 14 | 15 | total_sales = pd.DataFrame([float(str(value).strip('$').replace(',','')) \ 16 | for value in data_frame.loc[:, 'Sale Amount']]).sum() 17 | 18 | average_sales = pd.DataFrame([float(str(value).strip('$').replace(',','')) \ 19 | for value in data_frame.loc[:, 'Sale Amount']]).mean() 20 | 21 | data = {'file_name': os.path.basename(input_file), 22 | 'total_sales': total_sales, 23 | 'average_sales': average_sales} 24 | 25 | all_data_frames.append(pd.DataFrame(data, columns=['file_name', 'total_sales', 'average_sales'])) 26 | 27 | data_frames_concat = pd.concat(all_data_frames, axis=0, ignore_index=True) 28 | 29 | data_frames_concat.to_csv(output_file, index = False) -------------------------------------------------------------------------------- /database/1db_count_rows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sqlite3 3 | 4 | # Create an in-memory SQLite3 database 5 | # Create a table called sales with four attributes 6 | con = sqlite3.connect(':memory:') 7 | query = """CREATE TABLE sales 8 | (customer VARCHAR(20), 9 | product VARCHAR(40), 10 | amount FLOAT, 11 | date DATE);""" 12 | con.execute(query) 13 | con.commit() 14 | 15 | # Insert a few rows of data into the table 16 
| data = [('Richard Lucas', 'Notepad', 2.50, '2014-01-02'), 17 | ('Jenny Kim', 'Binder', 4.15, '2014-01-15'), 18 | ('Svetlana Crow', 'Printer', 155.75, '2014-02-03'), 19 | ('Stephen Randolph', 'Computer', 679.40, '2014-02-20')] 20 | statement = "INSERT INTO sales VALUES(?, ?, ?, ?)" 21 | con.executemany(statement, data) 22 | con.commit() 23 | 24 | # Query the sales table 25 | cursor = con.execute("SELECT * FROM sales") 26 | rows = cursor.fetchall() 27 | 28 | # Count the number of rows in the output 29 | row_counter = 0 30 | for row in rows: 31 | print(row) 32 | row_counter += 1 33 | print('Number of rows: {}'.format(row_counter)) 34 | -------------------------------------------------------------------------------- /plots/ggplot_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from ggplot import * 3 | """ 4 | print(mtcars.head()) 5 | plt1 = ggplot(aes(x='mpg'), data=mtcars) +\ 6 | geom_histogram(fill='darkblue', binwidth=2) +\ 7 | xlim(10, 35) + ylim(0, 10) +\ 8 | xlab("MPG") + ylab("Frequency") +\ 9 | ggtitle("Histogram of MPG") +\ 10 | theme_matplotlib() 11 | print(plt1) 12 | 13 | print(meat.head()) 14 | plt2 = ggplot(aes(x='date', y='beef'), data=meat) +\ 15 | geom_line(color='purple', size=1.5, alpha=0.75) +\ 16 | stat_smooth(colour='blue', size=2.0, span=0.15) +\ 17 | xlab("Year") + ylab("Head of Cattle Slaughtered") +\ 18 | ggtitle("Beef Consumption Over Time") +\ 19 | theme_seaborn() 20 | print(plt2) 21 | """ 22 | print(diamonds.head()) 23 | plt3 = ggplot(diamonds, aes(x='carat', y='price', colour='cut')) +\ 24 | geom_point(alpha=0.5) +\ 25 | scale_color_gradient(low='#05D9F6', high='#5011D1') +\ 26 | xlim(0, 6) + ylim(0, 20000) +\ 27 | xlab("Carat") + ylab("Price") +\ 28 | ggtitle("Diamond Price by Carat and Cut") +\ 29 | theme_gray() 30 | print(plt3) 31 | 32 | ggsave(plt3, "ggplot_plots.png") -------------------------------------------------------------------------------- /csv/10csv_reader_sum_average_from_multiple_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import glob 4 | import os 5 | import string 6 | import sys 7 | 8 | input_path = sys.argv[1] 9 | output_file = sys.argv[2] 10 | 11 | output_header_list = ['file_name', 'total_sales', 'average_sales'] 12 | 13 | csv_out_file = open(output_file, 'a', newline='') 14 | filewriter = csv.writer(csv_out_file) 15 | filewriter.writerow(output_header_list) 16 | 17 | for input_file in glob.glob(os.path.join(input_path,'sales_*')): 18 | with open(input_file, 'r', newline='') as csv_in_file: 19 | filereader = csv.reader(csv_in_file) 20 | output_list = [ ] 21 | output_list.append(os.path.basename(input_file)) 22 | header = next(filereader) 23 | total_sales = 0.0 24 | number_of_sales = 0.0 25 | for row in filereader: 26 | sale_amount = row[3] 27 | total_sales += float(str(sale_amount).strip('$').replace(',','')) 28 | number_of_sales += 1.0 29 | average_sales = '{0:.2f}'.format(total_sales / number_of_sales) 30 | output_list.append(total_sales) 31 | output_list.append(average_sales) 32 | filewriter.writerow(output_list) 33 | csv_out_file.close() 34 | -------------------------------------------------------------------------------- /excel/3excel_parsing_and_write_keep_dates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | 
from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 12 | 13 | with open_workbook(input_file) as workbook: 14 | worksheet = workbook.sheet_by_name('january_2013') 15 | for row_index in range(worksheet.nrows): 16 | row_list_output = [] 17 | for col_index in range(worksheet.ncols): 18 | if worksheet.cell_type(row_index, col_index) == 3: 19 | date_cell = xldate_as_tuple(worksheet.cell_value\ 20 | (row_index, col_index),workbook.datemode) 21 | date_cell = date(*date_cell[0:3]).strftime\ 22 | ('%m/%d/%Y') 23 | row_list_output.append(date_cell) 24 | output_worksheet.write(row_index, col_index, date_cell) 25 | else: 26 | non_date_cell = worksheet.cell_value\ 27 | (row_index,col_index) 28 | row_list_output.append(non_date_cell) 29 | output_worksheet.write(row_index, col_index,\ 30 | non_date_cell) 31 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /applications/3parse_text_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | 4 | input_file = sys.argv[1] 5 | output_file = sys.argv[2] 6 | 7 | messages = {} 8 | notes = [] 9 | with open(input_file, 'r', newline='') as text_file: 10 | for row in text_file: 11 | if '[Note]' in row: 12 | row_list = row.split(' ', 4) 13 | day = row_list[0].strip() 14 | note = row_list[4].strip('\n').strip() 15 | if note not in notes: 16 | notes.append(note) 17 | if day not in messages: 18 | messages[day] = {} 19 | if note not in messages[day]: 20 | messages[day][note] = 1 21 | else: 22 | messages[day][note] += 1 23 | 24 | filewriter = open(output_file, 'w', newline='') 25 | header = ['Date'] 26 | header.extend(notes) 27 | header = ','.join(map(str,header)) + '\n' 28 | print(header) 29 | filewriter.write(header) 30 | for day, day_value in messages.items(): 31 | row_of_output = [] 32 | row_of_output.append(day) 33 | for index in range(len(notes)): 34 | if notes[index] in day_value.keys(): 35 | row_of_output.append(day_value[notes[index]]) 36 | else: 37 | row_of_output.append(0) 38 | output = ','.join(map(str,row_of_output)) + '\n' 39 | print(output) 40 | filewriter.write(output) 41 | filewriter.close() -------------------------------------------------------------------------------- /excel/7excel_column_by_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 12 | 13 | my_columns = [1, 4] 14 | 15 | with open_workbook(input_file) as workbook: 16 | worksheet = workbook.sheet_by_name('january_2013') 17 | data = [] 18 | for row_index in range(worksheet.nrows): 19 | row_list = [] 20 | for column_index in my_columns: 21 | cell_value = worksheet.cell_value(row_index,column_index) 22 | cell_type = worksheet.cell_type(row_index, column_index) 23 | if cell_type == 3: 24 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 25 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 26 | row_list.append(date_cell) 27 | else: 28 | row_list.append(cell_value) 29 | data.append(row_list) 30 | 31 | for list_index, output_list in 
enumerate(data): 32 | for element_index, element in enumerate(output_list): 33 | output_worksheet.write(list_index, element_index, element) 34 | 35 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /database/2db_insert_rows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sqlite3 4 | import sys 5 | 6 | # Path to and name of a CSV input file 7 | input_file = sys.argv[1] 8 | 9 | # Create an in-memory SQLite3 database 10 | # Create a table called Suppliers with five attributes 11 | con = sqlite3.connect('Suppliers.db') 12 | c = con.cursor() 13 | create_table = """CREATE TABLE IF NOT EXISTS Suppliers 14 | (Supplier_Name VARCHAR(20), 15 | Invoice_Number VARCHAR(20), 16 | Part_Number VARCHAR(20), 17 | Cost FLOAT, 18 | Purchase_Date DATE);""" 19 | c.execute(create_table) 20 | con.commit() 21 | 22 | # Read the CSV file 23 | # Insert the data into the Suppliers table 24 | file_reader = csv.reader(open(input_file, 'r'), delimiter=',') 25 | header = next(file_reader, None) 26 | for row in file_reader: 27 | data = [] 28 | for column_index in range(len(header)): 29 | data.append(row[column_index]) 30 | print(data) 31 | c.execute("INSERT INTO Suppliers VALUES (?, ?, ?, ?, ?);", data) 32 | con.commit() 33 | 34 | # Query the Suppliers table 35 | output = c.execute("SELECT * FROM Suppliers") 36 | rows = output.fetchall() 37 | for row in rows: 38 | output = [] 39 | for column_index in range(len(row)): 40 | output.append(str(row[column_index])) 41 | print(output) 42 | -------------------------------------------------------------------------------- /plots/pandas_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | plt.style.use('ggplot') 6 | 7 | fig, axes = plt.subplots(nrows=1, ncols=2) 8 | ax1, ax2 = axes.ravel() 9 | 10 | data_frame = pd.DataFrame(np.random.rand(5, 3), 11 | index=['Customer 1', 'Customer 2', 'Customer 3', 'Customer 4', 'Customer 5'], 12 | columns=pd.Index(['Metric 1', 'Metric 2', 'Metric 3'], name='Metrics')) 13 | 14 | data_frame.plot(kind='bar', ax=ax1, alpha=0.75, title='Bar Plot') 15 | plt.setp(ax1.get_xticklabels(), rotation=45, fontsize=10) 16 | plt.setp(ax1.get_yticklabels(), rotation=0, fontsize=10) 17 | ax1.set_xlabel('Customer') 18 | ax1.set_ylabel('Value') 19 | ax1.xaxis.set_ticks_position('bottom') 20 | ax1.yaxis.set_ticks_position('left') 21 | 22 | colors = dict(boxes='DarkBlue', whiskers='Gray', medians='Red', caps='Black') 23 | data_frame.plot(kind='box', color=colors, sym='r.', ax=ax2, title='Box Plot') 24 | plt.setp(ax2.get_xticklabels(), rotation=45, fontsize=10) 25 | plt.setp(ax2.get_yticklabels(), rotation=0, fontsize=10) 26 | ax2.set_xlabel('Metric') 27 | ax2.set_ylabel('Value') 28 | ax2.xaxis.set_ticks_position('bottom') 29 | ax2.yaxis.set_ticks_position('left') 30 | 31 | plt.savefig('pandas_plots.png', dpi=400, bbox_inches='tight') 32 | plt.show() -------------------------------------------------------------------------------- /applications/customer_category_history.csv: -------------------------------------------------------------------------------- 1 | Customer Name,Category,Price,Date,,,, 2 | John Smith,Bronze,$20.00,1/22/2014,,,, 3 | John Smith,Bronze,$25.00,3/15/2014,,,, 4 | John Smith,Silver,$30.00,4/2/2014,,,, 5 | John Smith,Gold,$40.00,5/11/2014,,,, 6 | John 
Smith,Gold,$45.00,7/13/2014,,,, 7 | Mary Yu,Silver,$30.00,2/3/2014,,,, 8 | Mary Yu,Gold,$40.00,4/16/2014,,,, 9 | Mary Yu,Gold,$45.00,6/23/2014,,,, 10 | Wayne Thompson,Bronze,$20.00,1/13/2014,,,, 11 | Wayne Thompson,Bronze,$25.00,3/24/2014,,,, 12 | Wayne Thompson,Bronze,$30.00,5/21/2014,,,, 13 | Wayne Thompson,Silver,$30.00,6/29/2014,,,, 14 | Bruce Johnson,Bronze,$20.00,2/9/2014,,,, 15 | Bruce Johnson,Bronze,$25.00,3/22/2014,,,, 16 | Bruce Johnson,Silver,$30.00,4/27/2014,,,, 17 | Bruce Johnson,Silver,$35.00,5/8/2014,,,, 18 | Bruce Johnson,Gold,$40.00,6/26/2014,,,, 19 | Bruce Johnson,Gold,$45.00,7/21/2014,,,, 20 | Annie Lee,Bronze,$20.00,3/16/2014,,,, 21 | Annie Lee,Silver,$30.00,4/11/2014,,,, 22 | Annie Lee,Gold,$40.00,5/25/2014,,,, 23 | Annie Lee,Gold,$45.00,7/14/2014,,,, 24 | Annie Lee,Gold,$50.00,7/21/2014,,,, 25 | Priya Patel,Silver,$30.00,1/19/2014,,,, 26 | Priya Patel,Silver,$35.00,2/28/2014,,,, 27 | Priya Patel,Silver,$40.00,3/26/2014,,,, 28 | Priya Patel,Gold,$40.00,4/28/2014,,,, 29 | Priya Patel,Gold,$45.00,5/12/2014,,,, 30 | Priya Patel,Gold,$50.00,6/21/2014,,,, 31 | -------------------------------------------------------------------------------- /database/4db_mysql_load_from_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import MySQLdb 4 | import sys 5 | from datetime import datetime, date 6 | 7 | # Path to and name of a CSV input file 8 | input_file = sys.argv[1] 9 | 10 | # Connect to a MySQL database 11 | con = MySQLdb.connect(host='localhost', port=3306, db='my_suppliers', user='python_training', passwd='python_training') 12 | c = con.cursor() 13 | 14 | # Read the CSV file 15 | # Insert the data into the Suppliers table 16 | file_reader = csv.reader(open(input_file, 'r'), delimiter=',') 17 | header = next(file_reader) 18 | for row in file_reader: 19 | data = [] 20 | for column_index in range(len(header)): 21 | if column_index < 4: 22 | data.append(str(row[column_index]).lstrip('$')\ 23 | .replace(',', '').strip()) 24 | else: 25 | a_date = datetime.date(datetime.strptime(\ 26 | str(row[column_index]), '%m/%d/%Y')) 27 | # %Y: year is 2016; %y: year is 15 28 | a_date = a_date.strftime('%Y-%m-%d') 29 | data.append(a_date) 30 | print(data) 31 | c.execute("""INSERT INTO Suppliers VALUES (%s, %s, %s, %s, %s);""", data) 32 | con.commit() 33 | 34 | # Query the Suppliers table 35 | c.execute("SELECT * FROM Suppliers") 36 | rows = c.fetchall() 37 | for row in rows: 38 | row_list_output = [] 39 | for column_index in range(len(row)): 40 | row_list_output.append(str(row[column_index])) 41 | print(row_list_output) 42 | -------------------------------------------------------------------------------- /excel/4excel_value_meets_condition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 12 | 13 | sale_amount_column_index = 3 14 | with open_workbook(input_file) as workbook: 15 | worksheet = workbook.sheet_by_name('january_2013') 16 | data = [] 17 | header = worksheet.row_values(0) 18 | data.append(header) 19 | for row_index in range(1,worksheet.nrows): 20 | row_list = [] 21 | sale_amount = worksheet.cell_value(row_index, sale_amount_column_index) 22 | if 
sale_amount > 1400.0: 23 | for column_index in range(worksheet.ncols): 24 | cell_value = worksheet.cell_value(row_index,column_index) 25 | cell_type = worksheet.cell_type(row_index, column_index) 26 | if cell_type == 3: 27 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 28 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 29 | row_list.append(date_cell) 30 | else: 31 | row_list.append(cell_value) 32 | if row_list: 33 | data.append(row_list) 34 | 35 | for list_index, output_list in enumerate(data): 36 | for element_index, element in enumerate(output_list): 37 | output_worksheet.write(list_index, element_index, element) 38 | 39 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /excel/8excel_column_by_name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 12 | 13 | my_columns = ['Customer ID', 'Purchase Date'] 14 | 15 | with open_workbook(input_file) as workbook: 16 | worksheet = workbook.sheet_by_name('january_2013') 17 | data = [my_columns] 18 | header_list = worksheet.row_values(0) 19 | header_index_list = [] 20 | for header_index in range(len(header_list)): 21 | if header_list[header_index] in my_columns: 22 | header_index_list.append(header_index) 23 | for row_index in range(1,worksheet.nrows): 24 | row_list = [] 25 | for column_index in header_index_list: 26 | cell_value = worksheet.cell_value(row_index,column_index) 27 | cell_type = worksheet.cell_type(row_index, column_index) 28 | if cell_type == 3: 29 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 30 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 31 | row_list.append(date_cell) 32 | else: 33 | row_list.append(cell_value) 34 | data.append(row_list) 35 | 36 | for list_index, output_list in enumerate(data): 37 | for element_index, element in enumerate(output_list): 38 | output_worksheet.write(list_index, element_index, element) 39 | 40 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /excel/6excel_value_matches_pattern.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import sys 4 | from datetime import date 5 | from xlrd import open_workbook, xldate_as_tuple 6 | from xlwt import Workbook 7 | 8 | input_file = sys.argv[1] 9 | output_file = sys.argv[2] 10 | 11 | output_workbook = Workbook() 12 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 13 | 14 | pattern = re.compile(r'(?P^J.*)') 15 | 16 | customer_name_column_index = 1 17 | with open_workbook(input_file) as workbook: 18 | worksheet = workbook.sheet_by_name('january_2013') 19 | data = [] 20 | header = worksheet.row_values(0) 21 | data.append(header) 22 | for row_index in range(1, worksheet.nrows): 23 | row_list = [] 24 | if pattern.search(worksheet.cell_value(row_index, customer_name_column_index)): 25 | for column_index in range(worksheet.ncols): 26 | cell_value = worksheet.cell_value(row_index,column_index) 27 | cell_type = worksheet.cell_type(row_index, column_index) 28 | if cell_type == 3: 29 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 30 | 
date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 31 | row_list.append(date_cell) 32 | else: 33 | row_list.append(cell_value) 34 | if row_list: 35 | data.append(row_list) 36 | 37 | for list_index, output_list in enumerate(data): 38 | for element_index, element in enumerate(output_list): 39 | output_worksheet.write(list_index, element_index, element) 40 | 41 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /applications/3parse_text_file_skip_first_space.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import string 3 | import sys 4 | 5 | input_file = sys.argv[1] 6 | #output_file = sys.argv[2] 7 | 8 | messages = {} 9 | notes = [] 10 | with open(input_file, 'rU') as text_file: 11 | for row in text_file: 12 | if '[Note]' in row: 13 | n = 2 14 | groups = row.split(' ') 15 | date_time = ' '.join(groups[:n]) 16 | rest_of_line_string = ' '.join(groups[n:]) 17 | rest_of_line_list = rest_of_line_string.split(' ', 2) 18 | note = rest_of_line_list[2].strip('\n').strip() 19 | row_list = [] 20 | row_list.append(date_time) 21 | row_list.append(note) 22 | print row_list 23 | 24 | day = row_list[0] 25 | note = row_list[1] 26 | if note not in notes: 27 | notes.append(note) 28 | if day not in messages: 29 | messages[day] = {} 30 | if note not in messages[day]: 31 | messages[day][note] = 1 32 | else: 33 | messages[day][note] += 1 34 | 35 | #filewriter = open(output_file, 'wb') 36 | header = ['Date'] 37 | header.extend(notes) 38 | header = ','.join(map(str,header)) + '\n' 39 | print header 40 | #filewriter.write(header) 41 | for day, day_value in messages.items(): 42 | row_of_output = [] 43 | row_of_output.append(day) 44 | for index in range(len(notes)): 45 | if notes[index] in day_value.keys(): 46 | row_of_output.append(day_value[notes[index]]) 47 | else: 48 | row_of_output.append(0) 49 | output = ','.join(map(str,row_of_output)) + '\n' 50 | print output 51 | #filewriter.write(output) 52 | #filewriter.close() -------------------------------------------------------------------------------- /excel/13excel_concat_data_from_multiple_workbooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import os 4 | import sys 5 | from datetime import date 6 | from xlrd import open_workbook, xldate_as_tuple 7 | from xlwt import Workbook 8 | 9 | input_folder = sys.argv[1] 10 | output_file = sys.argv[2] 11 | 12 | output_workbook = Workbook() 13 | output_worksheet = output_workbook.add_sheet('all_data_all_workbooks') 14 | 15 | data = [] 16 | first_worksheet = True 17 | for input_file in glob.glob(os.path.join(input_folder, '*.xls*')): 18 | print os.path.basename(input_file) 19 | with open_workbook(input_file) as workbook: 20 | for worksheet in workbook.sheets(): 21 | if first_worksheet: 22 | header_row = worksheet.row_values(0) 23 | data.append(header_row) 24 | first_worksheet = False 25 | for row_index in range(1,worksheet.nrows): 26 | row_list = [] 27 | for column_index in range(worksheet.ncols): 28 | cell_value = worksheet.cell_value(row_index,column_index) 29 | cell_type = worksheet.cell_type(row_index, column_index) 30 | if cell_type == 3: 31 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 32 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 33 | row_list.append(date_cell) 34 | else: 35 | row_list.append(cell_value) 36 | data.append(row_list) 37 | 38 | for list_index, output_list in 
enumerate(data): 39 | for element_index, element in enumerate(output_list): 40 | output_worksheet.write(list_index, element_index, element) 41 | 42 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /database/3db_update_rows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sqlite3 4 | import sys 5 | 6 | # Path to and name of a CSV input file 7 | input_file = sys.argv[1] 8 | 9 | # Create an in-memory SQLite3 database 10 | # Create a table called sales with four attributes 11 | con = sqlite3.connect(':memory:') 12 | query = """CREATE TABLE IF NOT EXISTS sales 13 | (customer VARCHAR(20), 14 | product VARCHAR(40), 15 | amount FLOAT, 16 | date DATE);""" 17 | con.execute(query) 18 | con.commit() 19 | 20 | # Insert a few rows of data into the table 21 | data = [('Richard Lucas', 'Notepad', 2.50, '2014-01-02'), 22 | ('Jenny Kim', 'Binder', 4.15, '2014-01-15'), 23 | ('Svetlana Crow', 'Printer', 155.75, '2014-02-03'), 24 | ('Stephen Randolph', 'Computer', 679.40, '2014-02-20')] 25 | for tuple in data: 26 | print(tuple) 27 | statement = "INSERT INTO sales VALUES(?, ?, ?, ?)" 28 | con.executemany(statement, data) 29 | con.commit() 30 | 31 | # Read the CSV file and update the specific rows 32 | file_reader = csv.reader(open(input_file, 'r'), delimiter=',') 33 | header = next(file_reader, None) 34 | for row in file_reader: 35 | data = [] 36 | for column_index in range(len(header)): 37 | data.append(row[column_index]) 38 | print(data) 39 | con.execute("UPDATE sales SET amount=?, date=? WHERE customer=?;", data) 40 | con.commit() 41 | 42 | # Query the sales table 43 | cursor = con.execute("SELECT * FROM sales") 44 | rows = cursor.fetchall() 45 | for row in rows: 46 | output = [] 47 | for column_index in range(len(row)): 48 | output.append(str(row[column_index])) 49 | print(output) -------------------------------------------------------------------------------- /excel/10excel_column_by_name_all_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('selected_columns_all_worksheets') 12 | 13 | my_columns = ['Customer Name', 'Sale Amount'] 14 | 15 | first_worksheet = True 16 | with open_workbook(input_file) as workbook: 17 | data = [my_columns] 18 | index_of_cols_to_keep = [] 19 | for worksheet in workbook.sheets(): 20 | if first_worksheet: 21 | header = worksheet.row_values(0) 22 | for column_index in range(len(header)): 23 | if header[column_index] in my_columns: 24 | index_of_cols_to_keep.append(column_index) 25 | first_worksheet = False 26 | for row_index in range(1, worksheet.nrows): 27 | row_list = [] 28 | for column_index in index_of_cols_to_keep: 29 | cell_value = worksheet.cell_value(row_index, column_index) 30 | cell_type = worksheet.cell_type(row_index, column_index) 31 | if cell_type == 3: 32 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 33 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 34 | row_list.append(date_cell) 35 | else: 36 | row_list.append(cell_value) 37 | data.append(row_list) 38 | 39 | for list_index, output_list in enumerate(data): 40 | for element_index, element in 
enumerate(output_list): 41 | output_worksheet.write(list_index, element_index, element) 42 | 43 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /excel/5excel_value_in_set.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('jan_2013_output') 12 | 13 | important_dates = ['01/24/2013', '01/31/2013'] 14 | 15 | purchase_date_column_index = 4 16 | with open_workbook(input_file) as workbook: 17 | worksheet = workbook.sheet_by_name('january_2013') 18 | data = [] 19 | header = worksheet.row_values(0) 20 | data.append(header) 21 | for row_index in range(1, worksheet.nrows): 22 | purchase_datetime = xldate_as_tuple(worksheet.cell_value(row_index, purchase_date_column_index),workbook.datemode) 23 | purchase_date = date(*purchase_datetime[0:3]).strftime('%m/%d/%Y') 24 | row_list = [] 25 | if purchase_date in important_dates: 26 | for column_index in range(worksheet.ncols): 27 | cell_value = worksheet.cell_value(row_index,column_index) 28 | cell_type = worksheet.cell_type(row_index, column_index) 29 | if cell_type == 3: 30 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 31 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 32 | row_list.append(date_cell) 33 | else: 34 | row_list.append(cell_value) 35 | if row_list: 36 | data.append(row_list) 37 | 38 | for list_index, output_list in enumerate(data): 39 | for element_index, element in enumerate(output_list): 40 | output_worksheet.write(list_index, element_index, element) 41 | 42 | output_workbook.save(output_file) 43 | -------------------------------------------------------------------------------- /excel/9excel_value_meets_condition_all_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('filtered_rows_all_worksheets') 12 | 13 | sales_column_index = 3 14 | threshold = 2000.0 15 | 16 | first_worksheet = True 17 | with open_workbook(input_file) as workbook: 18 | data = [] 19 | for worksheet in workbook.sheets(): 20 | if first_worksheet: 21 | header_row = worksheet.row_values(0) 22 | data.append(header_row) 23 | first_worksheet = False 24 | for row_index in range(1,worksheet.nrows): 25 | row_list = [] 26 | sale_amount = worksheet.cell_value(row_index, sales_column_index) 27 | sale_amount = float(str(sale_amount).replace('$', '').replace(',', '')) 28 | if sale_amount > threshold: 29 | for column_index in range(worksheet.ncols): 30 | cell_value = worksheet.cell_value(row_index,column_index) 31 | cell_type = worksheet.cell_type(row_index, column_index) 32 | if cell_type == 3: 33 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 34 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 35 | row_list.append(date_cell) 36 | else: 37 | row_list.append(cell_value) 38 | if row_list: 39 | data.append(row_list) 40 | 41 | for list_index, output_list in enumerate(data): 42 | for element_index, 
element in enumerate(output_list): 43 | output_worksheet.write(list_index, element_index, element) 44 | 45 | output_workbook.save(output_file) 46 | -------------------------------------------------------------------------------- /excel/11excel_value_meets_condition_set_of_worksheets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from datetime import date 4 | from xlrd import open_workbook, xldate_as_tuple 5 | from xlwt import Workbook 6 | 7 | input_file = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | output_workbook = Workbook() 11 | output_worksheet = output_workbook.add_sheet('set_of_worksheets') 12 | 13 | my_sheets = [0,1] 14 | threshold = 1900.0 15 | sales_column_index = 3 16 | 17 | first_worksheet = True 18 | with open_workbook(input_file) as workbook: 19 | data = [] 20 | for sheet_index in range(workbook.nsheets): 21 | if sheet_index in my_sheets: 22 | worksheet = workbook.sheet_by_index(sheet_index) 23 | if first_worksheet: 24 | header_row = worksheet.row_values(0) 25 | data.append(header_row) 26 | first_worksheet = False 27 | for row_index in range(1,worksheet.nrows): 28 | row_list = [] 29 | sale_amount = worksheet.cell_value(row_index, sales_column_index) 30 | if sale_amount > threshold: 31 | for column_index in range(worksheet.ncols): 32 | cell_value = worksheet.cell_value(row_index,column_index) 33 | cell_type = worksheet.cell_type(row_index, column_index) 34 | if cell_type == 3: 35 | date_cell = xldate_as_tuple(cell_value,workbook.datemode) 36 | date_cell = date(*date_cell[0:3]).strftime('%m/%d/%Y') 37 | row_list.append(date_cell) 38 | else: 39 | row_list.append(cell_value) 40 | if row_list: 41 | data.append(row_list) 42 | 43 | for list_index, output_list in enumerate(data): 44 | for element_index, element in enumerate(output_list): 45 | output_worksheet.write(list_index, element_index, element) 46 | 47 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /applications/mysql_server_error_log.txt: -------------------------------------------------------------------------------- 1 | 246824 10:40:55 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data 2 | 2014-02-03 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3 3 | 2014-02-03 10:40:55 98765 [Note] InnoDB: Using atomics to ref count buffer pool pages 4 | 2014-02-03 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567 5 | 2014-02-03 10:47:18 64208 [Note] InnoDB: Using atomics to ref count buffer pool pages 6 | 2014-02-03 10:47:18 64208 [Note] InnoDB: Compressed tables use zlib 1.2.3 7 | 2014-02-03 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete 8 | 9 | 135791 15:59:29 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data 10 | 2014-03-07 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3 11 | 2014-03-07 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3 12 | 2014-03-07 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567 13 | 2014-03-07 10:47:18 64208 [Note] InnoDB: Using atomics to ref count buffer pool pages 14 | 2014-03-07 10:47:18 64208 [Note] InnoDB: Compressed tables use zlib 1.2.3 15 | 2014-03-07 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete 16 | 17 | 124578 15:59:29 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data 18 | 2014-10-27 10:40:55 98765 [Note] InnoDB: Completed 
initialization of buffer pool 19 | 2014-10-27 10:40:55 98765 [Note] InnoDB: IPv6 is available. 20 | 2014-10-27 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567 21 | 2014-10-27 10:47:18 64208 [Note] InnoDB: Completed initialization of buffer pool 22 | 2014-10-27 10:47:18 64208 [Note] InnoDB: IPv6 is available. 23 | 2014-10-27 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /applications/output_files/1output.csv: -------------------------------------------------------------------------------- 1 | 1234.0,Widget 1,Supplier A,1100,2013-06-02,suppliers.xls,suppliers_2013 2 | 2345.0,Widget 2,Supplier A,2300,2013-06-17,suppliers.xls,suppliers_2013 3 | 4567.0,Widget 4,Supplier B,1300,2013-07-04,suppliers.xls,suppliers_2013 4 | 6789.0,Widget 6,Supplier C,1175,2013-07-23,suppliers.xls,suppliers_2013 5 | 7890.0,Widget 7,Supplier C,1200,2013-07-27,suppliers.xls,suppliers_2013 6 | 1234.0,Widget 1,Supplier A,1100,2014-06-02,suppliers.xls,suppliers_2014 7 | 2345.0,Widget 2,Supplier A,2300,2014-06-17,suppliers.xls,suppliers_2014 8 | 4567.0,Widget 4,Supplier B,1300,2014-07-04,suppliers.xls,suppliers_2014 9 | 6789.0,Widget 6,Supplier C,1175,2014-07-23,suppliers.xls,suppliers_2014 10 | 7890.0,Widget 7,Supplier C,1200,2014-07-27,suppliers.xls,suppliers_2014 11 | 1234.0,Widget 1,Supplier A,1100,2013-06-02,suppliers.xlsx,suppliers_2013 12 | 2345.0,Widget 2,Supplier A,2300,2013-06-17,suppliers.xlsx,suppliers_2013 13 | 4567.0,Widget 4,Supplier B,1300,2013-07-04,suppliers.xlsx,suppliers_2013 14 | 6789.0,Widget 6,Supplier C,1175,2013-07-23,suppliers.xlsx,suppliers_2013 15 | 7890.0,Widget 7,Supplier C,1200,2013-07-27,suppliers.xlsx,suppliers_2013 16 | 1234.0,Widget 1,Supplier A,1100,2014-06-02,suppliers.xlsx,suppliers_2014 17 | 2345.0,Widget 2,Supplier A,2300,2014-06-17,suppliers.xlsx,suppliers_2014 18 | 4567.0,Widget 4,Supplier B,1300,2014-07-04,suppliers.xlsx,suppliers_2014 19 | 6789.0,Widget 6,Supplier C,1175,2014-07-23,suppliers.xlsx,suppliers_2014 20 | 7890.0,Widget 7,Supplier C,1200,2014-07-27,suppliers.xlsx,suppliers_2014 21 | 1234,Widget 1,Supplier A,1100,6/2/2012,suppliers_2012.csv 22 | 2345,Widget 2,Supplier A,2300,6/17/2012,suppliers_2012.csv 23 | 4567,Widget 4,Supplier B,1300,7/4/2012,suppliers_2012.csv 24 | 6789,Widget 6,Supplier C,1175,7/23/2012,suppliers_2012.csv 25 | 7890,Widget 7,Supplier C,1200,7/27/2012,suppliers_2012.csv 26 | -------------------------------------------------------------------------------- /excel/14excel_sum_average_multiple_workbooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import os 4 | import sys 5 | from datetime import date 6 | from xlrd import open_workbook, xldate_as_tuple 7 | from xlwt import Workbook 8 | 9 | input_folder = sys.argv[1] 10 | output_file = sys.argv[2] 11 | 12 | output_workbook = Workbook() 13 | output_worksheet = output_workbook.add_sheet('sums_and_averages') 14 | 15 | all_data = [] 16 | sales_column_index = 3 17 | 18 | header = ['workbook', 'worksheet', 'worksheet_total', 'worksheet_average',\ 19 | 'workbook_total', 'workbook_average'] 20 | all_data.append(header) 21 | 22 | for input_file in glob.glob(os.path.join(input_folder, '*.xls*')): 23 | with open_workbook(input_file) as workbook: 24 | list_of_totals = [] 25 | list_of_numbers = [] 26 | workbook_output = [] 27 | for worksheet in workbook.sheets(): 28 | 
total_sales = 0 29 | number_of_sales = 0 30 | worksheet_list = [] 31 | worksheet_list.append(os.path.basename(input_file)) 32 | worksheet_list.append(worksheet.name) 33 | for row_index in range(1,worksheet.nrows): 34 | try: 35 | total_sales += float(str(worksheet.cell_value(row_index,sales_column_index)).strip('$').replace(',','')) 36 | number_of_sales += 1. 37 | except: 38 | total_sales += 0. 39 | number_of_sales += 0. 40 | average_sales = '%.2f' % (total_sales / number_of_sales) 41 | worksheet_list.append(total_sales) 42 | worksheet_list.append(float(average_sales)) 43 | list_of_totals.append(total_sales) 44 | list_of_numbers.append(float(number_of_sales)) 45 | workbook_output.append(worksheet_list) 46 | workbook_total = sum(list_of_totals) 47 | workbook_average = sum(list_of_totals)/sum(list_of_numbers) 48 | for list_element in workbook_output: 49 | list_element.append(workbook_total) 50 | list_element.append(workbook_average) 51 | all_data.extend(workbook_output) 52 | 53 | for list_index, output_list in enumerate(all_data): 54 | for element_index, element in enumerate(output_list): 55 | output_worksheet.write(list_index, element_index, element) 56 | 57 | output_workbook.save(output_file) -------------------------------------------------------------------------------- /excel/pandas_sum_average_multiple_workbooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import glob 4 | import os 5 | import sys 6 | 7 | input_path = sys.argv[1] 8 | output_file = sys.argv[2] 9 | 10 | all_workbooks = glob.glob(os.path.join(input_path,'*.xls*')) 11 | data_frames = [] 12 | for workbook in all_workbooks: 13 | all_worksheets = pd.read_excel(workbook, sheetname=None, index_col=None) 14 | workbook_total_sales = [] 15 | workbook_number_of_sales = [] 16 | worksheet_data_frames = [] 17 | worksheets_data_frame = None 18 | workbook_data_frame = None 19 | for worksheet_name, data in all_worksheets.items(): 20 | total_sales = pd.DataFrame([float(str(value).strip('$').replace(',','')) for value in data.ix[:, 'Sale Amount']]).sum() 21 | number_of_sales = len(data.loc[:, 'Sale Amount']) 22 | average_sales = pd.DataFrame(total_sales / number_of_sales) 23 | 24 | workbook_total_sales.append(total_sales) 25 | workbook_number_of_sales.append(number_of_sales) 26 | 27 | data = {'workbook': os.path.basename(workbook), 28 | 'worksheet': worksheet_name, 29 | 'worksheet_total': total_sales, 30 | 'worksheet_average': average_sales} 31 | 32 | worksheet_data_frames.append(pd.DataFrame(data, columns=['workbook', 'worksheet', 'worksheet_total', 'worksheet_average'])) 33 | worksheets_data_frame = pd.concat(worksheet_data_frames, axis=0, ignore_index=True) 34 | 35 | workbook_total = pd.DataFrame(workbook_total_sales).sum() 36 | workbook_total_number_of_sales = pd.DataFrame(workbook_number_of_sales).sum() 37 | workbook_average = pd.DataFrame(workbook_total / workbook_total_number_of_sales) 38 | 39 | workbook_stats = {'workbook': os.path.basename(workbook), 40 | 'workbook_total': workbook_total, 41 | 'workbook_average': workbook_average} 42 | 43 | workbook_stats = pd.DataFrame(workbook_stats, columns=['workbook', 'workbook_total', 'workbook_average']) 44 | workbook_data_frame = pd.merge(worksheets_data_frame, workbook_stats, on='workbook', how='left') 45 | data_frames.append(workbook_data_frame) 46 | 47 | all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True) 48 | 49 | writer = pd.ExcelWriter(output_file) 50 | 
all_data_concatenated.to_excel(writer, sheet_name='sums_and_averages', index=False) 51 | writer.save() -------------------------------------------------------------------------------- /applications/2calculate_statistic_by_category.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import sys 4 | from datetime import date, datetime 5 | 6 | def date_diff(date1, date2): 7 | try: 8 | diff = str(datetime.strptime(date1, '%m/%d/%Y') - \ 9 | datetime.strptime(date2, '%m/%d/%Y')).split()[0] 10 | except: 11 | diff = 0 12 | if diff == '0:00:00': 13 | diff = 0 14 | return diff 15 | 16 | input_file = sys.argv[1] 17 | output_file = sys.argv[2] 18 | 19 | packages = {} 20 | previous_name = 'N/A' 21 | previous_package = 'N/A' 22 | previous_package_date = 'N/A' 23 | first_row = True 24 | today = date.today().strftime('%m/%d/%Y') 25 | 26 | with open(input_file, 'r', newline='') as input_csv_file: 27 | filereader = csv.reader(input_csv_file) 28 | header = next(filereader) 29 | for row in filereader: 30 | current_name = row[0] 31 | current_package = row[1] 32 | current_package_date = row[3] 33 | if current_name not in packages: 34 | packages[current_name] = {} 35 | if current_package not in packages[current_name]: 36 | packages[current_name][current_package] = 0 37 | if current_name != previous_name: 38 | if first_row: 39 | first_row = False 40 | else: 41 | diff = date_diff(today, previous_package_date) 42 | if previous_package not in packages[previous_name]: 43 | packages[previous_name][previous_package] = int(diff) 44 | else: 45 | packages[previous_name][previous_package] += int(diff) 46 | else: 47 | diff = date_diff(current_package_date, previous_package_date) 48 | packages[previous_name][previous_package] += int(diff) 49 | previous_name = current_name 50 | previous_package = current_package 51 | previous_package_date = current_package_date 52 | 53 | header = ['Customer Name', 'Category', 'Total Time (in Days)'] 54 | with open(output_file, 'w', newline='') as output_csv_file: 55 | filewriter = csv.writer(output_csv_file) 56 | filewriter.writerow(header) 57 | for customer_name, customer_name_value in packages.items(): 58 | for package_category, package_category_value in packages[customer_name].items(): 59 | row_of_output = [] 60 | print(customer_name, package_category, package_category_value) 61 | row_of_output.append(customer_name) 62 | row_of_output.append(package_category) 63 | row_of_output.append(package_category_value) 64 | filewriter.writerow(row_of_output) -------------------------------------------------------------------------------- /applications/1search_for_items_write_found.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import glob 4 | import os 5 | import sys 6 | from datetime import date 7 | from xlrd import open_workbook, xldate_as_tuple 8 | 9 | item_numbers_file = sys.argv[1] 10 | path_to_folder = sys.argv[2] 11 | output_file = sys.argv[3] 12 | 13 | item_numbers_to_find = [] 14 | with open(item_numbers_file, 'r', newline='') as item_numbers_csv_file: 15 | filereader = csv.reader(item_numbers_csv_file) 16 | for row in filereader: 17 | item_numbers_to_find.append(row[0]) 18 | print(item_numbers_to_find) 19 | 20 | filewriter = csv.writer(open(output_file, 'a', newline='')) 21 | 22 | file_counter = 0 23 | line_counter = 0 24 | count_of_item_numbers = 0 25 | for input_file in glob.glob(os.path.join(path_to_folder, '*.*')): 26 | 
file_counter += 1 27 | if input_file.split('.')[1] == 'csv': 28 | with open(input_file, 'r', newline='') as csv_in_file: 29 | filereader = csv.reader(csv_in_file) 30 | header = next(filereader) 31 | for row in filereader: 32 | row_of_output = [] 33 | for column in range(len(header)): 34 | if column < 3: 35 | cell_value = str(row[column]).strip() 36 | row_of_output.append(cell_value) 37 | elif column == 3: 38 | cell_value = str(row[column]).lstrip('$').replace(',','').split('.')[0].strip() 39 | row_of_output.append(cell_value) 40 | else: 41 | cell_value = str(row[column]).strip() 42 | row_of_output.append(cell_value) 43 | row_of_output.append(os.path.basename(input_file)) 44 | if row[0] in item_numbers_to_find: 45 | filewriter.writerow(row_of_output) 46 | count_of_item_numbers += 1 47 | line_counter += 1 48 | elif input_file.split('.')[1] == 'xls' or input_file.split('.')[1] == 'xlsx': 49 | workbook = open_workbook(input_file) 50 | for worksheet in workbook.sheets(): 51 | try: 52 | header = worksheet.row_values(0) 53 | except IndexError: 54 | pass 55 | for row in range(1, worksheet.nrows): 56 | row_of_output = [] 57 | for column in range(len(header)): 58 | if column < 3: 59 | cell_value = str(worksheet.cell_value(row,column)).strip() 60 | row_of_output.append(cell_value) 61 | elif column == 3: 62 | cell_value = str(worksheet.cell_value(row,column)).split('.')[0].strip() 63 | row_of_output.append(cell_value) 64 | else: 65 | cell_value = xldate_as_tuple(worksheet.cell(row,column).value,workbook.datemode) 66 | cell_value = str(date(*cell_value[0:3])).strip() 67 | row_of_output.append(cell_value) 68 | row_of_output.append(os.path.basename(input_file)) 69 | row_of_output.append(worksheet.name) 70 | if str(worksheet.cell(row,0).value).split('.')[0].strip() in item_numbers_to_find: 71 | filewriter.writerow(row_of_output) 72 | count_of_item_numbers += 1 73 | line_counter += 1 74 | print('Number of files: {}'.format(file_counter)) 75 | print('Number of lines: {}'.format(line_counter)) 76 | print('Number of item numbers: {}'.format(count_of_item_numbers)) -------------------------------------------------------------------------------- /plots/seaborn_plots.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import seaborn as sns 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | sns.set(color_codes=True) 8 | 9 | 10 | # Simple plot of linear, quadratic, and cubic curves 11 | x = np.linspace(0, 2, 100) 12 | plt.plot(x, x, label='linear') 13 | plt.plot(x, x**2, label='quadratic') 14 | plt.plot(x, x**3, label='cubic') 15 | plt.xlabel('x label') 16 | plt.ylabel('y label') 17 | plt.title("Simple Plot") 18 | plt.legend(loc="best") 19 | plt.show() 20 | 21 | 22 | # Histogram 23 | x = np.random.normal(size=1000) 24 | sns.distplot(x, bins=20, kde=True, rug=False, label="Histogram w/o Density") 25 | sns.axlabel("Value", "Frequency") 26 | plt.title("Histogram of a Random Sample from a Normal Distribution") 27 | plt.legend() 28 | plt.show() 29 | 30 | 31 | # Scatter plot 32 | mean, cov = [5, 10], [(1, .5), (.5, 1)] 33 | data = np.random.multivariate_normal(mean, cov, 200) 34 | data_frame = pd.DataFrame(data, columns=["x", "y"]) 35 | sns.jointplot(x="x", y="y", data=data_frame, kind="reg").set_axis_labels("x", "y") 36 | plt.suptitle("Joint Plot of Two Variables with Bivariate and Univariate Graphs") 37 | plt.show() 38 | 39 | 40 | # Pairwise bivariate 41 | #iris = sns.load_dataset("iris") 42 | #sns.pairplot(iris) 43 
| #plt.show() 44 | 45 | 46 | # Linear regression model 47 | tips = sns.load_dataset("tips") 48 | #sns.lmplot(x="total_bill", y="tip", data=tips) 49 | sns.lmplot(x="size", y="tip", data=tips, x_jitter=.15, ci=None) 50 | #sns.lmplot(x="size", y="tip", data=tips, x_estimator=np.mean, ci=None) 51 | plt.show() 52 | 53 | 54 | # Box plots 55 | sns.boxplot(x="day", y="total_bill", hue="time", data=tips) 56 | #sns.factorplot(x="time", y="total_bill", hue="smoker", 57 | # col="day", data=tips, kind="box", size=4, aspect=.5) 58 | plt.show() 59 | 60 | 61 | # Bar plots 62 | titanic = sns.load_dataset("titanic") 63 | #sns.barplot(x="sex", y="survived", hue="class", data=titanic) 64 | #sns.countplot(y="deck", hue="class", data=titanic, palette="Greens_d") 65 | #plt.show() 66 | 67 | 68 | # Non-linear regression model 69 | anscombe = sns.load_dataset("anscombe") 70 | # polynomial 71 | #sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'II'"), 72 | # order=2, ci=False, scatter_kws={"s": 80}) 73 | #plt.show() 74 | 75 | 76 | # robust to outliers 77 | #sns.lmplot(x="x", y="y", data=anscombe.query("dataset == 'III'"), 78 | # robust=True, ci=False, scatter_kws={"s": 80}) 79 | #plt.show() 80 | 81 | 82 | # logistic 83 | #tips["big_tip"] = (tips.tip / tips.total_bill) > .15 84 | #sns.lmplot(x="total_bill", y="big_tip", data=tips, logistic=True, y_jitter=.03).set_axis_labels("Total Bill", "Big Tip") 85 | #plt.title("Logistic Regression of Big Tip vs. Total Bill") 86 | #plt.show() 87 | 88 | 89 | # lowess smoother 90 | #sns.lmplot(x="total_bill", y="tip", data=tips, lowess=True) 91 | #plt.show() 92 | 93 | 94 | # Condition on other variables 95 | #sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips, 96 | # markers=["o", "x"], palette="Set1") 97 | #sns.lmplot(x="total_bill", y="tip", hue="smoker", 98 | # col="time", row="sex", data=tips) 99 | #plt.show() 100 | 101 | 102 | # Control shape and size of plot 103 | #sns.lmplot(x="total_bill", y="tip", col="day", data=tips, col_wrap=2, size=3) 104 | #sns.lmplot(x="total_bill", y="tip", col="day", data=tips, aspect=.5) 105 | #plt.show() 106 | 107 | 108 | # Plotting regression in other contexts 109 | #sns.jointplot(x="total_bill", y="tip", data=tips, kind="reg") 110 | #sns.pairplot(tips, x_vars=["total_bill", "size"], y_vars=["tip"], 111 | # size=5, aspect=.8, kind="reg") 112 | #sns.pairplot(tips, x_vars=["total_bill", "size"], y_vars=["tip"], 113 | # hue="smoker", size=5, aspect=.8, kind="reg") 114 | #plt.show() 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | foundations-for-analytics-with-python 2 | ======================== 3 | 4 | This repository contains all of the Python scripts, input files, and output files associated with the book, Foundations for Analytics with Python.
5 | 6 | About
7 | My Blog Post: Foundations for Analytics with Python
8 | 9 | Shop
10 | O'Reilly Media
11 | Foundations for Analytics with Python
12 | 13 | Amazon
14 | Foundations for Analytics with Python
15 | 16 | Advance Praise
17 | "This book is a useful learning resource for new Python programmers working with data. The tutorial style and accompanying exercises will help users get their feet wet with the Python language, programming environment, and a number of the most important packages in the ecosystem." - Wes McKinney, Creator of pandas library
18 | 19 | "This is a must read book for anyone who feels limited by spreadsheets and wants to master the basics of coding and automation for business applications. This is also a good primer on programmatic approaches to conducting the most common statistical methods, including correlations, t-tests, and regressions." - Rajiv Krishnamurthy, Manager, Infra Data Science, Facebook
20 | 21 | "Foundations for Analytics with Python is an extremely well-written introduction to Python for analysts, giving clear and practical guidance for the new programmer. It connects principles and best practices effectively, as if Mr. Brownley were sitting next to you, guiding you each step of the way." - Dean Abbott, Co-Founder and Chief Data Scientist at SmarterHQ
22 | 23 | "Data analysis is an essential skill for the modern professional and Clinton's book is the perfect primer to move beyond the pre-defined tools into truly flexible analytics with real code. Even if you haven't written a single line of code before." - Chandika Jayasundara, CEO & Co-Founder, Creately
24 | 25 | "Python is widely used for data analysis -- it is in fact one of the most popular tools/languages for data analysis and data science. Via this book, Clinton is adding to the field in a much needed manner: by teaching the reader to learn how to program as well as automate and scale their data analyses. Everyone today would be well served to learn to code and to apply programming to data analysis. This book serves exactly that purpose: it targets non-coders and teaches them fundamentals of Analytics using Python -- the tool of choice for data scientists today!" - Sameer Chopra, Chief Analytics Officer, GoDaddy
26 | 27 | to download 28 | ======================== 29 | Mac computer:
30 | 1. Open a Terminal window
31 | 2. Navigate to the folder where you want to download the foundations-for-analytics-with-python folder
32 |     For example, to download the foundations-for-analytics-with-python folder onto your Desktop:
33 |         First, type the following and then hit Enter: `cd`
34 |         Second, type the following and then hit Enter: `cd Desktop/`
35 | 3. Finally, to download the foundations-for-analytics-with-python folder, type the following and then hit Enter:
36 |     `git clone https://github.com/cbrownley/foundations-for-analytics-with-python.git`
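Once the repository is downloaded, the scripts (most of which target Python 3) can be run from a Terminal window. Most of them read the paths to their input and output files from command-line arguments (`sys.argv`), so a typical invocation looks like the following, where `some_script.py`, `input_file.csv`, and `output_file.csv` are placeholders rather than specific files from the book:
    `python3 some_script.py input_file.csv output_file.csv`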
37 | 38 | Windows computer:
39 | 1. Go to: https://github.com/cbrownley/foundations-for-analytics-with-python
40 | 2. Click 'Clone or download' and then 'Download ZIP' on the right side of the page
41 | 3. Click on the zipped folder to open it in File Explorer
42 | 4. Click 'Extract all'
43 | 5. Edit the path to save the foundations-for-analytics-with-python folder on your Desktop
44 | 6. Click 'Extract'
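Whichever way you download the repository, note that the scripts also import a number of third-party packages, including pandas, numpy, matplotlib, seaborn, statsmodels, xlrd, and xlwt, as well as the MySQLdb driver for the MySQL examples. Assuming pip for Python 3 is available, one way to install most of these is:
    `pip3 install pandas numpy matplotlib seaborn statsmodels xlrd xlwt`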
45 | -------------------------------------------------------------------------------- /statistics/wine_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import statsmodels.api as sm 7 | import statsmodels.formula.api as smf 8 | from statsmodels.formula.api import ols, glm 9 | 10 | 11 | # Read the data set into a pandas DataFrame 12 | wine = pd.read_csv('winequality-both.csv', sep=',', header=0) 13 | wine.columns = wine.columns.str.replace(' ', '_') 14 | print(wine.head()) 15 | 16 | # Display descriptive statistics for all variables 17 | print(wine.describe()) 18 | 19 | # Identify unique values 20 | print(sorted(wine.quality.unique())) 21 | 22 | # Calculate value frequencies 23 | print(wine.quality.value_counts()) 24 | 25 | # Display descriptive statistics for quality by wine type 26 | print(wine.groupby('type')[['alcohol']].describe().unstack('type')) 27 | 28 | # Calculate specific quantiles 29 | print(wine.groupby('type')[['quality']].quantile([0.25, 0.75]).unstack('type')) 30 | 31 | # Calculate correlation matrix for all variables 32 | print(wine.corr()) 33 | 34 | # Look at relationship between pairs of variables 35 | # Take a "small" sample of red and white wines for plotting 36 | def take_sample(data_frame, replace=False, n=200): 37 | return data_frame.loc[np.random.choice(data_frame.index, replace=replace, size=n)] 38 | reds = wine.loc[wine['type']=='red', :] 39 | whites = wine.loc[wine['type']=='white', :] 40 | reds_sample = take_sample(wine.loc[wine['type']=='red', :]) 41 | whites_sample = take_sample(wine.loc[wine['type']=='white', :]) 42 | wine_sample = pd.concat([reds_sample, whites_sample]) 43 | wine['in_sample'] = np.where(wine.index.isin(wine_sample.index), 1.,0.) 
44 | 45 | reds_sample = reds.ix[np.random.choice(reds.index, 100)] 46 | whites_sample = whites.ix[np.random.choice(whites.index, 100)] 47 | wine_sample = pd.concat([reds_sample, whites_sample], ignore_index=True) 48 | 49 | print(wine['in_sample']) 50 | print(pd.crosstab(wine.in_sample, wine.type, margins=True)) 51 | 52 | sns.set_style("dark") 53 | sns.set_style("darkgrid", {"legend.scatterpoints": 0}) 54 | pg = sns.PairGrid(wine_sample, hue="type", hue_order=["red", "white"], \ 55 | palette=dict(red="red", white="white"), hue_kws={"marker": ["o", "s"]}, vars=['quality', 'alcohol', 'residual_sugar']) 56 | pg.x = wine_sample.ix[wine_sample['type']=='red', 'quality'] 57 | pg = pg.map_diag(plt.hist) 58 | pg.x = wine_sample.ix[wine_sample['type']=='white', 'quality'] 59 | pg = pg.map_diag(plt.hist) 60 | pg = pg.map_offdiag(plt.scatter, edgecolor="black", s=10, alpha=0.25) 61 | #plt.show() 62 | 63 | g = sns.pairplot(wine_sample, kind='reg', plot_kws={"ci": False, "x_jitter": 0.25, "y_jitter": 0.25}, \ 64 | hue='type', diag_kind='hist', diag_kws={"bins": 10, "alpha": 1.0}, palette=dict(red="red", white="white"), \ 65 | markers=["o", "s"], vars=['quality', 'alcohol', 'residual_sugar']) 66 | sns.set_style({'legend.frameon': True,'legend.numpoints': 0,'legend.scatterpoints': 0}) 67 | wine_all_plot = sns.pairplot(wine, kind='reg', hue='type', palette=dict(red="red", white="white"), markers=["o", "s"], vars=['quality', 'alcohol', 'residual_sugar']) 68 | wine_sample_plot = sns.pairplot(wine_sample, kind='reg', hue='type', palette=dict(red="red", white="white"), markers=["o", "s"], vars=['quality', 'alcohol', 'residual_sugar']) 69 | 70 | wine['ln_fixed_acidity'] = np.log(wine.ix[:, 'fixed_acidity']) 71 | sns.distplot(wine.ix[:, 'fixed_acidity']) 72 | sns.distplot(wine.ix[:, 'ln_fixed_acidity']) 73 | print(g) 74 | plt.suptitle('Histograms and Scatter Plots of Quality, Alcohol, and Residual Sugar', fontsize=14, \ 75 | horizontalalignment='center', verticalalignment='top', 76 | x=0.5, y=0.999) 77 | #plt.show() 78 | 79 | # Look at the distribution of quality by wine type 80 | red_wine = wine.ix[wine['type']=='red', 'quality'] 81 | white_wine = wine.ix[wine['type']=='white', 'quality'] 82 | 83 | sns.set_style("dark") 84 | print(sns.distplot(red_wine, \ 85 | norm_hist=True, kde=False, color="red", label="Red wine")) 86 | print(sns.distplot(white_wine, \ 87 | norm_hist=True, kde=False, color="white", label="White wine")) 88 | sns.axlabel("Quality Score", "Density") 89 | plt.title("Distribution of Quality by Wine Type") 90 | plt.legend() 91 | #plt.show() 92 | 93 | # Test whether mean quality is different between red and white wines 94 | print(wine.groupby(['type'])[['quality']].agg(['std', 'mean'])) 95 | tstat, pvalue, df = sm.stats.ttest_ind(red_wine, white_wine) 96 | print('tstat: %.3f pvalue: %.4f' % (tstat, pvalue)) 97 | 98 | # Fit a multivariate linear regression model 99 | #wine_standardized = (wine - wine.mean()) / wine.std() 100 | #formula_all = 'quality ~ alcohol + chlorides + citric_acid + density + fixed_acidity + free_sulfur_dioxide + pH + residual_sugar + sulphates + total_sulfur_dioxide + volatile_acidity' 101 | my_formula = 'quality ~ alcohol + chlorides + citric_acid + density + fixed_acidity + free_sulfur_dioxide + pH + residual_sugar + sulphates + total_sulfur_dioxide + volatile_acidity' 102 | #formula_all = 'quality ~ fixed_acidity + volatile_acidity + citric_acid + residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + density + pH + sulphates + alcohol' 103 | #formula = 
'quality ~ residual_sugar + alcohol' 104 | lm = ols(my_formula, data=wine).fit() 105 | #lm = glm(my_formula, data=wine, family=sm.families.Gaussian()).fit() 106 | #lm = smf.glm(formula_all, data=wine_standardized, family=sm.families.Gaussian()).fit() 107 | print(lm.summary()) 108 | print("\nQuantities you can extract from the result:\n%s" % dir(lm)) 109 | print("\nCoefficients:\n%s" % lm.params) 110 | print("\nCoefficient Std Errors:\n%s" % lm.bse) 111 | print("\nAdj. R-squared:\n%.2f" % lm.rsquared_adj) 112 | print("\nF-statistic: %.1f P-value: %.2f" % (lm.fvalue, lm.f_pvalue)) 113 | print("\nNumber of obs: %d Number of fitted values: %s" % (lm.nobs, len(lm.fittedvalues))) 114 | 115 | # Fit a multivariate linear model with standardized independent variables 116 | dependent_variable = wine['quality'] 117 | independent_variables = wine[wine.columns.difference(['quality', 'type', 'in_sample'])] 118 | independent_variables_standardized = (independent_variables - independent_variables.mean()) / independent_variables.std() 119 | wine_standardized = pd.concat([dependent_variable, independent_variables_standardized], axis=1) 120 | lm_standardized = ols(my_formula, data=wine_standardized).fit() 121 | print(lm_standardized.summary()) 122 | 123 | # Predict quality scores for "new" observations 124 | new_observations = wine.ix[wine.index.isin(range(10)), independent_variables.columns] 125 | y_predicted = lm.predict(new_observations) 126 | y_predicted_rounded = [round(score, 2) for score in y_predicted] 127 | print(y_predicted_rounded) 128 | -------------------------------------------------------------------------------- /statistics/customer_churn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import matplotlib.pyplot as plt 6 | import statsmodels.api as sm 7 | import statsmodels.formula.api as smf 8 | 9 | # Read the data set into a pandas DataFrame 10 | churn = pd.read_csv('churn.csv', sep=',', header=0) 11 | 12 | churn.columns = [heading.lower() for heading in \ 13 | churn.columns.str.replace(' ', '_').str.replace("\'", "").str.strip('?')] 14 | 15 | churn['churn01'] = np.where(churn['churn'] == 'True.', 1., 0.)
16 | print(churn.head()) 17 | print(churn.describe()) 18 | 19 | 20 | # Calculate descriptive statistics for grouped data 21 | print(churn.groupby(['churn'])[['day_charge', 'eve_charge', 'night_charge', 'intl_charge', 'account_length', 'custserv_calls']].agg(['count', 'mean', 'std'])) 22 | 23 | # Specify different statistics for different variables 24 | print(churn.groupby(['churn']).agg({'day_charge' : ['mean', 'std'], 25 | 'eve_charge' : ['mean', 'std'], 26 | 'night_charge' : ['mean', 'std'], 27 | 'intl_charge' : ['mean', 'std'], 28 | 'account_length' : ['count', 'min', 'max'], 29 | 'custserv_calls' : ['count', 'min', 'max']})) 30 | 31 | # Create total_charges, split it into 5 groups, and 32 | # calculate statistics for each of the groups 33 | churn['total_charges'] = churn['day_charge'] + churn['eve_charge'] + \ 34 | churn['night_charge'] + churn['intl_charge'] 35 | factor_cut = pd.cut(churn.total_charges, 5, precision=2) 36 | def get_stats(group): 37 | return {'min' : group.min(), 'max' : group.max(), 38 | 'count' : group.count(), 'mean' : group.mean(), 39 | 'std' : group.std()} 40 | grouped = churn.custserv_calls.groupby(factor_cut) 41 | print(grouped.apply(get_stats).unstack()) 42 | 43 | # Split account_length into quantiles and 44 | # calculate statistics for each of the quantiles 45 | factor_qcut = pd.qcut(churn.account_length, [0., 0.25, 0.5, 0.75, 1.]) 46 | grouped = churn.custserv_calls.groupby(factor_qcut) 47 | print(grouped.apply(get_stats).unstack()) 48 | 49 | # Create binary/dummy indicator variables for intl_plan and vmail_plan 50 | # and join them with the churn column in a new DataFrame 51 | intl_dummies = pd.get_dummies(churn['intl_plan'], prefix='intl_plan') 52 | vmail_dummies = pd.get_dummies(churn['vmail_plan'], prefix='vmail_plan') 53 | churn_with_dummies = churn[['churn']].join([intl_dummies, vmail_dummies]) 54 | print(churn_with_dummies.head()) 55 | 56 | # Split total_charges into quartiles, create binary indicator variables 57 | # for each of the quartiles, and add them to the churn DataFrame 58 | qcut_names = ['1st_quartile', '2nd_quartile', '3rd_quartile', '4th_quartile'] 59 | total_charges_quartiles = pd.qcut(churn.total_charges, 4, labels=qcut_names) 60 | dummies = pd.get_dummies(total_charges_quartiles, prefix='total_charges') 61 | churn_with_dummies = churn.join(dummies) 62 | print(churn_with_dummies.head()) 63 | 64 | # Create pivot tables 65 | print(churn.pivot_table(['total_charges'], index=['churn', 'custserv_calls'])) 66 | print(churn.pivot_table(['total_charges'], index=['churn'], columns=['custserv_calls'])) 67 | print(churn.pivot_table(['total_charges'], index=['custserv_calls'], columns=['churn'], \ 68 | aggfunc='mean', fill_value='NaN', margins=True)) 69 | 70 | # Fit a logistic regression model 71 | dependent_variable = churn['churn01'] 72 | independent_variables = churn[['account_length', 'custserv_calls', 'total_charges']] 73 | independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True) 74 | logit_model = sm.Logit(dependent_variable, independent_variables_with_constant).fit() 75 | #logit_model = smf.glm(output_variable, input_variables, family=sm.families.Binomial()).fit() 76 | print(logit_model.summary()) 77 | print("\nQuantities you can extract from the result:\n%s" % dir(logit_model)) 78 | print("\nCoefficients:\n%s" % logit_model.params) 79 | print("\nCoefficient Std Errors:\n%s" % logit_model.bse) 80 | #logit_marginal_effects = logit_model.get_margeff(method='dydx', at='overall') 81 | 
#print(logit_marginal_effects.summary()) 82 | 83 | print("\ninvlogit(-7.2205 + 0.0012*mean(account_length) + 0.4443*mean(custserv_calls) + 0.0729*mean(total_charges))") 84 | 85 | def inverse_logit(model_formula): 86 | from math import exp 87 | return (1.0 / (1.0 + exp(-model_formula)))*100.0 88 | 89 | at_means = float(logit_model.params[0]) + \ 90 | float(logit_model.params[1])*float(churn['account_length'].mean()) + \ 91 | float(logit_model.params[2])*float(churn['custserv_calls'].mean()) + \ 92 | float(logit_model.params[3])*float(churn['total_charges'].mean()) 93 | 94 | print(churn['account_length'].mean()) 95 | print(churn['custserv_calls'].mean()) 96 | print(churn['total_charges'].mean()) 97 | print(at_means) 98 | print("Probability of churn when independent variables are at their mean values: %.2f" % inverse_logit(at_means)) 99 | 100 | cust_serv_mean = float(logit_model.params[0]) + \ 101 | float(logit_model.params[1])*float(churn['account_length'].mean()) + \ 102 | float(logit_model.params[2])*float(churn['custserv_calls'].mean()) + \ 103 | float(logit_model.params[3])*float(churn['total_charges'].mean()) 104 | 105 | cust_serv_mean_minus_one = float(logit_model.params[0]) + \ 106 | float(logit_model.params[1])*float(churn['account_length'].mean()) + \ 107 | float(logit_model.params[2])*float(churn['custserv_calls'].mean()-1.0) + \ 108 | float(logit_model.params[3])*float(churn['total_charges'].mean()) 109 | 110 | print(cust_serv_mean) 111 | print(churn['custserv_calls'].mean()-1.0) 112 | print(cust_serv_mean_minus_one) 113 | print("Change in probability of churn when custserv_calls decreases by 1: %.2f" % (inverse_logit(cust_serv_mean) - inverse_logit(cust_serv_mean_minus_one))) 114 | 115 | # Predict churn for "new" observations 116 | new_observations = churn.ix[churn.index.isin(range(10)), independent_variables.columns] 117 | new_observations_with_constant = sm.add_constant(new_observations, prepend=True) 118 | y_predicted = logit_model.predict(new_observations_with_constant) 119 | y_predicted_rounded = [round(score, 2) for score in y_predicted] 120 | print(y_predicted_rounded) 121 | 122 | # Fit a logistic regression model with standardized input variables 123 | output_variable = churn['churn01'] 124 | vars_to_keep = churn[['account_length', 'custserv_calls', 'total_charges']] 125 | inputs_standardized = (vars_to_keep - vars_to_keep.mean()) / vars_to_keep.std() 126 | input_variables = sm.add_constant(inputs_standardized, prepend=False) 127 | logit_model = sm.Logit(output_variable, input_variables).fit() 128 | #logit_model = smf.glm(output_variable, input_variables, family=sm.families.Binomial()).fit() 129 | print(logit_model.summary()) 130 | print(logit_model.params) 131 | print(logit_model.bse) 132 | #logit_marginal_effects = logit_model.get_margeff(method='dydx', at='overall') 133 | #print(logit_marginal_effects.summary()) 134 | 135 | # Predict output value for a new observation based on its mean standardized input values 136 | input_variables = [0., 0., 0., 1.]
137 | predicted_value = logit_model.predict(input_variables) 138 | print("Predicted value: %.5f" % predicted_value) -------------------------------------------------------------------------------- /first_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from math import exp, log, sqrt 3 | import re 4 | from datetime import date, time, datetime, timedelta 5 | from operator import itemgetter 6 | import sys 7 | import glob 8 | import os 9 | 10 | # Print a simple string 11 | print("Output #1: I'm excited to learn Python.") 12 | 13 | # Add two numbers together 14 | x = 4 15 | y = 5 16 | z = x + y 17 | print("Output #2: Four plus five equals {0:d}.".format(z)) 18 | 19 | # Add two lists together 20 | a = [1, 2, 3, 4] 21 | b = ["first", "second", "third", "fourth"] 22 | c = a + b 23 | print("Output #3: {0}, {1}, {2}".format(a, b, c)) 24 | 25 | # INTEGERS 26 | x = 9 27 | print("Output #4: {0}".format(x)) 28 | print("Output #5: {0}".format(3**4)) 29 | print("Output #6: {0}".format(int(8.3)/int(2.7))) 30 | 31 | # FLOATING-POINT NUMBERS 32 | print("Output #7: {0:.3f}".format(8.3/2.7)) 33 | y = 2.5*4.8 34 | print("Output #8: {0:.1f}".format(y)) 35 | r = 8/float(3) 36 | print("Output #9: {0:.2f}".format(r)) 37 | print("Output #10: {0:.4f}".format(8.0/3)) 38 | 39 | # Some mathematical functions available in the math module 40 | print("Output #11: {0:.4f}".format(exp(3))) 41 | print("Output #12: {0:.2f}".format(log(4))) 42 | print("Output #13: {0:.1f}".format(sqrt(81))) 43 | 44 | # STRINGS 45 | # A string with single quotes, so include a backslash before the single quote 46 | print("Output #14: {0:s}".format('I\'m enjoying learning Python')) 47 | 48 | # A one-line string, but if the string is long and running off the page on the right 49 | # you can use a "\" to separate the long string into smaller strings on separate lines 50 | print("Output #15: {0:s}".format("This is a long string. Without the backslash \ 51 | it would run off of the page on the right in the text editor and be very \ 52 | difficult to read and edit. By using the backslash you can split the long \ 53 | string into smaller strings on separate lines so that the whole string is easy \ 54 | to view in the text editor.")) 55 | 56 | # Use triple single or double quotes if you want the string to span multiple lines 57 | # and you don't want to use the "\" 58 | print("Output #16: {0:s}".format('''You can use triple single quotes 59 | for multi-line comment strings''')) 60 | 61 | print("Output #17: {0:s}".format("""You can also use triple double quotations 62 | for multi-line comment strings""")) 63 | 64 | # Add two strings together 65 | string1 = "This is a " 66 | string2 = "short string."
67 | sentence = string1 + string2 68 | print("Output #18: {0:s}".format(sentence)) 69 | 70 | # Repeat a string four times 71 | print("Output #19: {0:s} {1:s}{2:s}".format("She is", "very "*4, "beautiful.")) 72 | 73 | # Determine the number of characters in a string, including spaces and punctuation 74 | m = len(sentence) 75 | print("Output #20: {0:d}".format(m)) 76 | 77 | # split() 78 | string1 = "My deliverable is due in May" 79 | string1_list1 = string1.split() 80 | string1_list2 = string1.split(" ", 2) 81 | print("Output #21: {0}".format(string1_list1)) 82 | print("Output #22: FIRST PIECE:{0} SECOND PIECE:{1} THIRD PIECE:{2}"\ 83 | .format(string1_list2[0], string1_list2[1], string1_list2[2])) 84 | 85 | string2 = "Your,deliverable,is,due,in,June" 86 | string2_list = string2.split(',') 87 | print("Output #23: {0}".format(string2_list)) 88 | print("Output #24: {0} {1} {2}".format(string2_list[1], string2_list[5], string2_list[-1])) 89 | 90 | # join() 91 | print("Output #25: {0}".format(','.join(string2_list))) 92 | 93 | # strip() 94 | string3 = " Remove unwanted characters from this string\t\t \n" 95 | print("Output #26: string3: {0:s}".format(string3)) 96 | string3_lstrip = string3.lstrip() 97 | print("Output #27: lstrip: {0:s}".format(string3_lstrip)) 98 | string3_rstrip = string3.rstrip() 99 | print("Output #28: rstrip: {0:s}".format(string3_rstrip)) 100 | string3_strip = string3.strip() 101 | print("Output #29: strip: {0:s}".format(string3_strip)) 102 | 103 | string4 = "$$Here's another string that has unwanted characters.__---++" 104 | print("Output #30: {0:s}".format(string4)) 105 | string4 = "$$The unwanted characters have been removed.__---++" 106 | string4_strip = string4.strip('$_-+') 107 | print("Output #31: {0:s}".format(string4_strip)) 108 | 109 | # replace() 110 | string5 = "Let's replace the spaces in this sentence with other characters." 111 | string5_replace = string5.replace(" ", "!@!") 112 | print("Output #32 (with !@!): {0:s}".format(string5_replace)) 113 | string5_replace = string5.replace(" ", ",") 114 | print("Output #33 (with commas): {0:s}".format(string5_replace)) 115 | 116 | # lower(), upper(), capitalize() 117 | string6 = "Here's WHAT Happens WHEN You Use lower." 118 | print("Output #34: {0:s}".format(string6.lower())) 119 | 120 | string7 = "Here's what Happens when You Use UPPER." 121 | print("Output #35: {0:s}".format(string7.upper())) 122 | 123 | string8 = "here's WHAT Happens WHEN you use Capitalize." 124 | print("Output #36: {0:s}".format(string8.capitalize())) 125 | string8_list = string8.split() 126 | print("Output #37 (on each word):") 127 | for word in string8_list: 128 | print("{0:s}".format(word.capitalize())) 129 | 130 | # REGULAR EXPRESSIONS / PATTERN MATCHING 131 | # Count the number of times a pattern appears in a string 132 | string = "The quick brown fox jumps over the lazy dog." 133 | string_list = string.split() 134 | pattern = re.compile(r"The", re.I) 135 | count = 0 136 | for word in string_list: 137 | if pattern.search(word): 138 | count += 1 139 | print("Output #38: {0:d}".format(count)) 140 | 141 | # Print the pattern each time it is found in the string 142 | string = "The quick brown fox jumps over the lazy dog." 
143 | string_list = string.split() 144 | pattern = re.compile(r"(?P<match_word>The)", re.I) 145 | print("Output #39:") 146 | for word in string_list: 147 | if pattern.search(word): 148 | print("{:s}".format(pattern.search(word).group('match_word'))) 149 | 150 | # Substitute the letter "a" for the word "the" in the string 151 | string = "The quick brown fox jumps over the lazy dog." 152 | string_to_find = r"The" 153 | pattern = re.compile(string_to_find, re.I) 154 | print("Output #40: {:s}".format(pattern.sub("a", string))) 155 | 156 | # DATES 157 | # Print today's date, as well as the year, month, and day elements 158 | today = date.today() 159 | print("Output #41: today: {0!s}".format(today)) 160 | print("Output #42: {0!s}".format(today.year)) 161 | print("Output #43: {0!s}".format(today.month)) 162 | print("Output #44: {0!s}".format(today.day)) 163 | current_datetime = datetime.today() 164 | print("Output #45: {0!s}".format(current_datetime)) 165 | 166 | # Calculate a new date using a timedelta 167 | one_day = timedelta(days=-1) 168 | yesterday = today + one_day 169 | print("Output #46: yesterday: {0!s}".format(yesterday)) 170 | eight_hours = timedelta(hours=-8) 171 | print("Output #47: {0!s} {1!s}".format(eight_hours.days, eight_hours.seconds)) 172 | 173 | # Calculate the amount of time between two dates and grab the first element, the number of days 174 | date_diff = today - yesterday 175 | print("Output #48: {0!s}".format(date_diff)) 176 | print("Output #49: {0!s}".format(str(date_diff).split()[0])) 177 | 178 | # Create a string with a specific format from a date object 179 | print("Output #50: {:s}".format(today.strftime('%m/%d/%Y'))) 180 | print("Output #51: {:s}".format(today.strftime('%b %d, %Y'))) 181 | print("Output #52: {:s}".format(today.strftime('%Y-%m-%d'))) 182 | print("Output #53: {:s}".format(today.strftime('%B %d, %Y'))) 183 | 184 | # Create a datetime object with a specific format 185 | # from a string representing a date 186 | date1 = today.strftime('%m/%d/%Y') 187 | date2 = today.strftime('%b %d, %Y') 188 | date3 = today.strftime('%Y-%m-%d') 189 | date4 = today.strftime('%B %d, %Y') 190 | 191 | # Two datetime objects and two date objects 192 | # based on the four strings that have different date formats 193 | print("Output #54: {!s}".format(datetime.strptime(date1, '%m/%d/%Y'))) 194 | print("Output #55: {!s}".format(datetime.strptime(date2, '%b %d, %Y'))) 195 | 196 | # Show the date portion only 197 | print("Output #56: {!s}".format(datetime.date(datetime.strptime\ 198 | (date3, '%Y-%m-%d')))) 199 | print("Output #57: {!s}".format(datetime.date(datetime.strptime\ 200 | (date4, '%B %d, %Y')))) 201 | 202 | # LISTS 203 | # Use square brackets to create a list 204 | # len() counts the number of elements in a list 205 | # max() and min() find the maximum and minimum numbers in numeric lists 206 | # count() counts the number of times a value appears in a list 207 | a_list = [1, 2, 3] 208 | print("Output #58: {}".format(a_list)) 209 | print("Output #59: a_list has {} elements.".format(len(a_list))) 210 | print("Output #60: the maximum value in a_list is {}.".format(max(a_list))) 211 | print("Output #61: the minimum value in a_list is {}.".format(min(a_list))) 212 | another_list = ['printer', 5, ['star', 'circle', 9]] 213 | print("Output #62: {}".format(another_list)) 214 | print("Output #63: another_list also has {} elements.".format(len(another_list))) 215 | print("Output #64: 5 is in another_list {} time.".format(another_list.count(5))) 216 | 217 | # Use list indices to access specific
values in a list 218 | # [0] is the first value; [-1] is the last value 219 | print("Output #65: {}".format(a_list[0])) 220 | print("Output #66: {}".format(a_list[1])) 221 | print("Output #67: {}".format(a_list[2])) 222 | print("Output #68: {}".format(a_list[-1])) 223 | print("Output #69: {}".format(a_list[-2])) 224 | print("Output #70: {}".format(a_list[-3])) 225 | print("Output #71: {}".format(another_list[2])) 226 | print("Output #72: {}".format(another_list[-1])) 227 | 228 | # Use list slices to access a subset of list values 229 | # Do not include the starting index to start from the beginning 230 | # Do not include the ending index to go all of the way to the end 231 | print("Output #73: {}".format(a_list[0:2])) 232 | print("Output #74: {}".format(another_list[:2])) 233 | print("Output #75: {}".format(a_list[1:3])) 234 | print("Output #76: {}".format(another_list[1:])) 235 | 236 | # Use [:] to make a copy of a list 237 | a_new_list = a_list[:] 238 | print("Output #77: {}".format(a_new_list)) 239 | 240 | # Use + to add two or more lists together 241 | a_longer_list = a_list + another_list # to add lists together 242 | print("Output #78: {}".format(a_longer_list)) 243 | 244 | # Use 'in' and 'not in' to check whether specific values are or are not in a list 245 | a = 2 in a_list 246 | print("Output #79: {}".format(a)) 247 | if 2 in a_list: 248 | print("Output #80: 2 is in {}.".format(a_list)) 249 | b = 6 not in a_list 250 | print("Output #81: {}".format(b)) 251 | if 6 not in a_list: 252 | print("Output #82: 6 is not in {}.".format(a_list)) 253 | 254 | # Use append() to add additional values to the end of the list 255 | # Use remove() to remove specific values from the list 256 | # Use pop() to remove values from the end of the list 257 | a_list.append(4) 258 | a_list.append(5) 259 | a_list.append(6) 260 | print("Output #83: {}".format(a_list)) 261 | a_list.remove(5) 262 | print("Output #84: {}".format(a_list)) 263 | a_list.pop() 264 | a_list.pop() 265 | print("Output #85: {}".format(a_list)) 266 | 267 | # Use reverse() to reverse a list, in-place, meaning it changes the list 268 | # To reverse a list without changing the original list, make a copy first 269 | a_list.reverse() 270 | print("Output #86: {}".format(a_list)) 271 | a_list_copy = a_list[:] 272 | a_list_copy.reverse() 273 | print("Output #87: {}".format(a_list_copy)) 274 | 275 | # Use sort() to sort a list, in-place, meaning it changes the list 276 | # To sort a list without changing the original list, make a copy first 277 | unordered_list = [3, 5, 1, 7, 2, 8, 4, 9, 0, 6] 278 | print("Output #88: {}".format(unordered_list)) 279 | list_copy = unordered_list[:] 280 | list_copy.sort() 281 | print("Output #89: {}".format(list_copy)) 282 | print("Output #90: {}".format(unordered_list)) 283 | 284 | # Use sorted() to sort a collection of lists by a position in the lists 285 | my_lists = [[1,2,3,4], [4,3,2,1], [2,4,1,3]] 286 | my_lists_sorted_by_index_3 = sorted(my_lists, key=lambda index_value: index_value[3]) 287 | print("Output #91: {}".format(my_lists_sorted_by_index_3)) 288 | 289 | # Use itemgetter() to sort a collection of lists by two index positions 290 | my_lists = [[123,2,2,444], [22,6,6,444], [354,4,4,678], [236,5,5,678], \ 291 | [578,1,1,290], [461,1,1,290]] 292 | my_lists_sorted_by_index_3_and_0 = sorted(my_lists, key=itemgetter(3,0)) 293 | print("Output #92: {}".format(my_lists_sorted_by_index_3_and_0)) 294 | 295 | # TUPLES 296 | # Use parentheses to create a tuple 297 | my_tuple = ('x', 'y', 'z') 298 | print("Output 
#93: {}".format(my_tuple)) 299 | print("Output #94: my_tuple has {} elements".format(len(my_tuple))) 300 | print("Output #95: {}".format(my_tuple[1])) 301 | longer_tuple = my_tuple + my_tuple 302 | print("Output #96: {}".format(longer_tuple)) 303 | 304 | # Unpack tuples with the left-hand side of an assignment operator 305 | one, two, three = my_tuple 306 | print("Output #97: {0} {1} {2}".format(one, two, three)) 307 | var1 = 'red' 308 | var2 = 'robin' 309 | print("Output #98: {} {}".format(var1, var2)) 310 | # Swap values between variables 311 | var1, var2 = var2, var1 312 | print("Output #99: {} {}".format(var1, var2)) 313 | 314 | # Convert tuples to lists and lists to tuples 315 | my_list = [1, 2, 3] 316 | my_tuple = ('x', 'y', 'z') 317 | print("Output #100: {}".format(tuple(my_list))) 318 | print("Output #101: {}".format(list(my_tuple))) 319 | 320 | # DICTIONARIES 321 | # Use curly braces to create a dictionary 322 | # Use a colon between keys and values in each pair 323 | # len() counts the number of key-value pairs in a dictionary 324 | empty_dict = { } 325 | a_dict = {'one':1, 'two':2, 'three':3} 326 | print("Output #102: {}".format(a_dict)) 327 | print("Output #103: a_dict has {!s} elements".format(len(a_dict))) 328 | another_dict = {'x':'printer', 'y':5, 'z':['star', 'circle', 9]} 329 | print("Output #104: {}".format(another_dict)) 330 | print("Output #105: another_dict also has {!s} elements"\ 331 | .format(len(another_dict))) 332 | 333 | # Use keys to access specific values in a dictionary 334 | print("Output #106: {}".format(a_dict['two'])) 335 | print("Output #107: {}".format(another_dict['z'])) 336 | 337 | # Use copy() to make a copy of a dictionary 338 | a_new_dict = a_dict.copy() 339 | print("Output #108: {}".format(a_new_dict)) 340 | 341 | # Use keys(), values(), and items() to access 342 | # a dictionary's keys, values, and key-value pairs, respectively 343 | print("Output #109: {}".format(a_dict.keys())) 344 | a_dict_keys = a_dict.keys() 345 | print("Output #110: {}".format(a_dict_keys)) 346 | print("Output #111: {}".format(a_dict.values())) 347 | print("Output #112: {}".format(a_dict.items())) 348 | 349 | # Use in, not in, and get to test 350 | # whether a key is in a dictionary 351 | if 'y' in another_dict: 352 | print("Output #114: y is a key in another_dict: {}."\ 353 | .format(another_dict.keys())) 354 | 355 | if 'c' not in another_dict: 356 | print("Output #115: c is not a key in another_dict: {}."\ 357 | .format(another_dict.keys())) 358 | 359 | print("Output #116: {!s}".format(a_dict.get('three'))) 360 | print("Output #117: {!s}".format(a_dict.get('four'))) 361 | print("Output #118: {!s}".format(a_dict.get('four', 'Not in dict'))) 362 | 363 | # Use sorted() to sort a dictionary 364 | # To sort a dictionary without changing the original dictionary, 365 | # make a copy first 366 | print("Output #119: " + str(a_dict)) 367 | dict_copy = a_dict.copy() 368 | ordered_dict1 = sorted(dict_copy.items(), key=lambda item: item[0]) 369 | print("Output #120 (order by keys): {}".format(ordered_dict1)) 370 | ordered_dict2 = sorted(dict_copy.items(), key=lambda item: item[1]) 371 | print("Output #121 (order by values): {}".format(ordered_dict2)) 372 | ordered_dict3 = sorted(dict_copy.items(), key=lambda x: x[1], reverse=True) 373 | print("Output #122 (order by values, descending): {}".format(ordered_dict3)) 374 | ordered_dict4 = sorted(dict_copy.items(), key=lambda x: x[1], reverse=False) 375 | print("Output #123 (order by values, ascending): {}".format(ordered_dict4)) 376 | 377 | 
# CONTROL FLOW 378 | # if-else statement 379 | x = 5 380 | if x > 4 or x != 9: 381 | print("Output #124: {}".format(x)) 382 | else: 383 | print("Output #125: x is not greater than 4") 384 | 385 | # if-elif-else statement 386 | if x > 6: 387 | print("Output #126: x is greater than six") 388 | elif x > 4 and x == 5: 389 | print("Output #127: {}".format(x*x)) 390 | else: 391 | print("Output #128: x is not greater than 4") 392 | 393 | # for loop 394 | y = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', \ 395 | 'Nov', 'Dec'] 396 | z = ['Annie', 'Betty', 'Claire', 'Daphne', 'Ellie', 'Franchesca', 'Greta', \ 397 | 'Holly', 'Isabel', 'Jenny'] 398 | 399 | print("Output #129:") 400 | for month in y: 401 | print("{!s}".format(month)) 402 | 403 | print("Output #130: (index value: name in list)") 404 | for i in range(len(z)): 405 | print("{0!s}: {1:s}".format(i, z[i])) 406 | 407 | print("Output #131: (access elements in y with z's index values)") 408 | for j in range(len(z)): 409 | if y[j].startswith('J'): 410 | print("{!s}".format(y[j])) 411 | 412 | print("Output #132:") 413 | for key, value in another_dict.items(): 414 | print("{0:s}, {1}".format(key, value)) 415 | 416 | # compact for loops 417 | # list, set, and dictionary comprehensions 418 | # Select specific rows using a list comprehension 419 | my_data = [[1,2,3], [4,5,6], [7,8,9]] 420 | rows_to_keep = [row for row in my_data if row[2] > 5] 421 | print("Output #133 (list comprehension): {}".format(rows_to_keep)) 422 | 423 | # Select a set of unique tuples in a list using a set comprehension 424 | my_data = [(1,2,3), (4,5,6), (7,8,9), (7,8,9)] 425 | set_of_tuples1 = {x for x in my_data} 426 | print("Output #134 (set comprehension): {}".format(set_of_tuples1)) 427 | set_of_tuples2 = set(my_data) 428 | print("Output #135 (set function): {}".format(set_of_tuples2)) 429 | 430 | # Select specific key-value pairs using a dictionary comprehension 431 | my_dictionary = {'customer1': 7, 'customer2': 9, 'customer3': 11} 432 | my_results = {key : value for key, value in my_dictionary.items() if \ 433 | value > 10} 434 | print("Output #136 (dictionary comprehension): {}".format(my_results)) 435 | 436 | # while loop 437 | print("Output #137:") 438 | x = 0 439 | while x < 11: 440 | print("{!s}".format(x)) 441 | x += 1 442 | 443 | # FUNCTIONS 444 | # Calculate the mean of a sequence of numeric values 445 | def getMean(numericValues): 446 | return sum(numericValues)/len(numericValues) if len(numericValues) > 0 \ 447 | else float('nan') 448 | 449 | my_list = [2, 2, 4, 4, 6, 6, 8, 8] 450 | print("Output #138 (mean): {!s}".format(getMean(my_list))) 451 | 452 | #import numpy as np 453 | #print np.mean(my_list) 454 | 455 | # EXCEPTIONS 456 | # Calculate the mean of a sequence of numeric values 457 | def getMean(numericValues): 458 | return sum(numericValues)/len(numericValues) 459 | 460 | my_list2 = [ ] 461 | # Short version 462 | try: 463 | print("Output #139: {}".format(getMean(my_list2))) 464 | except ZeroDivisionError as detail: 465 | print("Output #139 (Error): {}".format(float('nan'))) 466 | print("Output #139 (Error): {}".format(detail)) 467 | 468 | # Long version 469 | try: 470 | result = getMean(my_list2) 471 | except ZeroDivisionError as detail: 472 | print("Output #140 (Error): {}".format(float('nan'))) 473 | print("Output #140 (Error): {}".format(detail)) 474 | else: 475 | print("Output #140 (The mean is): {}".format(result)) 476 | finally: 477 | print("Output #140 (Finally): The finally block is executed every time") 478 | 479 | # 
READ A FILE 480 | # Read a single text file 481 | #input_file = sys.argv[1] 482 | 483 | ## Read a text file (older method) ## 484 | #print("Output #141:") 485 | #filereader = open(input_file, 'r', newline='') 486 | #for row in filereader: 487 | # print("{}".format(row.strip())) 488 | #filereader.close() 489 | 490 | ## Read a text file (newer method) ## 491 | #print("Output #142:") 492 | #with open(input_file, 'r', newline='') as filereader: 493 | # for row in filereader: 494 | # print("{}".format(row.strip())) 495 | 496 | #print("Output #143:") 497 | # READ MULTIPLE FILES 498 | # Read multiple text files 499 | #inputPath = sys.argv[1] 500 | #for input_file in glob.glob(os.path.join(inputPath,'*.txt')): 501 | # with open(input_file, 'r', newline='') as filereader: 502 | # for row in filereader: 503 | # print("{}".format(row.strip())) 504 | 505 | # WRITE TO A FILE 506 | # Write to a text file 507 | #my_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] 508 | #max_index = len(my_letters) 509 | #output_file = sys.argv[1] 510 | #filewriter = open(output_file, 'w') 511 | #for index_value in range(len(my_letters)): 512 | # if index_value < (max_index-1): 513 | # filewriter.write(my_letters[index_value]+'\t') 514 | # else: 515 | # filewriter.write(my_letters[index_value]+'\n') 516 | #filewriter.close() 517 | #print("Output #144: Output written to file") 518 | 519 | # Write to a CSV file 520 | #my_numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 521 | #max_index = len(my_numbers) 522 | #output_file = sys.argv[1] 523 | #filewriter = open(output_file, 'a') 524 | #for index_value in range(len(my_numbers)): 525 | # if index_value < (max_index-1): 526 | # filewriter.write(str(my_numbers[index_value])+',') 527 | # else: 528 | # filewriter.write(str(my_numbers[index_value])+'\n') 529 | #filewriter.close() 530 | #print("Output #145: Output appended to file") 531 | --------------------------------------------------------------------------------