├── src
    └── README.md
├── input
    └── README.md
├── output
    └── README.md
├── run.sh
├── insight_testsuite
    ├── tests
    │   └── test_1
    │   │   ├── output
    │   │       └── report.csv
    │   │   └── input
    │   │       └── Border_Crossing_Entry_Data.csv
    └── run_tests.sh
└── README.md


/src/README.md:
--------------------------------------------------------------------------------
1 | This is the directory where your source code would reside
2 | 


--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | This is the directory where your program would find any test input files.
2 | 


--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | This is the directory where your program would write any test output files.
2 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Use this shell script to compile (if necessary) your code and then execute it. Below is an example of what might be found in this file if your program was written in Python 3.7
4 | # python3.7 ./src/border_analytics.py ./input/Border_Crossing_Entry_Data.csv ./output/report.csv
5 | 


--------------------------------------------------------------------------------
/insight_testsuite/tests/test_1/output/report.csv:
--------------------------------------------------------------------------------
1 | Border,Date,Measure,Value,Average
2 | US-Mexico Border,03/01/2019 12:00:00 AM,Pedestrians,346158,114487
3 | US-Canada Border,03/01/2019 12:00:00 AM,Truck Containers Full,6483,0
4 | US-Canada Border,03/01/2019 12:00:00 AM,Trains,19,0
5 | US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,172163,56810
6 | US-Canada Border,02/01/2019 12:00:00 AM,Truck Containers Empty,1319,0
7 | US-Mexico Border,01/01/2019 12:00:00 AM,Pedestrians,56810,0
8 | 


--------------------------------------------------------------------------------
/insight_testsuite/tests/test_1/input/Border_Crossing_Entry_Data.csv:
--------------------------------------------------------------------------------
1 | Port Name,State,Port Code,Border,Date,Measure,Value,Location
2 | Derby Line,Vermont,209,US-Canada Border,03/01/2019 12:00:00 AM,Truck Containers Full,6483,POINT (-72.09944 45.005)
3 | Norton,Vermont,211,US-Canada Border,03/01/2019 12:00:00 AM,Trains,19,POINT (-71.79528000000002 45.01)
4 | Calexico,California,2503,US-Mexico Border,03/01/2019 12:00:00 AM,Pedestrians,346158,POINT (-115.49806000000001 32.67889)
5 | Hidalgo,Texas,2305,US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,156891,POINT (-98.26278 26.1)
6 | Frontier,Washington,3020,US-Canada Border,02/01/2019 12:00:00 AM,Truck Containers Empty,1319,POINT (-117.78134000000001 48.910160000000005)
7 | Presidio,Texas,2403,US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,15272,POINT (-104.37167 29.56056)
8 | Eagle Pass,Texas,2303,US-Mexico Border,01/01/2019 12:00:00 AM,Pedestrians,56810,POINT (-100.49917 28.70889)
9 | 


--------------------------------------------------------------------------------
/insight_testsuite/run_tests.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | declare -r color_start="\033["  # ANSI escape prefix; expanded later by `echo -e`
  4 | declare -r color_red="${color_start}0;31m"
  5 | declare -r color_green="${color_start}0;32m"
  6 | declare -r color_blue="${color_start}0;34m"
  7 | declare -r color_norm="${color_start}0m"  # resets the colour back to the default
  8 | # Directory holding this script; quoted so a path with spaces survives dirname.
  9 | GRADER_ROOT=$(dirname "${BASH_SOURCE[0]}")
 10 | # Root of the project under test: the parent of the grader directory.
 11 | PROJECT_PATH=${GRADER_ROOT}/..
 12 | # Print a listing of the directory given as $1, highlighted in blue.
 13 | function print_dir_contents {
 14 |   local proj_path=$1
 15 |   echo "Project contents:"
 16 |   echo -e "${color_blue}$(ls -- "${proj_path}")${color_norm}"  # quoted: path may contain spaces
 17 | }
 18 | # Exit with status 1, after listing the project directory, unless $1/$2 exists.
 19 | function find_file_or_dir_in_project {
 20 |   local proj_path=$1
 21 |   local file_or_dir_name=$2
 22 |   if [[ ! -e "${proj_path}/${file_or_dir_name}" ]]; then
 23 |     echo -e "[${color_red}FAIL${color_norm}]: no ${file_or_dir_name} found"
 24 |     print_dir_contents "${proj_path}"  # quoted so the path reaches the helper as one word
 25 |     echo -e "${color_red}${file_or_dir_name} [MISSING]${color_norm}"
 26 |     exit 1
 27 |   fi
 28 | }
 29 | 
 30 | # check project directory structure
 31 | function check_project_struct {
 32 |   find_file_or_dir_in_project ${PROJECT_PATH} run.sh
 33 |   find_file_or_dir_in_project ${PROJECT_PATH} src
 34 |   find_file_or_dir_in_project ${PROJECT_PATH} input
 35 |   find_file_or_dir_in_project ${PROJECT_PATH} output
 36 | }
 37 | 
 38 | # setup testing output folder
 39 | function setup_testing_input_output {
 40 |   TEST_OUTPUT_PATH=${GRADER_ROOT}/temp
 41 |   if [ -d ${TEST_OUTPUT_PATH} ]; then
 42 |     rm -rf ${TEST_OUTPUT_PATH}
 43 |   fi
 44 | 
 45 |   mkdir -p ${TEST_OUTPUT_PATH}
 46 | 
 47 |   cp -r ${PROJECT_PATH}/src ${TEST_OUTPUT_PATH}
 48 |   cp -r ${PROJECT_PATH}/run.sh ${TEST_OUTPUT_PATH}
 49 |   cp -r ${PROJECT_PATH}/input ${TEST_OUTPUT_PATH}
 50 |   cp -r ${PROJECT_PATH}/output ${TEST_OUTPUT_PATH}
 51 | 
 52 |   rm -r ${TEST_OUTPUT_PATH}/input/*
 53 |   rm -r ${TEST_OUTPUT_PATH}/output/*
 54 |   cp -r ${GRADER_ROOT}/tests/${test_folder}/input/Border_Crossing_Entry_Data.csv ${TEST_OUTPUT_PATH}/input/Border_Crossing_Entry_Data.csv
 55 | }
 56 | # Compare the produced report.csv with the expected one; bump PASS_CNT on a match.
 57 | function compare_outputs {
 58 |   NUM_OUTPUT_FILES_PASSED=0
 59 |   OUTPUT_FILENAME=report.csv
 60 |   PROJECT_ANSWER_PATH1=${GRADER_ROOT}/temp/output/${OUTPUT_FILENAME}
 61 |   TEST_ANSWER_PATH1=${GRADER_ROOT}/tests/${test_folder}/output/${OUTPUT_FILENAME}
 62 | 
 63 |   # Use diff's exit status: counting output lines treats a failed diff (e.g. missing expected file) as a match.
 64 |   if [[ -f "${PROJECT_ANSWER_PATH1}" ]] && diff -bB "${PROJECT_ANSWER_PATH1}" "${TEST_ANSWER_PATH1}" > /dev/null 2>&1; then
 65 |     echo -e "[${color_green}PASS${color_norm}]: ${test_folder} ${OUTPUT_FILENAME}"
 66 |     NUM_OUTPUT_FILES_PASSED=$((NUM_OUTPUT_FILES_PASSED + 1))
 67 |   else
 68 |     echo -e "[${color_red}FAIL${color_norm}]: ${test_folder}"
 69 |     diff "${PROJECT_ANSWER_PATH1}" "${TEST_ANSWER_PATH1}"  # show the differences (or diff's own error)
 70 |   fi
 71 | 
 72 |   if [ "${NUM_OUTPUT_FILES_PASSED}" -eq "1" ]; then
 73 |     PASS_CNT=$((PASS_CNT + 1))
 74 |   fi
 75 | 
 76 | }
 77 | # Run every test directory under ${GRADER_ROOT}/tests and report the pass count.
 78 | function run_all_tests {
 79 |   local test_dir summary
 80 |   NUM_TESTS=0
 81 |   PASS_CNT=0
 82 | 
 83 |   # Glob rather than parse `ls`: names are not word-split and stray files are skipped.
 84 |   for test_dir in "${GRADER_ROOT}"/tests/*/; do
 85 |     [[ -d "${test_dir}" ]] || continue  # an unmatched glob is left as a literal string
 86 |     test_folder=$(basename "${test_dir}")  # global: read by the setup/compare helpers
 87 |     NUM_TESTS=$((NUM_TESTS + 1))
 88 |     setup_testing_input_output
 89 |     # Subshell keeps the cd local, so a relative GRADER_ROOT stays valid afterwards.
 90 |     (cd "${GRADER_ROOT}/temp" && bash run.sh 2>&1)
 91 |     compare_outputs
 92 |   done
 93 | 
 94 |   summary="[$(date)] ${PASS_CNT} of ${NUM_TESTS} tests passed"
 95 |   echo "${summary}"
 96 |   echo "${summary}" >> "${GRADER_ROOT}/results.txt"
 97 | }
 98 | # Entry point: validate the project layout, then run every test.
 99 | check_project_struct
100 | run_all_tests
101 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Border Crossing Analysis
  2 | 
  3 | ## Table of Contents
  4 | 1. [Problem](README.md#problem)
  5 | 1. [Steps to submit your solution](README.md#steps-to-submit-your-solution)
  6 | 1. [Input Dataset](README.md#input-dataset)
  7 | 1. [Expected output](README.md#expected-output)
  8 | 1. [Instructions](README.md#instructions)
  9 | 1. [Tips on getting an interview](README.md#tips-on-getting-an-interview)
 10 | 1. [Questions?](README.md#questions)
 11 | 
 12 | ## Problem
 13 | The Bureau of Transportation Statistics regularly makes available data on the number of vehicles, equipment, passengers and pedestrians crossing into the United States by land.
 14 | 
 15 | **For this challenge, we want you to calculate the total number of times vehicles, equipment, passengers and pedestrians cross the U.S.-Canadian and U.S.-Mexican borders each month. We also want to know the running monthly average of total number of crossings for that type of crossing and border.**
 16 | 
 17 | ## Steps to submit your solution
 18 | * To submit your entry please use the link you received in your coding challenge invite email
 19 | * You will only be able to submit through the link one time
 20 | * Do NOT attach a file - we will not admit solutions which are attached files
 21 | * Do NOT send your solution over an email - We are unable to accept coding challenges that way
 22 | * For a limited time we have made available a <a href="http://insight-cc-submission.com/test-my-repo-link">website</a> that will allow you to simulate the environment in which we will test your code. 
 23 | 
 24 | ### Creating private repositories
 25 | To avoid plagiarism and any wrongdoing, we request you to submit a private repository of your code. Both GitHub and Bitbucket offer free unlimited private repositories at no extra cost.
 26 | * Create a private repository on GitHub or Bitbucket with the given repository structure. Here is how you will be sharing your private repositories for us to see once you are ready to submit.
 27 | * Add "insight-cc-bot" as a collaborator in your project.
 28 |   * [How to add collaborators on GitHub?](https://help.github.com/articles/inviting-collaborators-to-a-personal-repository/)
 29 |   * [How to add users and groups as collaborators in Bitbucket?](https://confluence.atlassian.com/bitbucket/grant-repository-access-to-users-and-groups-221449716.html)
 30 | * **We will NOT be grading submissions we do not have access to.**
 31 | 
 32 | ### Submitting a link to your repository
 33 | * Use the submission box to enter the link to your GitHub or Bitbucket repo ONLY
 34 | * Link to the specific repo for this project, not your general profile
 35 | * Put any comments in the README inside your project repo, not in the submission box
 36 | 
 37 | ## Input Dataset
 38 | 
 39 | For this challenge, you will be given an input file, `Border_Crossing_Entry_Data.csv`, that will reside in the top-most `input` directory of your repository.
 40 | 
 41 | The file contains data of the form:
 42 | 
 43 | ```
 44 | Port Name,State,Port Code,Border,Date,Measure,Value,Location
 45 | Derby Line,Vermont,209,US-Canada Border,03/01/2019 12:00:00 AM,Truck Containers Full,6483,POINT (-72.09944 45.005)
 46 | Norton,Vermont,211,US-Canada Border,03/01/2019 12:00:00 AM,Trains,19,POINT (-71.79528000000002 45.01)
 47 | Calexico,California,2503,US-Mexico Border,03/01/2019 12:00:00 AM,Pedestrians,346158,POINT (-115.49806000000001 32.67889)
 48 | Hidalgo,Texas,2305,US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,156891,POINT (-98.26278 26.1)
 49 | Frontier,Washington,3020,US-Canada Border,02/01/2019 12:00:00 AM,Truck Containers Empty,1319,POINT (-117.78134000000001 48.910160000000005)
 50 | Presidio,Texas,2403,US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,15272,POINT (-104.37167 29.56056)
 51 | Eagle Pass,Texas,2303,US-Mexico Border,01/01/2019 12:00:00 AM,Pedestrians,56810,POINT (-100.49917 28.70889)
 52 | ```
 53 | See the [notes from the Bureau of Transportation Statistics](https://data.transportation.gov/Research-and-Statistics/Border-Crossing-Entry-Data/keg4-3bc2) for more information on each field.
 54 | 
 55 | For the purposes of this challenge, you'll want to pay attention to the following fields:
 56 | * `Border`: Designates what border was crossed
 57 | * `Date`: Timestamp indicating month and year of crossing
 58 | * `Measure`: Indicates means, or type, of crossing being measured (e.g., vehicle, equipment, passenger or pedestrian)
 59 | * `Value`: Number of crossings
 60 | 
 61 | ## Expected Output
 62 | Using the input file, you must write a program to 
 63 | * Sum the total number of crossings (`Value`) of each type of vehicle or equipment, or passengers or pedestrians, that crossed the border that month, regardless of what port was used. 
 64 | * Calculate the running monthly average of total crossings, rounded to the nearest whole number, for that combination of `Border` and `Measure`, or means of crossing.
 65 | 
 66 | Your program must write the requested output data to a file named `report.csv` in the top-most `output` directory of your repository.
 67 | 
 68 | For example, given the above input file, the correct output file, `report.csv` would be:
 69 | 
 70 | ```
 71 | Border,Date,Measure,Value,Average
 72 | US-Mexico Border,03/01/2019 12:00:00 AM,Pedestrians,346158,114487
 73 | US-Canada Border,03/01/2019 12:00:00 AM,Truck Containers Full,6483,0
 74 | US-Canada Border,03/01/2019 12:00:00 AM,Trains,19,0
 75 | US-Mexico Border,02/01/2019 12:00:00 AM,Pedestrians,172163,56810
 76 | US-Canada Border,02/01/2019 12:00:00 AM,Truck Containers Empty,1319,0
 77 | US-Mexico Border,01/01/2019 12:00:00 AM,Pedestrians,56810,0
 78 | 
 79 | ```
 80 | 
 81 | The lines should be sorted in descending order by 
 82 | * `Date`
 83 | * `Value` (or number of crossings)
 84 | * `Measure`
 85 | * `Border`
 86 | 
 87 | The column, `Average`, is for the running monthly average of total crossings for that border and means of crossing in all previous months. In this example, to calculate the `Average` for the first line (i.e., running monthly average of total pedestrians crossing the US-Mexico Border in all of the months preceding March), you'd take the average sum of total number of US-Mexico pedestrian crossings in February `156,891 + 15,272 = 172,163` and January `56,810`, and round it to the nearest whole number `round(228,973/2) = 114,487`
 88 | 
 89 | ## Instructions
 90 | 
 91 | We designed this coding challenge to assess your coding skills and your understanding of computer science fundamentals. They are both prerequisites of becoming a data engineer. To solve this challenge you might pick a programming language of your choice (preferably Python, Scala, Java, or C/C++ because they are commonly used and will help us better assess you), but you are only allowed to use the default data structures that come with that programming language (you might use I/O libraries). For example, you can code in Python, but you should not use Pandas or any other external libraries (i.e., don't use Python modules that must be installed using 'pip').
 92 | 
 93 | ***The objective here is to see if you can implement the solution using basic data structure building blocks and software engineering best practices (by writing clean, modular, and well-tested code).***
 94 | 
 95 | 
 96 | # Tips on getting an interview
 97 | 
 98 | ## Writing clean, scalable and well-tested code
 99 | 
100 | As a data engineer, it’s important that you write clean, well-documented code that scales for a large amount of data. For this reason, it’s important to ensure that your solution works well for a large number of records, rather than just the above example.
101 | 
102 | [Here](https://data.transportation.gov/api/views/keg4-3bc2/rows.csv?accessType=DOWNLOAD) you can find large datasets to test your code (see [here](https://data.transportation.gov/Research-and-Statistics/Border-Crossing-Entry-Data/keg4-3bc2) for more information on the data dictionary).
103 | 
104 | Note, we will use it to test the full functionality of your code, along with other tests.
105 | 
106 | It's also important to use software engineering best practices like unit tests, especially since data is not always clean and predictable.
107 | 
108 | Before submitting your solution you should summarize your approach and run instructions (if any) in your `README`.
109 | 
110 | You may write your solution in any mainstream programming language, such as C, C++, C#, Go, Java, Python, Ruby, or Scala. Once completed, submit a link to your GitHub or Bitbucket repo with your source code.
111 | 
112 | In addition to the source code, the top-most directory of your repo must include the `input` and `output` directories, and a shell script named `run.sh` that compiles and runs the program(s) that implement(s) the required features.
113 | 
114 | If your solution requires additional libraries, environments, or dependencies, you must specify these in your `README` documentation. See the figure below for the required structure of the top-most directory in your repo, or simply clone this repo.
115 | 
116 | ## Repo directory structure
117 | 
118 | The directory structure for your repo should look like this:
119 | 
120 |     ├── README.md
121 |     ├── run.sh
122 |     ├── src
123 |     │   └── border_analytics.py
124 |     ├── input
125 |     │   └── Border_Crossing_Entry_Data.csv
126 |     ├── output
127 |     |   └── report.csv
128 |     ├── insight_testsuite
129 |         └── run_tests.sh
130 |         └── tests
131 |             └── test_1
132 |             |   ├── input
133 |             |   │   └── Border_Crossing_Entry_Data.csv
134 |             |   |__ output
135 |             |   │   └── report.csv
136 |             ├── your-own-test_1
137 |                 ├── input
138 |                 │   └── Border_Crossing_Entry_Data.csv
139 |                 |── output
140 |                     └── report.csv
141 | 
142 | **Don't fork this repo** and don't use this `README` instead of your own. The content of `src` does not need to be a single file called `border_analytics.py`, which is only an example. Instead, you should include your own source files and give them expressive names.
143 | 
144 | ## Testing your directory structure and output format
145 | 
146 | To make sure that your code has the correct directory structure and that the format of the output files is correct, we have included a test script called `run_tests.sh` in the `insight_testsuite` folder.
147 | 
148 | The tests are stored simply as text files under the `insight_testsuite/tests` folder. Each test should have a separate folder with an `input` folder with the test `Border_Crossing_Entry_Data.csv` input file and an `output` folder with what should be the expected `report.csv` for that test.
149 | 
150 | You can run the test with the following command from within the `insight_testsuite` folder:
151 | 
152 |     insight_testsuite~$ ./run_tests.sh
153 | 
154 | On a failed test, the output of `run_tests.sh` should look like:
155 | 
156 |     [FAIL]: test_1
157 |     [Thu Mar 30 16:28:01 PDT 2017] 0 of 1 tests passed
158 | 
159 | On success:
160 | 
161 |     [PASS]: test_1
162 |     [Thu Mar 30 16:25:57 PDT 2017] 1 of 1 tests passed
163 | 
164 | 
165 | 
166 | One test has been provided as a way to check your formatting and simulate how we will be running tests when you submit your solution. We urge you to write your own additional tests. `test_1` is intended to alert you if the directory structure or the output for this test is incorrect.
167 | 
168 | Your submission must pass at least the provided test in order to pass the coding challenge.
169 | 
170 | The <a href="http://insight-cc-submission.com/test-my-repo-link">website</a> that we mentioned earlier, which can be used to test your code, has been primarily tested on Python code but could be used for Java and C++ repos. Keep in mind that if you need to compile your code (e.g., javac, make), that compilation needs to happen in the `run.sh` file of your code repository. For Python programmers, you are able to use Python2 or Python3, but if you use the latter, specify `python3`, which defaults to Python 3.5.2, in your `run.sh` script, or `python3.7` if you use that version.
171 | 
172 | # Questions?
173 | Email us at cc@insightdataengineering.com
174 | 


--------------------------------------------------------------------------------