├── src └── README.md ├── input └── README.md ├── output └── README.md ├── insight_testsuite ├── tests │ └── test_1 │ │ ├── output │ │ ├── top_10_states.txt │ │ └── top_10_occupations.txt │ │ └── input │ │ └── h1b_input.csv └── run_tests.sh ├── run.sh └── README.md /src/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /input/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /insight_testsuite/tests/test_1/output/top_10_states.txt: -------------------------------------------------------------------------------- 1 | TOP_STATES;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE 2 | FL;2;20.0% 3 | AL;1;10.0% 4 | CA;1;10.0% 5 | DE;1;10.0% 6 | GA;1;10.0% 7 | MD;1;10.0% 8 | NJ;1;10.0% 9 | TX;1;10.0% 10 | WA;1;10.0% 11 | -------------------------------------------------------------------------------- /insight_testsuite/tests/test_1/output/top_10_occupations.txt: -------------------------------------------------------------------------------- 1 | TOP_OCCUPATIONS;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE 2 | SOFTWARE DEVELOPERS, APPLICATIONS;6;60.0% 3 | ACCOUNTANTS AND AUDITORS;1;10.0% 4 | COMPUTER OCCUPATIONS, ALL OTHER;1;10.0% 5 | COMPUTER SYSTEMS ANALYST;1;10.0% 6 | DATABASE ADMINISTRATORS;1;10.0% 7 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Use this shell script to compile (if necessary) your code and then execute it. Below is an example of what might be found in this file if your program was written in Python 4 | # 5 | #python ./src/h1b_counting.py ./input/h1b_input.csv ./output/top_10_occupations.txt ./output/top_10_states.txt 6 | 7 | -------------------------------------------------------------------------------- /insight_testsuite/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare -r color_start="\033[" 4 | declare -r color_red="${color_start}0;31m" 5 | declare -r color_green="${color_start}0;32m" 6 | declare -r color_blue="${color_start}0;34m" 7 | declare -r color_norm="${color_start}0m" 8 | 9 | GRADER_ROOT=$(dirname ${BASH_SOURCE}) 10 | 11 | PROJECT_PATH=${GRADER_ROOT}/.. 12 | 13 | function print_dir_contents { 14 | local proj_path=$1 15 | echo "Project contents:" 16 | echo -e "${color_blue}$(ls ${proj_path})${color_norm}" 17 | } 18 | 19 | function find_file_or_dir_in_project { 20 | local proj_path=$1 21 | local file_or_dir_name=$2 22 | if [[ ! 
-e "${proj_path}/${file_or_dir_name}" ]]; then 23 | echo -e "[${color_red}FAIL${color_norm}]: no ${file_or_dir_name} found" 24 | print_dir_contents ${proj_path} 25 | echo -e "${color_red}${file_or_dir_name} [MISSING]${color_norm}" 26 | exit 1 27 | fi 28 | } 29 | 30 | # check project directory structure 31 | function check_project_struct { 32 | find_file_or_dir_in_project ${PROJECT_PATH} run.sh 33 | find_file_or_dir_in_project ${PROJECT_PATH} src 34 | find_file_or_dir_in_project ${PROJECT_PATH} input 35 | find_file_or_dir_in_project ${PROJECT_PATH} output 36 | } 37 | 38 | # setup testing output folder 39 | function setup_testing_input_output { 40 | TEST_OUTPUT_PATH=${GRADER_ROOT}/temp 41 | if [ -d ${TEST_OUTPUT_PATH} ]; then 42 | rm -rf ${TEST_OUTPUT_PATH} 43 | fi 44 | 45 | mkdir -p ${TEST_OUTPUT_PATH} 46 | 47 | cp -r ${PROJECT_PATH}/src ${TEST_OUTPUT_PATH} 48 | cp -r ${PROJECT_PATH}/run.sh ${TEST_OUTPUT_PATH} 49 | cp -r ${PROJECT_PATH}/input ${TEST_OUTPUT_PATH} 50 | cp -r ${PROJECT_PATH}/output ${TEST_OUTPUT_PATH} 51 | 52 | rm -r ${TEST_OUTPUT_PATH}/input/* 53 | rm -r ${TEST_OUTPUT_PATH}/output/* 54 | cp -r ${GRADER_ROOT}/tests/${test_folder}/input/h1b_input.csv ${TEST_OUTPUT_PATH}/input/h1b_input.csv 55 | } 56 | 57 | function compare_outputs { 58 | NUM_OUTPUT_FILES_PASSED=0 59 | OUTPUT_FILENAME1=top_10_occupations.txt 60 | PROJECT_ANSWER_PATH1=${GRADER_ROOT}/temp/output/${OUTPUT_FILENAME1} 61 | TEST_ANSWER_PATH1=${GRADER_ROOT}/tests/${test_folder}/output/${OUTPUT_FILENAME1} 62 | 63 | DIFF_RESULT1=$(diff -bB ${PROJECT_ANSWER_PATH1} ${TEST_ANSWER_PATH1} | wc -l) 64 | if [ "${DIFF_RESULT1}" -eq "0" ] && [ -f ${PROJECT_ANSWER_PATH1} ]; then 65 | echo -e "[${color_green}PASS${color_norm}]: ${test_folder} ${OUTPUT_FILENAME}" 66 | NUM_OUTPUT_FILES_PASSED=$(($NUM_OUTPUT_FILES_PASSED+1)) 67 | else 68 | echo -e "[${color_red}FAIL${color_norm}]: ${test_folder}" 69 | diff ${PROJECT_ANSWER_PATH1} ${TEST_ANSWER_PATH1} 70 | fi 71 | 72 | OUTPUT_FILENAME2=top_10_states.txt 73 | PROJECT_ANSWER_PATH2=${GRADER_ROOT}/temp/output/${OUTPUT_FILENAME2} 74 | TEST_ANSWER_PATH2=${GRADER_ROOT}/tests/${test_folder}/output/${OUTPUT_FILENAME2} 75 | 76 | DIFF_RESULT2=$(diff -bB ${PROJECT_ANSWER_PATH2} ${TEST_ANSWER_PATH2} | wc -l) 77 | if [ "${DIFF_RESULT2}" -eq "0" ] && [ -f ${PROJECT_ANSWER_PATH2} ]; then 78 | echo -e "[${color_green}PASS${color_norm}]: ${test_folder} ${OUTPUT_FILENAME2}" 79 | NUM_OUTPUT_FILES_PASSED=$(($NUM_OUTPUT_FILES_PASSED+1)) 80 | else 81 | echo -e "[${color_red}FAIL${color_norm}]: ${test_folder}" 82 | diff ${PROJECT_ANSWER_PATH2} ${TEST_ANSWER_PATH2} 83 | fi 84 | 85 | if [ "${NUM_OUTPUT_FILES_PASSED}" -eq "2" ]; then 86 | PASS_CNT=$(($PASS_CNT+1)) 87 | fi 88 | 89 | } 90 | 91 | function run_all_tests { 92 | TEST_FOLDERS=$(ls ${GRADER_ROOT}/tests) 93 | NUM_TESTS=$(($(echo $(echo ${TEST_FOLDERS} | wc -w)))) 94 | PASS_CNT=0 95 | 96 | # Loop through all tests 97 | for test_folder in ${TEST_FOLDERS}; do 98 | 99 | setup_testing_input_output 100 | 101 | cd ${GRADER_ROOT}/temp 102 | bash run.sh 2>&1 103 | cd ../ 104 | 105 | compare_outputs 106 | done 107 | 108 | echo "[$(date)] ${PASS_CNT} of ${NUM_TESTS} tests passed" 109 | echo "[$(date)] ${PASS_CNT} of ${NUM_TESTS} tests passed" >> ${GRADER_ROOT}/results.txt 110 | } 111 | 112 | check_project_struct 113 | run_all_tests 114 | -------------------------------------------------------------------------------- /insight_testsuite/tests/test_1/input/h1b_input.csv: -------------------------------------------------------------------------------- 1 | 
;CASE_NUMBER;CASE_STATUS;CASE_SUBMITTED;DECISION_DATE;VISA_CLASS;EMPLOYMENT_START_DATE;EMPLOYMENT_END_DATE;EMPLOYER_NAME;EMPLOYER_BUSINESS_DBA;EMPLOYER_ADDRESS;EMPLOYER_CITY;EMPLOYER_STATE;EMPLOYER_POSTAL_CODE;EMPLOYER_COUNTRY;EMPLOYER_PROVINCE;EMPLOYER_PHONE;EMPLOYER_PHONE_EXT;AGENT_REPRESENTING_EMPLOYER;AGENT_ATTORNEY_NAME;AGENT_ATTORNEY_CITY;AGENT_ATTORNEY_STATE;JOB_TITLE;SOC_CODE;SOC_NAME;NAICS_CODE;TOTAL_WORKERS;NEW_EMPLOYMENT;CONTINUED_EMPLOYMENT;CHANGE_PREVIOUS_EMPLOYMENT;NEW_CONCURRENT_EMP;CHANGE_EMPLOYER;AMENDED_PETITION;FULL_TIME_POSITION;PREVAILING_WAGE;PW_UNIT_OF_PAY;PW_WAGE_LEVEL;PW_SOURCE;PW_SOURCE_YEAR;PW_SOURCE_OTHER;WAGE_RATE_OF_PAY_FROM;WAGE_RATE_OF_PAY_TO;WAGE_UNIT_OF_PAY;H1B_DEPENDENT;WILLFUL_VIOLATOR;SUPPORT_H1B;LABOR_CON_AGREE;PUBLIC_DISCLOSURE_LOCATION;WORKSITE_CITY;WORKSITE_COUNTY;WORKSITE_STATE;WORKSITE_POSTAL_CODE;ORIGINAL_CERT_DATE 2 | 0;I-200-18026-338377;CERTIFIED;2018-01-29;2018-02-02;H-1B;2018-07-28;2021-07-27;MICROSOFT CORPORATION;;1 MICROSOFT WAY;REDMOND;WA;98052;UNITED STATES OF AMERICA;;4258828080;;N;",";;;SOFTWARE ENGINEER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";51121.0;1;0;1;0;0;0;0;Y;112549.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;143915.0;0.0;Year;N;N;;;;REDMOND;KING;WA;98052; 3 | 1;I-200-17296-353451;CERTIFIED;2017-10-23;2017-10-27;H-1B;2017-11-06;2020-11-06;ERNST & YOUNG U.S. LLP;;200 PLAZA DRIVE;SECAUCUS;NJ;07094;UNITED STATES OF AMERICA;;2018723003;;Y;"BRADSHAW, MELANIE";TORONTO;;TAX SENIOR;13-2011;ACCOUNTANTS AND AUDITORS;541211.0;1;0;0;0;0;1;0;Y;79976.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;100000.0;0.0;Year;N;N;;;;SANTA CLARA;SAN JOSE;CA;95110; 4 | 2;I-200-18242-524477;CERTIFIED;2018-08-30;2018-09-06;H-1B;2018-09-10;2021-09-09;LOGIXHUB LLC;;320 DECKER DRIVE;IRVING;TX;75062;UNITED STATES OF AMERICA;;2145419305;;N;",";;;DATABASE ADMINISTRATOR;15-1141;DATABASE ADMINISTRATORS;541511.0;1;0;0;0;0;1;0;Y;77792.0;Year;Level II;OES;2018.0;OFLC ONLINE DATA CENTER;78240.0;0.0;Year;N;N;;;;IRVING;DALLAS;TX;75062; 5 | 3;I-200-18070-575236;CERTIFIED;;2018-03-30;H-1B;2018-09-10;2021-09-09;"HEXAWARE TECHNOLOGIES, INC.";;101 WOOD AVENUE SOUTH;ISELIN;NJ;08830;UNITED STATES OF AMERICA;;6094096950;;Y;"DUTOT, CHRISTOPHER";TROY;MI;SOFTWARE ENGINEER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;5;5;0;0;0;0;0;Y;84406.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;84406.0;85000.0;Year;Y;N;Y;;;NEW CASTLE;NEW CASTLE;DE;19720; 6 | 4;I-200-18243-850522;CERTIFIED;2018-08-31;2018-09-07;H-1B;2018-09-07;2021-09-06;"ECLOUD LABS,INC.";;120 S WOOD AVENUE;ISELIN;NJ;08830;UNITED STATES OF AMERICA;;7327501323;;Y;"ALLEN, THOMAS";EDISON;NJ;MICROSOFT DYNAMICS CRM APPLICATION DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;0;1;Y;87714.0;Year;Level III;OES;2018.0;OFLC ONLINE DATA CENTER;95000.0;0.0;Year;Y;N;Y;Y;;BIRMINGHAM;SHELBY;AL;35244; 7 | 5;I-200-18142-939501;CERTIFIED;2018-05-22;2018-05-29;H-1B;2018-05-29;2021-05-28;OBERON IT;;1404 W WALNUT HILL LN;IRVING;TX;75038;UNITED STATES OF AMERICA;;8666609190;;Y;"GARRITSON, JAMES";RICHARDSON;TX;SENIOR SYSTEM ARCHITECT;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;0;1;Y;71864.0;Year;Level II;Other;2017.0;OFLC ONLINE DATA CENTER;74000.0;0.0;Year;Y;N;Y;;;SUNRISE;BROWARD;FL;33323; 8 | 6;I-200-18121-552858;CERTIFIED;2018-05-01;2018-05-07;H-1B;2018-05-02;2018-10-26;ICONSOFT INC.;;101 CAMBRIDGE STREET SUITE 360;BURLINGTON;MA;01803;UNITED STATES OF AMERICA;;8882054614;1;N;",";;;SENIOR ORACLE ADF DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, 
APPLICATIONS";541511.0;1;0;1;0;0;0;0;Y;92331.0;Year;Level III;Other;2017.0;OFLC ONLINE DATA CENTER;114000.0;0.0;Year;Y;N;Y;;;JACKSONVILLE;DUVAL COUNTY;FL;32202; 9 | 7;I-200-18215-849606;CERTIFIED;2018-08-03;2018-08-09;H-1B;2018-08-11;2021-08-11;COGNIZANT TECHNOLOGY SOLUTIONS US CORP;;211 QUALITY CIRCLE;COLLEGE STATION;TX;77845;UNITED STATES OF AMERICA;;2019661249;;N;",";;;SENIOR SYSTEMS ANALYST JC60;15-1121;COMPUTER SYSTEMS ANALYST;541512.0;1;0;1;0;0;0;0;Y;80579.0;Year;Level II;OES;2018.0;OFLC ONLINE DATA CENTER;80579.0;0.0;Year;Y;N;Y;;;OWINGS MILLS;BALTIMORE;MD;21117; 10 | 8;I-201-17339-472823;CERTIFIED;2017-12-08;2017-12-14;H-1B1 Chile;2017-12-08;2019-06-07;ISHI SYSTEMS INC;;185 HUDSON STREET;JERSEY CITY;NJ;07311;UNITED STATES OF AMERICA;;2013326900;;N;",";;;ASSOCIATE PRODUCT MANAGER(15-1199.09);15-1199;"COMPUTER OCCUPATIONS, ALL OTHER";541511.0;1;0;1;0;0;0;0;Y;88317.0;Year;Level III;OES;2017.0;OFLC ONLINE DATA CENTER;90000.0;0.0;Year;;;;;;JERSEY CITY;HUDSON;NJ;07311; 11 | 9;I-200-18233-239931;CERTIFIED;2018-08-21;2018-08-27;H-1B;2018-09-05;2021-09-04;"WB SOLUTIONS, LLC";;7320 E FLETCHER AVE;TAMPA;FL;33637;UNITED STATES OF AMERICA;;8133300099;;Y;"KIDAMBI, VAMAN";TRUMBULL;CT;SENIOR JAVA DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;1;0;Y;104790.0;Year;Level III;OES;2018.0;OFLC ONLINE DATA CENTER;105000.0;0.0;Year;Y;N;Y;Y;;ALPHARETTA;FULTON;GA;30005; 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 1. [Problem](README.md#problem) 3 | 2. [Input Dataset](README.md#input-dataset) 4 | 3. [Instructions](README.md#instructions) 5 | 4. [Output](README.md#output) 6 | 5. [Tips on getting an interview](README.md#tips-on-getting-an-interview) 7 | 6. [Instructions to submit your solution](README.md#instructions-to-submit-your-solution) 8 | 7. [FAQ](README.md#faq) 9 | 8. [Questions?](README.md#questions?) 10 | 11 | # Problem 12 | 13 | A newspaper editor was researching immigration data trends on H1B(H-1B, H-1B1, E-3) visa application processing over the past years, trying to identify the occupations and states with the most number of approved H1B visas. She has found statistics available from the US Department of Labor and its [Office of Foreign Labor Certification Performance Data](https://www.foreignlaborcert.doleta.gov/performancedata.cfm#dis). But while there are ready-made reports for [2018](https://www.foreignlaborcert.doleta.gov/pdf/PerformanceData/2018/H-1B_Selected_Statistics_FY2018_Q4.pdf) and [2017](https://www.foreignlaborcert.doleta.gov/pdf/PerformanceData/2017/H-1B_Selected_Statistics_FY2017.pdf), the site doesn’t have them for past years. 14 | 15 | As a data engineer, you are asked to create a mechanism to analyze past years data, specificially calculate two metrics: **Top 10 Occupations** and **Top 10 States** for **certified** visa applications. 16 | 17 | Your code should be modular and reusable for future. If the newspaper gets data for the year 2019 (with the assumption that the necessary data to calculate the metrics are available) and puts it in the `input` directory, running the `run.sh` script should produce the results in the `output` folder without needing to change the code. 
18 | 19 | # Input Dataset 20 | 21 | Raw data can be found [here](https://www.foreignlaborcert.doleta.gov/performancedata.cfm) under the __Disclosure Data__ tab (i.e., files listed in the __Disclosure File__ column with ".xlsx" extension). 22 | For your convenience, we converted the Excel files into a semicolon-separated (";") format and placed them in this Google Drive [folder](https://drive.google.com/drive/folders/1Nti6ClUfibsXSQw5PUIWfVGSIrpuwyxf?usp=sharing). However, do not feel limited to testing your code only on the files we've provided on the Google Drive. 23 | 24 | **Note:** Each year of data can have different columns. Check the **File Structure** docs before development. 25 | 26 | # Instructions 27 | 28 | We designed this coding challenge to assess your coding skills and your understanding of computer science fundamentals. Both are prerequisites for becoming a data engineer. To solve this challenge, you may pick a programming language of your choice (preferably Python, Scala, Java, or C/C++ because they are commonly used and will help us better assess you), but you are only allowed to use the default data structures that come with that programming language (you may use I/O and other standard libraries). For example, you can code in Python, **but you should not use Pandas or other external libraries**. 29 | 30 | ***The objective here is to see if you can implement the solution using basic data structure building blocks and software engineering best practices (by writing clean, modular, and well-tested code).*** 31 | 32 | # Output 33 | 34 | Your program must create 2 output files: 35 | * `top_10_occupations.txt`: Top 10 occupations for certified visa applications 36 | * `top_10_states.txt`: Top 10 states for certified visa applications 37 | 38 | Each line holds one record, and each field on each line is separated by a semicolon (;). 39 | 40 | Each line of the `top_10_occupations.txt` file should contain these fields in this order: 41 | 1. __`TOP_OCCUPATIONS`__: Use the occupation name associated with an application's Standard Occupational Classification (SOC) code 42 | 2. __`NUMBER_CERTIFIED_APPLICATIONS`__: Number of applications that have been certified for that occupation. An application is considered certified if it has a case status of `Certified` 43 | 3. __`PERCENTAGE`__: % of applications that have been certified for that occupation compared to the total number of certified applications regardless of occupation. 44 | 45 | The records in the file must be sorted by __`NUMBER_CERTIFIED_APPLICATIONS`__ in descending order and, in case of a tie, alphabetically by __`TOP_OCCUPATIONS`__. 46 | 47 | Each line of the `top_10_states.txt` file should contain these fields in this order: 48 | 1. __`TOP_STATES`__: State where the work will take place 49 | 2. __`NUMBER_CERTIFIED_APPLICATIONS`__: Number of applications that have been certified for work in that state. An application is considered certified if it has a case status of `Certified` 50 | 3. __`PERCENTAGE`__: % of applications that have been certified in that state compared to the total number of certified applications regardless of state. 51 | 52 | The records in this file must be sorted by the __`NUMBER_CERTIFIED_APPLICATIONS`__ field in descending order and, in case of a tie, alphabetically by __`TOP_STATES`__. 53 | 54 | Depending on the input (e.g., see the example below), there may be fewer than 10 lines in each file. There should, however, be no more than 10 lines in each file. In case of ties, only list the top 10 based on the sorting instructions given above.
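As an illustration only (not the required implementation), the ordering and truncation rules above, together with the rounding rule stated in the next paragraph, might be written like this. Using `decimal` with `ROUND_HALF_UP` is a deliberate assumption worth noting: Python's built-in `round` applies round-half-to-even, which does not guarantee the half-up behaviour this spec asks for.

```python
# Hypothetical helpers (illustration only). `counts` is assumed to be a
# dict or Counter mapping an occupation or state name to its number of
# certified applications.
from decimal import Decimal, ROUND_HALF_UP


def top_10(counts):
    # Count descending, then name ascending, and keep at most 10 rows.
    return sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))[:10]


def percentage(count, total):
    # Round half up to one decimal place, e.g. 1.05 -> "1.1%", 1 -> "1.0%".
    pct = Decimal(count) * 100 / Decimal(total)
    return f"{pct.quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)}%"


def write_report(path, header, counts):
    # Assumes at least one certified application, so `total` is non-zero.
    total = sum(counts.values())
    with open(path, "w") as out:
        out.write(header + "\n")
        for name, count in top_10(counts):
            out.write(f"{name};{count};{percentage(count, total)}\n")
```

With this shape, `write_report` would be called once with the occupation counts and the `TOP_OCCUPATIONS` header, and once with the state counts and the `TOP_STATES` header.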
55 | 56 | Percentages also should be rounded off to 1 decimal place. For instance, 1.05% should be rounded to 1.1% and 1.04% should be rounded to 1.0%. Also, 1% should be represented by 1.0% 57 | 58 | ## Example 59 | If you are given the input file, `./input/h1b_input.csv` with the following data: 60 | ``` 61 | ;CASE_NUMBER;CASE_STATUS;CASE_SUBMITTED;DECISION_DATE;VISA_CLASS;EMPLOYMENT_START_DATE;EMPLOYMENT_END_DATE;EMPLOYER_NAME;EMPLOYER_BUSINESS_DBA;EMPLOYER_ADDRESS;EMPLOYER_CITY;EMPLOYER_STATE;EMPLOYER_POSTAL_CODE;EMPLOYER_COUNTRY;EMPLOYER_PROVINCE;EMPLOYER_PHONE;EMPLOYER_PHONE_EXT;AGENT_REPRESENTING_EMPLOYER;AGENT_ATTORNEY_NAME;AGENT_ATTORNEY_CITY;AGENT_ATTORNEY_STATE;JOB_TITLE;SOC_CODE;SOC_NAME;NAICS_CODE;TOTAL_WORKERS;NEW_EMPLOYMENT;CONTINUED_EMPLOYMENT;CHANGE_PREVIOUS_EMPLOYMENT;NEW_CONCURRENT_EMP;CHANGE_EMPLOYER;AMENDED_PETITION;FULL_TIME_POSITION;PREVAILING_WAGE;PW_UNIT_OF_PAY;PW_WAGE_LEVEL;PW_SOURCE;PW_SOURCE_YEAR;PW_SOURCE_OTHER;WAGE_RATE_OF_PAY_FROM;WAGE_RATE_OF_PAY_TO;WAGE_UNIT_OF_PAY;H1B_DEPENDENT;WILLFUL_VIOLATOR;SUPPORT_H1B;LABOR_CON_AGREE;PUBLIC_DISCLOSURE_LOCATION;WORKSITE_CITY;WORKSITE_COUNTY;WORKSITE_STATE;WORKSITE_POSTAL_CODE;ORIGINAL_CERT_DATE 62 | 0;I-200-18026-338377;CERTIFIED;2018-01-29;2018-02-02;H-1B;2018-07-28;2021-07-27;MICROSOFT CORPORATION;;1 MICROSOFT WAY;REDMOND;WA;98052;UNITED STATES OF AMERICA;;4258828080;;N;",";;;SOFTWARE ENGINEER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";51121.0;1;0;1;0;0;0;0;Y;112549.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;143915.0;0.0;Year;N;N;;;;REDMOND;KING;WA;98052; 63 | 1;I-200-17296-353451;CERTIFIED;2017-10-23;2017-10-27;H-1B;2017-11-06;2020-11-06;ERNST & YOUNG U.S. LLP;;200 PLAZA DRIVE;SECAUCUS;NJ;07094;UNITED STATES OF AMERICA;;2018723003;;Y;"BRADSHAW, MELANIE";TORONTO;;TAX SENIOR;13-2011;ACCOUNTANTS AND AUDITORS;541211.0;1;0;0;0;0;1;0;Y;79976.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;100000.0;0.0;Year;N;N;;;;SANTA CLARA;SAN JOSE;CA;95110; 64 | 2;I-200-18242-524477;CERTIFIED;2018-08-30;2018-09-06;H-1B;2018-09-10;2021-09-09;LOGIXHUB LLC;;320 DECKER DRIVE;IRVING;TX;75062;UNITED STATES OF AMERICA;;2145419305;;N;",";;;DATABASE ADMINISTRATOR;15-1141;DATABASE ADMINISTRATORS;541511.0;1;0;0;0;0;1;0;Y;77792.0;Year;Level II;OES;2018.0;OFLC ONLINE DATA CENTER;78240.0;0.0;Year;N;N;;;;IRVING;DALLAS;TX;75062; 65 | 3;I-200-18070-575236;CERTIFIED;;2018-03-30;H-1B;2018-09-10;2021-09-09;"HEXAWARE TECHNOLOGIES, INC.";;101 WOOD AVENUE SOUTH;ISELIN;NJ;08830;UNITED STATES OF AMERICA;;6094096950;;Y;"DUTOT, CHRISTOPHER";TROY;MI;SOFTWARE ENGINEER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;5;5;0;0;0;0;0;Y;84406.0;Year;Level II;OES;2017.0;OFLC ONLINE DATA CENTER;84406.0;85000.0;Year;Y;N;Y;;;NEW CASTLE;NEW CASTLE;DE;19720; 66 | 4;I-200-18243-850522;CERTIFIED;2018-08-31;2018-09-07;H-1B;2018-09-07;2021-09-06;"ECLOUD LABS,INC.";;120 S WOOD AVENUE;ISELIN;NJ;08830;UNITED STATES OF AMERICA;;7327501323;;Y;"ALLEN, THOMAS";EDISON;NJ;MICROSOFT DYNAMICS CRM APPLICATION DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;0;1;Y;87714.0;Year;Level III;OES;2018.0;OFLC ONLINE DATA CENTER;95000.0;0.0;Year;Y;N;Y;Y;;BIRMINGHAM;SHELBY;AL;35244; 67 | 5;I-200-18142-939501;CERTIFIED;2018-05-22;2018-05-29;H-1B;2018-05-29;2021-05-28;OBERON IT;;1404 W WALNUT HILL LN;IRVING;TX;75038;UNITED STATES OF AMERICA;;8666609190;;Y;"GARRITSON, JAMES";RICHARDSON;TX;SENIOR SYSTEM ARCHITECT;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;0;1;Y;71864.0;Year;Level II;Other;2017.0;OFLC ONLINE DATA 
CENTER;74000.0;0.0;Year;Y;N;Y;;;SUNRISE;BROWARD;FL;33323; 68 | 6;I-200-18121-552858;CERTIFIED;2018-05-01;2018-05-07;H-1B;2018-05-02;2018-10-26;ICONSOFT INC.;;101 CAMBRIDGE STREET SUITE 360;BURLINGTON;MA;01803;UNITED STATES OF AMERICA;;8882054614;1;N;",";;;SENIOR ORACLE ADF DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;1;0;0;0;0;Y;92331.0;Year;Level III;Other;2017.0;OFLC ONLINE DATA CENTER;114000.0;0.0;Year;Y;N;Y;;;JACKSONVILLE;DUVAL COUNTY;FL;32202; 69 | 7;I-200-18215-849606;CERTIFIED;2018-08-03;2018-08-09;H-1B;2018-08-11;2021-08-11;COGNIZANT TECHNOLOGY SOLUTIONS US CORP;;211 QUALITY CIRCLE;COLLEGE STATION;TX;77845;UNITED STATES OF AMERICA;;2019661249;;N;",";;;SENIOR SYSTEMS ANALYST JC60;15-1121;COMPUTER SYSTEMS ANALYST;541512.0;1;0;1;0;0;0;0;Y;80579.0;Year;Level II;OES;2018.0;OFLC ONLINE DATA CENTER;80579.0;0.0;Year;Y;N;Y;;;OWINGS MILLS;BALTIMORE;MD;21117; 70 | 8;I-201-17339-472823;CERTIFIED;2017-12-08;2017-12-14;H-1B1 Chile;2017-12-08;2019-06-07;ISHI SYSTEMS INC;;185 HUDSON STREET;JERSEY CITY;NJ;07311;UNITED STATES OF AMERICA;;2013326900;;N;",";;;ASSOCIATE PRODUCT MANAGER(15-1199.09);15-1199;"COMPUTER OCCUPATIONS, ALL OTHER";541511.0;1;0;1;0;0;0;0;Y;88317.0;Year;Level III;OES;2017.0;OFLC ONLINE DATA CENTER;90000.0;0.0;Year;;;;;;JERSEY CITY;HUDSON;NJ;07311; 71 | 9;I-200-18233-239931;CERTIFIED;2018-08-21;2018-08-27;H-1B;2018-09-05;2021-09-04;"WB SOLUTIONS, LLC";;7320 E FLETCHER AVE;TAMPA;FL;33637;UNITED STATES OF AMERICA;;8133300099;;Y;"KIDAMBI, VAMAN";TRUMBULL;CT;SENIOR JAVA DEVELOPER;15-1132;"SOFTWARE DEVELOPERS, APPLICATIONS";541511.0;1;0;0;0;0;1;0;Y;104790.0;Year;Level III;OES;2018.0;OFLC ONLINE DATA CENTER;105000.0;0.0;Year;Y;N;Y;Y;;ALPHARETTA;FULTON;GA;30005; 72 | ``` 73 | 74 | then your output files would be: 75 | 76 | `./output/top_10_occupations.txt`: 77 | ``` 78 | TOP_OCCUPATIONS;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE 79 | SOFTWARE DEVELOPERS, APPLICATIONS;6;60.0% 80 | ACCOUNTANTS AND AUDITORS;1;10.0% 81 | COMPUTER OCCUPATIONS, ALL OTHER;1;10.0% 82 | COMPUTER SYSTEMS ANALYST;1;10.0% 83 | DATABASE ADMINISTRATORS;1;10.0% 84 | ``` 85 | `./output/top_10_states.txt`: 86 | ``` 87 | TOP_STATES;NUMBER_CERTIFIED_APPLICATIONS;PERCENTAGE 88 | FL;2;20.0% 89 | AL;1;10.0% 90 | CA;1;10.0% 91 | DE;1;10.0% 92 | GA;1;10.0% 93 | MD;1;10.0% 94 | NJ;1;10.0% 95 | TX;1;10.0% 96 | WA;1;10.0% 97 | ``` 98 | 99 | # Tips on getting an interview 100 | 101 | ## What we are looking at 102 | As a data engineer, it’s important that you write clean, well-tested, well-documented code that scales to a large amount of data. For this reason, make sure that your solution works well for a large number of records. 103 | Your solution should satisfy the following requirements: 104 | * The repo follows the required directory structure 105 | * The `run.sh` script works as is in our environment and generates correct results. If your code needs to be compiled before being executed, you must modify this script to both compile and execute your code 106 | * The code is well-commented 107 | * `README.md` contains Problem, Approach, and Run Instructions sections 108 | 109 | You may write your solution in any mainstream programming language, such as C, C++, C#, Go, Java, Python, Ruby, or Scala. 110 | Once your solution satisfies all requirements listed above, submit a link to your GitHub or Bitbucket repo with your source code.
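In the spirit of the "well-tested" requirement above, a small standard-library unit test is one way to lock in the sorting and rounding behaviour. The helpers below are the same hypothetical ones sketched in the Output section, redefined here only so the snippet is self-contained; in a real repo they would be imported from your module under `src`.

```python
# Hypothetical unit test (illustration only) for the ranking and
# percentage-formatting helpers sketched in the Output section.
import unittest
from decimal import Decimal, ROUND_HALF_UP


def top_10(counts):
    return sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))[:10]


def percentage(count, total):
    pct = Decimal(count) * 100 / Decimal(total)
    return f"{pct.quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)}%"


class ReportTests(unittest.TestCase):
    def test_ties_break_alphabetically(self):
        counts = {"FL": 2, "CA": 1, "AL": 1}
        self.assertEqual(top_10(counts), [("FL", 2), ("AL", 1), ("CA", 1)])

    def test_percentage_rounds_half_up_to_one_decimal(self):
        self.assertEqual(percentage(2, 10), "20.0%")    # matches the example
        self.assertEqual(percentage(21, 2000), "1.1%")  # 1.05% rounds up


if __name__ == "__main__":
    unittest.main()
```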
111 | 112 | 113 | ## Repo directory structure 114 | 115 | The directory structure for your repo should look like this: 116 | ``` 117 | ├── README.md 118 | ├── run.sh 119 | ├── src 120 | │ └──h1b_counting.py 121 | ├── input 122 | │ └──h1b_input.csv 123 | ├── output 124 | | └── top_10_occupations.txt 125 | | └── top_10_states.txt 126 | ├── insight_testsuite 127 | └── run_tests.sh 128 | └── tests 129 | └── test_1 130 | | ├── input 131 | | │ └── h1b_input.csv 132 | | |__ output 133 | | | └── top_10_occupations.txt 134 | | | └── top_10_states.txt 135 | ├── your-own-test_1 136 | ├── input 137 | │ └── h1b_input.csv 138 | |── output 139 | | | └── top_10_occupations.txt 140 | | | └── top_10_states.txt 141 | ``` 142 | **Don't fork this repo** and don't use this `README` instead of your own. The content of `src` does not need to be a single file called `h1b_counting.py`, which is only an example. Instead, you should include your own source files and give them expressive names. 143 | 144 | ## Testing your directory structure and output format 145 | 146 | To make sure that your code has the correct directory structure and that the format of the output files is correct, we have included a test script called `run_tests.sh` in the `insight_testsuite` folder. 147 | 148 | The test files are stored in `.csv` format under the `insight_testsuite/tests` folder. Each test should have a separate folder containing an `input` folder with an `h1b_input.csv` file and an `output` folder with the two requested output files. 149 | 150 | You can run the test with the following command from within the `insight_testsuite` folder: 151 | 152 | insight_testsuite~$ ./run_tests.sh 153 | 154 | On a failed test, the output of `run_tests.sh` should look like: 155 | 156 | [FAIL]: test_1 157 | [Thu Mar 30 16:28:01 PDT 2017] 0 of 1 tests passed 158 | 159 | On success: 160 | 161 | [PASS]: test_1 162 | [Thu Mar 30 16:25:57 PDT 2017] 1 of 1 tests passed 163 | 164 | 165 | One test has been provided as a way to check your formatting and simulate how we will be running tests when you submit your solution. We urge you to write your own additional tests. `test_1` is only intended to alert you if the directory structure or the output for this test is incorrect. 166 | 167 | Your submission must pass at least the provided test in order to pass the coding challenge. 168 | 169 | For a limited time, we are also making available a website that will allow you to simulate the environment in which we will test your code. It has been primarily tested on Python code but could be used for Java and C++ repos. Keep in mind that if you need to compile your code (e.g., javac, make), that compilation needs to happen in the run.sh file of your code repository. For Python programmers, you may use Python 2 or Python 3, but if you use the latter, specify python3 in your run.sh script.
170 | 171 | # Instructions to submit your solution 172 | * To submit your entry, please use the link you received in your coding challenge invite email 173 | * You will only be able to submit through the link one time 174 | * Do NOT attach a file; we will not accept solutions that are attached as files 175 | * Use the submission box to enter the link to your GitHub or Bitbucket repo ONLY 176 | * Link to the specific repo for this project, not your general profile 177 | * Put any comments in the `README.md` inside your project repo, not in the submission box 178 | * We are unable to accept coding challenges that are emailed to us 179 | 180 | # FAQ 181 | 182 | **Which GitHub link should I submit?** 183 | You should submit the URL for the top-level root of your repository. For example, this repo would be submitted by copying the URL https://github.com/InsightDataScience/h1b_statistics into the appropriate field on the application. Do NOT try to submit your coding challenge using a pull request, which would make your source code publicly available. 184 | 185 | **Do I need a private GitHub repo?** 186 | No, you may use a public repo; there is no need to purchase a private repo. You may also submit a link to a Bitbucket repo if you prefer. 187 | 188 | **May I use R, Matlab, or other analytics programming languages to solve the challenge?** 189 | No. It's important that your implementation scales to handle large amounts of data. While many of our Fellows have experience with R and Matlab, applicants have found that these languages are unable to process data in a scalable fashion, so you must consider another language. 190 | 191 | **May I use distributed technologies like Hadoop or Spark?** 192 | No. Your code will be tested on a single machine, so using these technologies will negatively impact your solution. We're not testing your knowledge of distributed computing, but rather your grasp of computer science fundamentals and software engineering best practices. 193 | 194 | **What sort of system should I use to run my program on (Windows, Linux, Mac)?** 195 | You may write your solution on any system, but your source code should be portable and work on all systems. Additionally, your run.sh must be able to run on either Unix or Linux, as that's the system that will be used for testing. Linux machines are the industry standard for most data engineering teams, so it is helpful to be familiar with them. If you're currently using Windows, we recommend installing a virtual Unix environment, such as VirtualBox or VMWare, and using that to develop your code. Otherwise, you could also use tools such as Cygwin or Docker, or a free online IDE such as Cloud9. 196 | 197 | **How fast should my program run?** 198 | While there are no strict performance guidelines for this coding challenge, we will consider the amount of time your program takes when grading the challenge. Therefore, you should design and develop your program in the optimal way (i.e., think about time and space complexity instead of trying to hit a specific runtime value). 199 | 200 | **Will you email me if my code doesn't run?** 201 | Unfortunately, we receive hundreds of submissions in a very short time and are unable to email individuals if their code doesn't compile or run. We will do everything we can to properly test your code, but this requires good documentation. Moreover, we have provided a test suite so you can confirm that your directory structure and format are correct.
202 | 203 | **Can I use a database engine?** 204 | While a database engine can be used to complete this coding challenge, we are looking to see how well you program, so please do not submit code that relies on a database engine for this challenge. 205 | 206 | **What should the format of the output be?** 207 | In order to be tested correctly, you must use the format described above. You can ensure that you have the correct format by using the testing suite we've included. 208 | 209 | **Should I check if the files in the input directory are text files or non-text (binary) files?** 210 | No, for simplicity you may assume that all of the files in the input directory are text files, with the format as described above. 211 | 212 | **Can I use an IDE like Eclipse or IntelliJ to write my program?** 213 | Yes, you can use whatever tools you want, as long as your run.sh script correctly runs the relevant target files and creates the expected files in the output directory. 214 | 215 | **What should be in the input directory?** 216 | You can put any text file you want in the directory, since our testing suite will replace it. Indeed, using your own input files would be quite useful for testing. The file size limit on GitHub is 100 MB, so you won't be able to include the larger sample input files in your input directory. 217 | 218 | **How long will it take for me to hear back from you about my submission?** 219 | We receive hundreds of submissions and try to evaluate them all in a timely manner. We try to get back to all applicants within two or three weeks of submission, but if you have a specific deadline that requires expedited review, please email us at cc@insightdataengineering.com. 220 | 221 | # Questions? 222 | Re-read this README first, and if you can't find an answer to your question, email us at cc@insightdataengineering.com. 223 | --------------------------------------------------------------------------------