# Repository: Python-Address-Filter (README.md contains only that title).
#
# Repository layout:
#   README.md
#   address_sample_ 2.xlsx   (training workbook)
#   input.xlsx               (addresses to clean)
#   main.py                  (this file)
#   output.xlsx              (written by this script)
#   start.bat                (runs: python main.py)
#
# Workbooks as referenced by the repository dump:
#   https://raw.githubusercontent.com/AppleMac-DH-TECH/Python-Address-Filter/54bda9e6badca9bda05ce294c4c8d6cdd2a86fb8/address_sample_ 2.xlsx
#   https://raw.githubusercontent.com/AppleMac-DH-TECH/Python-Address-Filter/54bda9e6badca9bda05ce294c4c8d6cdd2a86fb8/input.xlsx
#   https://raw.githubusercontent.com/AppleMac-DH-TECH/Python-Address-Filter/54bda9e6badca9bda05ce294c4c8d6cdd2a86fb8/output.xlsx

import os
import pandas as pd
from pandas import DataFrame

# Substrings learned by train() that filter() strips out of raw addresses.
cleaned = []


def _learn_fragments(src_addr: str, dst_addr: str, known: list) -> None:
    """Append to *known* the runs of *src_addr* that are missing from *dst_addr*.

    Walks *src_addr* character by character while advancing a cursor over
    *dst_addr* on every match. Characters that do not match accumulate into a
    fragment; the fragment is recorded (and printed) when the next matching
    character is found. Scanning stops once *dst_addr* is exhausted, so
    characters after the last match are never recorded.
    """
    fragment = ''
    cursor = 0
    for char in src_addr:
        if cursor >= len(dst_addr):
            break
        if char == dst_addr[cursor]:
            if fragment != '' and fragment not in known:
                known.append(fragment)
                print(fragment)
            fragment = ''
            cursor += 1
        else:
            fragment += char


def train(src_xlsx_path: str):
    """Learn removable substrings from a workbook of raw/cleaned address pairs.

    Reads sheet "Sheet1" of *src_xlsx_path* and compares each
    'EMP LOCATION' value with its 'CLEANED ADDRESS' value; the differing
    fragments are appended to the module-level ``cleaned`` list, which is
    printed at the end.

    Rows where either cell is not a string (for example an empty cell, which
    pandas yields as NaN) are skipped instead of raising.
    """
    df = pd.read_excel(src_xlsx_path, sheet_name="Sheet1")

    global cleaned

    for src_addr, dst_addr in zip(df['EMP LOCATION'], df['CLEANED ADDRESS']):
        if not isinstance(src_addr, str) or not isinstance(dst_addr, str):
            continue
        # Drop leading spaces only (not other whitespace); lstrip is safe on
        # empty and all-space strings, where manual index scanning is not.
        _learn_fragments(src_addr.lstrip(' '), dst_addr.lstrip(' '), cleaned)

    print(cleaned)


def filter(src_addr: str) -> str:
    """Return *src_addr* with every learned fragment removed.

    Returns '' (after printing a notice) when nothing has been learned yet,
    and '' when *src_addr* is not a string, so callers that skip empty
    results also skip blank cells.
    """
    global cleaned

    if not len(cleaned):
        print("first training!")
        return ''

    if not isinstance(src_addr, str):
        return ''

    dst_addr = src_addr
    # Fragments are removed in the order they were learned.
    for item in cleaned:
        dst_addr = dst_addr.replace(item, '')

    return dst_addr


def main() -> None:
    """Train on the sample workbook, clean input.xlsx, and write output.xlsx."""
    base_dir = os.getcwd()
    # os.path.join keeps the paths valid on non-Windows systems as well.
    train_xlsx_path = os.path.join(base_dir, "address_sample_ 2.xlsx")
    src_addr_path = os.path.join(base_dir, "input.xlsx")
    train(train_xlsx_path)

    df = pd.read_excel(src_addr_path, sheet_name="Sheet1")

    dst_addr_list = []
    for src_addr in df['EMP LOCATION']:
        dst_addr = filter(src_addr)
        # Empty results (untrained, blank cell, or fully stripped) are dropped.
        if dst_addr != '':
            dst_addr_list.append(dst_addr)

    out_df = DataFrame(dst_addr_list, columns=['CLEANED ADDRESS'])
    out_df.to_excel("output.xlsx", sheet_name='Sheet1')


if __name__ == "__main__":
    main()