from pandas import read_csv, set_option, merge
from numpy import mean
'display.max_columns', None) set_option(
Pre-Process Tax Data
Import Libraries
Reformat Data Dictionary Names
def data_dict_pre_process(filename: str):
= []
strip_variable_whitespace_list = read_csv(f'../../data/raw-data/{filename}.csv')
data_dict_df for idx, row in data_dict_df.iterrows():
= row['VARIABLE NAME'].strip()
variable
strip_variable_whitespace_list.append(variable)'VARIABLE NAME'] = strip_variable_whitespace_list
data_dict_df[
= data_dict_df['VARIABLE NAME'].tolist()
data_dict_df_variable_names
= data_dict_df_variable_names[:19]
tax_qualitative_variables = []
format_tax_qualitative_variables for code in tax_qualitative_variables:
format_tax_qualitative_variables.append(code)
= data_dict_df_variable_names[19:]
tax_numerical_variables = []
total_count_return_variables = []
total_amount_return_variables for code in tax_numerical_variables:
if code.startswith('N'):
total_count_return_variables.append(code)else:
total_amount_return_variables.append(code)
= format_tax_qualitative_variables + total_amount_return_variables
amount_return_variables_new = tax_qualitative_variables + total_count_return_variables
count_return_variables_new = data_dict_df['DESCRIPTION'].loc[data_dict_df['VARIABLE NAME'].isin(amount_return_variables_new)].tolist()
description_amount_data_dict_list = data_dict_df['DESCRIPTION'].loc[data_dict_df['VARIABLE NAME'].isin(count_return_variables_new)].tolist()
description_count_data_dict_list
= data_dict_df.loc[data_dict_df['VARIABLE NAME'].isin(amount_return_variables_new)]
amount_data_dict_df = data_dict_df.loc[data_dict_df['VARIABLE NAME'].isin(count_return_variables_new)]
count_data_dict_df '../../data/intermediate-data/data_dict_variable_total_amount.csv', index=False)
amount_data_dict_df.to_csv('../../data/intermediate-data/data_dict_variable_total_number_returns.csv', index=False) count_data_dict_df.to_csv(
Preprocess Tax Data
def tax_data_pre_process(filename: str, state: str):
= read_csv(f'../../data/raw-data/{filename}.csv').drop('agi_stub', axis=1)
tax_data_df = tax_data_df.loc[tax_data_df['STATE'] == f'{state}']
tax_data_df
= read_csv(f'../../data/intermediate-data/data_dict_variable_total_amount.csv')
amount_data_dict_df = amount_data_dict_df['VARIABLE NAME'].tolist()
amount_data_dict_df_variables_list = amount_data_dict_df['DESCRIPTION'].tolist()
description_amount_list = tax_data_df.loc[:, tax_data_df.columns.isin(amount_data_dict_df_variables_list)]
total_amount_tax_data_df = description_amount_list
total_amount_tax_data_df.columns = total_amount_tax_data_df.rename(
total_amount_tax_data_df ={
columns'5-digit Zip code': 'ZIPCODE'
}
)f'../../data/intermediate-data/{state}_total_amount_tax_data.csv', index=False)
total_amount_tax_data_df.to_csv(
= read_csv(f'../../data/intermediate-data/data_dict_variable_total_number_returns.csv')
number_data_dict_df = number_data_dict_df['VARIABLE NAME'].tolist()
number_data_dict_df_variables_list = number_data_dict_df['DESCRIPTION'].tolist()
description_number_list = tax_data_df.loc[:, tax_data_df.columns.isin(number_data_dict_df_variables_list)]
total_number_tax_data_df = description_number_list
total_number_tax_data_df.columns = total_number_tax_data_df.rename(
total_number_tax_data_df ={
columns'5-digit Zip code': 'ZIPCODE'
}
)f'../../data/intermediate-data/{state}_total_number_returns_tax_data.csv', index=False) total_number_tax_data_df.to_csv(
Preprocess EV Data
def ev_data_pre_process(state: str):
= read_csv('../../data/raw-data/Electric_Vehicle_Population_Data.csv').rename(columns={
ev_data_df 'Postal Code': 'ZIPCODE'
})'ZIPCODE'] = ev_data_df['ZIPCODE'].fillna(0).astype(int)
ev_data_df[= ev_data_df.isnull().sum()
null_values =True)
ev_data_df.dropna(inplace= ev_data_df.loc[(ev_data_df['Model Year'] <= 2020) & (ev_data_df['Electric Range'] > 0) & (ev_data_df['Model Year'] > 2010) & (ev_data_df['State'] == f'{state}')]
ev_data_df_2020 = ev_data_df[ev_data_df.duplicated()]
ev_duplicated_df
= ev_data_df_2020.groupby(['City', 'State', 'ZIPCODE', 'Model Year', 'Make', 'Model', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility']).agg({
zip_code_count_brand_model_year 'Electric Range': 'mean',
'County': 'count',
}).reset_index()= ev_data_df_2020.groupby(['City', 'State', 'ZIPCODE', 'Make', 'Model', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility']).agg({
zip_code_count_brand_model 'Electric Range': 'mean',
'County': 'count',
}).reset_index()= ev_data_df_2020.groupby(['City', 'State', 'ZIPCODE', 'Make', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility']).agg({
zip_code_count_brand 'Electric Range': 'mean',
'County': 'count',
}).reset_index()f'../../data/intermediate-data/{state}_zip_code_brand_model_year.csv', index=False)
zip_code_count_brand_model_year.to_csv(f'../../data/intermediate-data/{state}_zip_code_brand_model.csv', index=False)
zip_code_count_brand_model.to_csv(f'../../data/intermediate-data/{state}_zip_code_brand.csv', index=False) zip_code_count_brand.to_csv(
Join Tax/EV Data
def join_tax_ev_data(state: str):
= read_csv(f'../../data/intermediate-data/{state}_zip_code_brand_model_year.csv')
zip_code_count_brand_model_year = read_csv(f'../../data/intermediate-data/{state}_zip_code_brand_model.csv')
zip_code_count_brand_model= read_csv(f'../../data/intermediate-data/{state}_zip_code_brand.csv')
zip_code_count_brand = read_csv(f'../../data/intermediate-data/{state}_total_amount_tax_data.csv')
amount_tax_data
= merge(zip_code_count_brand_model_year, amount_tax_data, on='ZIPCODE').rename(
final_zip_code_count_brand_model_year ={
columns'County': 'Vehicle Count'
}
)= merge(zip_code_count_brand_model, amount_tax_data, on='ZIPCODE').rename(
final_zip_code_count_brand_model ={
columns'County': 'Vehicle Count'
}
)= merge(zip_code_count_brand, amount_tax_data, on='ZIPCODE').rename(
final_zip_code_count_brand ={
columns'County': 'Vehicle Count'
}
)= final_zip_code_count_brand_model_year.drop(
final_zip_code_count_brand_model_year
['Adjust gross income (AGI) [8]',
'Number of volunteer prepared returns with Earned Income Credit [5]',
'Total income amount',
'Total statutory adjustments amount',
'Total standard deduction amount',
'Total itemized deductions amount',
'Amount of AGI for itemized returns',
'Total taxes paid amount',
'Taxable income amount',
'Income tax before credits amount',
'Alternative minimum tax amount',
'Total tax credits amount',
'Total premium tax credit amount',
'Total tax payments amount',
'Income tax after credits amount ',
'Total tax liability amount [18] ',
'Tax due at time of filing amount [19] ',
'Total overpayments amount',
'Overpayments refunded amount [20] ',
],= 1
axis
)'../../data/processed-data/brand_model_year_tax_data.csv', index=False)
final_zip_code_count_brand_model_year.to_csv('../../data/processed-data/brand_model_tax_data.csv', index=False)
final_zip_code_count_brand_model.to_csv('../../data/processed-data/brand_tax_data.csv', index=False) final_zip_code_count_brand.to_csv(
Execute Functions For Final Dataset
='data_dict2021')
data_dict_pre_process(filename='tax_data_2021', state='WA')
tax_data_pre_process(filename='WA')
ev_data_pre_process(state='WA') join_tax_ev_data(state