diff options
| author | Gabriel Kosmacher <73120774+kennykos@users.noreply.github.com> | 2023-06-29 15:40:53 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-06-29 15:40:53 -0500 |
| commit | 87472cbe8f20612b21f26f8b1a8e8f3c2d55163d (patch) | |
| tree | 5266b052fcf68b6511b39e9a361d465925a89149 /scripts | |
| parent | 3a10bd84157892f9160b5a8b80c6ceb366556703 (diff) | |
6 turn UI security inventory 23 parsingipynb into python script (#5)
* commit clean notebook
* initilizing investments cleaning script
* finishing up command line prompts, being processing data
* investment_cleaning.py script is implemented
* Investment Cleaning Script is Implemented
---------
Co-authored-by: kennykos <“gkosmacher01@gmail.com”>
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/constants.py | 90 | ||||
| -rw-r--r-- | scripts/investment_cleaning.py | 200 |
2 files changed, 290 insertions, 0 deletions
diff --git a/scripts/constants.py b/scripts/constants.py new file mode 100644 index 0000000..7af110e --- /dev/null +++ b/scripts/constants.py @@ -0,0 +1,90 @@ +COMPANY_NAMES = { + "AMERICAN EXPRESS" : "AMERICAN EXPRESS CO", + 'AIG GLOBAL' : "AMERICAN INTL GROUP INC", + "ANHEUSER-BUSCH" : "ANHEUSER-BUSCH CO", + "APTIV" : "APTIV CO", + "ASTRAZENECA" : "ASTRAZENECA PLC", + "AUSTRALIA & NEW" : "AUSTRALIA & NEW ZEALAND BKG GR", + "BAE SYS" : "BAE SYS PLC", + "BANCO SANTANDER" : "BANCO SANTANDER SA", + "BANK MONTREAL" : "BANK OF MONTREAL", + "BANK OF MONTREAL" : "BANK OF MONTREAL", + "BK MONTREAL" : "BANK OF MONTREAL", + "BANK NOVA SCOTIA" : "BANK OF NOVA SCOTIA", + "BANK OF NOVA SCOTIA" : "BANK OF NOVA SCOTIA", + "BANK AMER" : "BANK OF AMERICA CO", + "BAXTER INT" : "BAXTER INTERNATIONAL INC", + "BAYER US FIN" : "BAYER US FINANCE LLC", + "BB&T" : "BB&T CO", + "BLACKSTONE" : "BLACKSTONE", + "BMW" : "BMW", + "BNP PARIBAS" : "BNP PARIBAS", + "BRIGHTHOUSE" : "BRIGHTHOUSE", + "BRISTOL MYERS SQUIBB" : "BRISTOL MYERS SQUIBB CO", + "BRISTOL-MYERS SQUIBB" : "BRISTOL MYERS SQUIBB CO", + "CANADIAN IMPERIAL BK" : "CANADIAN IMPERIAL BK CO", + "CAPITAL ONE" : "CAPITAL ONE FINL CO", + "CATERPILLAR FINL" : "CATERPILLAR FINL", + "CENTERPOINT ENERGY" : "CENTERPOINT ENERGY INC", + "CHEVRON U S A" : "CHEVRON CO", + "CREDIT AGRICOLE" : "CREDIT AGRICOLE", + "CREDIT SUISSE" : "CREDIT SUISSE GROUP AG", + "CROWN CASTLE INTL" : "CROWN CASTLE INTL", + "DAIMLER" : "DAIMLER", + "DELTA AIR LINES" : "DELTA AIR LINES", + "DTE E" : "DTE ELEC", + "DUKE ENERGY" : "DUKE ENERGY CO", + "DOWDUPONT INC" : "DUPONT DE NEMOURS INC", + "ENTERGY" : "ENTERGY CO", + "EQUITABLE FINL LIFE" : "EQUITABLE FINL LIFE GLOBAL FDG", + "ESC CB LEHMAN BROS" : "ESC LEHMAN BROTH HLD INC", + "FIFTH THIRD BANCORP" : "FIFTH THIRD BANCORP", + "FLORIDA P" : "FLORIDA POWER & LIGHT CO", + "GENERAL MTRS" : "GENERAL MOTORS", + "GENERAL MOTORS" : "GENERAL MOTORS", + "HEWLETT PACKARD" : "HEWLETT PACKARD ENTERPRISE CO", + "HP INC" : "HEWLETT PACKARD ENTERPRISE CO", + "HUNTINGTON" : "HUNTINGTON NATL BK MD", + "JACKSON FINANCIAL INC" : "JACKSON NATIONAL LIFE GL", + "JPM CHASE" : "JPMORGAN CHASE & CO", + "KINDER MORGAN" : "KINDER MORGAN INC", + "LLOYDS BKG" : "LLOYDS BANKING GROUP PLC FORME", + "MACQUARIE" : "MACQUARIE BK LTD", + "MIZUHO" : "MIZUHO CO", + "MONDELEZ INT" : "MONDELEZ INTERNATIONAL INC", + "MORGAN STANLEY" : "MORGAN STANLEY", + "NATIONAL AUSTRALIA B" : "NATIONAL AUSTRALIA BANK", + "NATIONWIDE BLDG SOC" : "NATIONWIDE BLDG SOCIETY", + "NATIONAL BANK OF CANADA" : "NATIONAL BANK OF CANADA", + "NATL BK CDA" : "NATIONAL BANK OF CANADA", + "NATWEST M" : "NATWEST MARKETS PLC", + "NEXTERA ENERGY" : "NEXTERA ENERGY CAP", + "NORDEA BANK" : "NORDEA BANK", + "NORTHWESTERN" : "NORTHWESTERN MUT", + "NXP B V" : "NXP B V", + "PHILLIPS 66" : "PHILLIPS 66", + "PRINCIPAL LIFE GLOBAL" : "PRINCIPAL LIFE GLOBAL FDG", + "PROTECTIVE LIFE G" : "PROTECTIVE LIFE GLOBAL", + "PUBLIC SVC" : "PUBLIC SERVICE ELECTRIC & GAS", + "RABOBANK NEDERLAND" : "RABOBANK NEDERLAND", + "ROCHE H" : "ROCHE HOLDINGS INC", + "ROPER " : "ROPER TECHNOLOGIES INC", + "ROYAL BANK OF CANADA" : "ROYAL BANK OF CANADA", + "ROYAL BK CDA" : "ROYAL BANK OF CANADA", + "SCHLUMBERGER" : "SCHLUMBERGER", + "SIEMENS FINANCIERINGSMAA" : "SIEMENS FINANCIERINGSMAATSCHAP", + "SIMON PPTY GROUP" : "SIMON PPTY GROUP", + "STATE STR" : "STATE STREET CO", + "SUMITOMO MITSUI" : "SUMITOMO MITSUI BANKING", + "SWEDBANK AB" : "SWEDBANK AB", + "TORONTO DOMINION B" : "TORONTO DOMINION BANK", + "TOYOTA M" : "TOYOTA MOTOR", + "TRANS-CDA PIPELINES" : "TRANS-CDA PIPELINES", + "TRUIST " : "TRUIST BANK", + "TSMC " : "TSMC GLOBAL LTD", + "U S B" : "U S BANCORP", + "UBS " : "UBS AG LONDON", + "VENTAS REALTY" : "VENTAS REALTY LP", + "VOLKSWAGEN GROUP" : "VOLKSWAGEN GROUP", + "WESTPAC B" : "WESTPAC BANKING", +}
\ No newline at end of file diff --git a/scripts/investment_cleaning.py b/scripts/investment_cleaning.py new file mode 100644 index 0000000..fdb5a14 --- /dev/null +++ b/scripts/investment_cleaning.py @@ -0,0 +1,200 @@ +""" +This script is intended to clean CSV files resulting from FIOA request +for the University of Illinois Systems Office of Investments Annual Report +Appendix D. As such, the input CSV files are expected to have 6 lines of +header, and 6 columns: Account or Security, Coupon, Maturity Date, +Quantity, Cost Value, and Market Value. This script will clean this CSV +file and create an output CSV file called investment_fy<YEAR> +""" + +from constants import COMPANY_NAMES + +import pandas as pd +import numpy as np +import yfinance as yf +import requests +import concurrent.futures + +""" +Reading in Inputs from the Command Line +""" + +print("Welcome to the Investment Cleaning Scripts. Please answer the following prompts so we may process your data.\n") +print("Investment CSV File to be Cleaned (make sure the file is in the data directory)") +input_file = input() +input_file_path = "../data/" + input_file + +while 1: + try: + df = pd.read_excel(input_file_path, parse_dates=True).dropna(how='all') + break + except FileNotFoundError: + print(f"No such file or directory: '../data/{input_file}'") + print("Please enter a valid file path") + input_file = input() + input_file_path = "../data/" + input_file + +print("Fiscal Year of Annual Report (in YYYY format)") +year = input() +for i in range (10): + try: + year_found = df.iloc[i][0].year + break + except: + pass + +if int(year) != int(year_found): + print(f"Input Year {year} does not Match Year Found in Input CSV {year_found}") + print(f"Would you like to update the year to {year_found}? (y/n)") + if input().upper() == 'Y': + print(f"Updating Year to {year_found}") + year = year_found + +output_file_path = f'../data/investments_fy{year}.csv' + +""" +Processing Dataframe +""" +starting_index = -1 +for i in range(10): + try: + if 'Account or Security' == df.iloc[i][0]: + starting_index = i + 2 + break + except: + pass +if starting_index < 0: + print("Could Not Find Entry 'Account or Security', Terminating the Script") + quit() + +df = pd.read_excel(input_file_path, parse_dates=True, skiprows=starting_index).dropna(how='all') + +## Setting up the Operating Pool Dataframe +bank_i = df[df['Account or Security'].str.contains("9-200100", na=False)].index +op_i = df[df['Account or Security'].str.contains("Operating Funds Pool", na=False)].index +op_df = df.loc[bank_i[0]:op_i[1]-1] +op_df.insert(6, 'Bank', pd.NA) +op_df.insert(7, 'Asset Type', pd.NA) +op_df.insert(8, 'Company', pd.NA) +op_df.insert(9, 'Industry', pd.NA) +op_df.insert(10, 'Private Placement', False) +op_df.insert(11, 'Ticker', pd.NA) +op_df.insert(11, 'Info', object) + +## Adding the bank and Asset Type to the Dataframe +bank_name = pd.NA +asset_type = pd.NA +for i in op_df.index: + if np.isnan(op_df.loc[i]["Quantity"]): + if "9-200100" in df.loc[i]["Account or Security"]: + bank_name = df.loc[i]["Account or Security"] + else: + asset_type = df.loc[i]["Account or Security"] + op_df.at[i,'Bank'] = bank_name + op_df.at[i,'Asset Type'] = asset_type +op_df.head() + +## Corporate Bonds Dataframe +cb_df = op_df[op_df['Asset Type'].str.contains("Corporate Bonds", na=False)] + +## Clean Company Names +company_names = set() +for i in cb_df.index: + # Set the Company to be cleaned + cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security'] + if not np.isnan(cb_df.at[i,'Quantity']): + # clean private placement + for prefix in ["PVTPL", "PVPTL", "PVYPL", "PVT PL", "PVPTL"]: + if prefix in cb_df.loc[i]["Company"]: + cb_df.at[i,'Private Placement'] = True + cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip() + for end in [" CAP", " INC", " FDG", " CORP", " CO", " LLC", " CR", " SR", " A/S", " LP", " ASA", " LTD", ]: + if end in cb_df.at[i, "Company"]: + cb_df.at[i, "Company"] = cb_df.at[i, 'Company'].split(end)[0].strip()+" "+end + for token in ['%']: + if token in cb_df.at[i, "Company"]: + # get everything before the token, then get everything before the last space + cb_df.at[i, "Company"] = cb_df.at[i, "Company"].split(token)[0].rsplit(' ', 1)[0].strip() + for key, value in COMPANY_NAMES.items(): + if key in cb_df.at[i, "Company"]: + cb_df.at[i, "Company"] = value + company_names.add(cb_df.at[i, "Company"]) + else: + cb_df.drop(i, axis=0) + + +## Get Company Tickers +# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063 +def lookup_ticker(company_name): + yfinance = "https://query2.finance.yahoo.com/v1/finance/search" + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' + params = {"q": company_name, "quotes_count": 1, "country": "United States"} + + res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent}) + data = res.json() + + ticker = data['quotes'][0]['symbol'] + return ticker + +def get_ticker(name): + try: + # try to get the ticker + ticker = lookup_ticker(name) + except: + try: + # shorten the name and try again + short_name = name.split(' ')[0] + ticker = lookup_ticker(short_name) + except: + # no ticker could be found, probably a private company, check by hand to make sure + ticker = 'NO_TICKER_FOUND' + return (name, ticker) + +company_name_to_ticker = dict() +with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(get_ticker, name) for name in company_names] + for future in concurrent.futures.as_completed(futures): + name, ticker = future.result() + company_name_to_ticker[name] = ticker + +## Match Company Name to Ticker in Dataframe +for i in cb_df.index: + try: + cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']] + except: + assert cb_df.at[i,'Company'] == 'Corporate Bonds', f"Expected Cororate Bonds, got {cb_df.at[i,'Company']}" + +## Get Company Info +def get_info_from_ticker(ticker): + search_results = yf.Tickers(ticker) + return search_results.tickers[ticker].info + +def get_info(name): + try: + ticker = company_name_to_ticker[name] + info = get_info_from_ticker(ticker) + except: + info = 'No Info Found' + return (name, info) + +## use parallelization to speed up this process +company_info_dict = dict() +with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(get_info, name) for name in company_names] + for future in concurrent.futures.as_completed(futures): + name, info = future.result() + company_info_dict[name] = info + +""" +Output Results to File +""" +cb_df.to_csv(output_file_path) +print(f"Investment Processing are Done, Saving to File {output_file_path}") + + +""" +Investments Meta Information Output +""" +print("""Corporate Bond Totals""") +print("Cost Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Cost Value"])) +print("Market Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Market Value"]))
\ No newline at end of file |
