summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/constants.py90
-rw-r--r--scripts/investment_cleaning.py200
2 files changed, 290 insertions, 0 deletions
diff --git a/scripts/constants.py b/scripts/constants.py
new file mode 100644
index 0000000..7af110e
--- /dev/null
+++ b/scripts/constants.py
@@ -0,0 +1,90 @@
+COMPANY_NAMES = {
+ "AMERICAN EXPRESS" : "AMERICAN EXPRESS CO",
+ 'AIG GLOBAL' : "AMERICAN INTL GROUP INC",
+ "ANHEUSER-BUSCH" : "ANHEUSER-BUSCH CO",
+ "APTIV" : "APTIV CO",
+ "ASTRAZENECA" : "ASTRAZENECA PLC",
+ "AUSTRALIA & NEW" : "AUSTRALIA & NEW ZEALAND BKG GR",
+ "BAE SYS" : "BAE SYS PLC",
+ "BANCO SANTANDER" : "BANCO SANTANDER SA",
+ "BANK MONTREAL" : "BANK OF MONTREAL",
+ "BANK OF MONTREAL" : "BANK OF MONTREAL",
+ "BK MONTREAL" : "BANK OF MONTREAL",
+ "BANK NOVA SCOTIA" : "BANK OF NOVA SCOTIA",
+ "BANK OF NOVA SCOTIA" : "BANK OF NOVA SCOTIA",
+ "BANK AMER" : "BANK OF AMERICA CO",
+ "BAXTER INT" : "BAXTER INTERNATIONAL INC",
+ "BAYER US FIN" : "BAYER US FINANCE LLC",
+ "BB&T" : "BB&T CO",
+ "BLACKSTONE" : "BLACKSTONE",
+ "BMW" : "BMW",
+ "BNP PARIBAS" : "BNP PARIBAS",
+ "BRIGHTHOUSE" : "BRIGHTHOUSE",
+ "BRISTOL MYERS SQUIBB" : "BRISTOL MYERS SQUIBB CO",
+ "BRISTOL-MYERS SQUIBB" : "BRISTOL MYERS SQUIBB CO",
+ "CANADIAN IMPERIAL BK" : "CANADIAN IMPERIAL BK CO",
+ "CAPITAL ONE" : "CAPITAL ONE FINL CO",
+ "CATERPILLAR FINL" : "CATERPILLAR FINL",
+ "CENTERPOINT ENERGY" : "CENTERPOINT ENERGY INC",
+ "CHEVRON U S A" : "CHEVRON CO",
+ "CREDIT AGRICOLE" : "CREDIT AGRICOLE",
+ "CREDIT SUISSE" : "CREDIT SUISSE GROUP AG",
+ "CROWN CASTLE INTL" : "CROWN CASTLE INTL",
+ "DAIMLER" : "DAIMLER",
+ "DELTA AIR LINES" : "DELTA AIR LINES",
+ "DTE E" : "DTE ELEC",
+ "DUKE ENERGY" : "DUKE ENERGY CO",
+ "DOWDUPONT INC" : "DUPONT DE NEMOURS INC",
+ "ENTERGY" : "ENTERGY CO",
+ "EQUITABLE FINL LIFE" : "EQUITABLE FINL LIFE GLOBAL FDG",
+ "ESC CB LEHMAN BROS" : "ESC LEHMAN BROTH HLD INC",
+ "FIFTH THIRD BANCORP" : "FIFTH THIRD BANCORP",
+ "FLORIDA P" : "FLORIDA POWER & LIGHT CO",
+ "GENERAL MTRS" : "GENERAL MOTORS",
+ "GENERAL MOTORS" : "GENERAL MOTORS",
+ "HEWLETT PACKARD" : "HEWLETT PACKARD ENTERPRISE CO",
+ "HP INC" : "HEWLETT PACKARD ENTERPRISE CO",
+ "HUNTINGTON" : "HUNTINGTON NATL BK MD",
+ "JACKSON FINANCIAL INC" : "JACKSON NATIONAL LIFE GL",
+ "JPM CHASE" : "JPMORGAN CHASE & CO",
+ "KINDER MORGAN" : "KINDER MORGAN INC",
+ "LLOYDS BKG" : "LLOYDS BANKING GROUP PLC FORME",
+ "MACQUARIE" : "MACQUARIE BK LTD",
+ "MIZUHO" : "MIZUHO CO",
+ "MONDELEZ INT" : "MONDELEZ INTERNATIONAL INC",
+ "MORGAN STANLEY" : "MORGAN STANLEY",
+ "NATIONAL AUSTRALIA B" : "NATIONAL AUSTRALIA BANK",
+ "NATIONWIDE BLDG SOC" : "NATIONWIDE BLDG SOCIETY",
+ "NATIONAL BANK OF CANADA" : "NATIONAL BANK OF CANADA",
+ "NATL BK CDA" : "NATIONAL BANK OF CANADA",
+ "NATWEST M" : "NATWEST MARKETS PLC",
+ "NEXTERA ENERGY" : "NEXTERA ENERGY CAP",
+ "NORDEA BANK" : "NORDEA BANK",
+ "NORTHWESTERN" : "NORTHWESTERN MUT",
+ "NXP B V" : "NXP B V",
+ "PHILLIPS 66" : "PHILLIPS 66",
+ "PRINCIPAL LIFE GLOBAL" : "PRINCIPAL LIFE GLOBAL FDG",
+ "PROTECTIVE LIFE G" : "PROTECTIVE LIFE GLOBAL",
+ "PUBLIC SVC" : "PUBLIC SERVICE ELECTRIC & GAS",
+ "RABOBANK NEDERLAND" : "RABOBANK NEDERLAND",
+ "ROCHE H" : "ROCHE HOLDINGS INC",
+ "ROPER " : "ROPER TECHNOLOGIES INC",
+ "ROYAL BANK OF CANADA" : "ROYAL BANK OF CANADA",
+ "ROYAL BK CDA" : "ROYAL BANK OF CANADA",
+ "SCHLUMBERGER" : "SCHLUMBERGER",
+ "SIEMENS FINANCIERINGSMAA" : "SIEMENS FINANCIERINGSMAATSCHAP",
+ "SIMON PPTY GROUP" : "SIMON PPTY GROUP",
+ "STATE STR" : "STATE STREET CO",
+ "SUMITOMO MITSUI" : "SUMITOMO MITSUI BANKING",
+ "SWEDBANK AB" : "SWEDBANK AB",
+ "TORONTO DOMINION B" : "TORONTO DOMINION BANK",
+ "TOYOTA M" : "TOYOTA MOTOR",
+ "TRANS-CDA PIPELINES" : "TRANS-CDA PIPELINES",
+ "TRUIST " : "TRUIST BANK",
+ "TSMC " : "TSMC GLOBAL LTD",
+ "U S B" : "U S BANCORP",
+ "UBS " : "UBS AG LONDON",
+ "VENTAS REALTY" : "VENTAS REALTY LP",
+ "VOLKSWAGEN GROUP" : "VOLKSWAGEN GROUP",
+ "WESTPAC B" : "WESTPAC BANKING",
+} \ No newline at end of file
diff --git a/scripts/investment_cleaning.py b/scripts/investment_cleaning.py
new file mode 100644
index 0000000..fdb5a14
--- /dev/null
+++ b/scripts/investment_cleaning.py
@@ -0,0 +1,200 @@
+"""
+This script is intended to clean CSV files resulting from FIOA request
+for the University of Illinois Systems Office of Investments Annual Report
+Appendix D. As such, the input CSV files are expected to have 6 lines of
+header, and 6 columns: Account or Security, Coupon, Maturity Date,
+Quantity, Cost Value, and Market Value. This script will clean this CSV
+file and create an output CSV file called investment_fy<YEAR>
+"""
+
+from constants import COMPANY_NAMES
+
+import pandas as pd
+import numpy as np
+import yfinance as yf
+import requests
+import concurrent.futures
+
+"""
+Reading in Inputs from the Command Line
+"""
+
+print("Welcome to the Investment Cleaning Scripts. Please answer the following prompts so we may process your data.\n")
+print("Investment CSV File to be Cleaned (make sure the file is in the data directory)")
+input_file = input()
+input_file_path = "../data/" + input_file
+
+while 1:
+ try:
+ df = pd.read_excel(input_file_path, parse_dates=True).dropna(how='all')
+ break
+ except FileNotFoundError:
+ print(f"No such file or directory: '../data/{input_file}'")
+ print("Please enter a valid file path")
+ input_file = input()
+ input_file_path = "../data/" + input_file
+
+print("Fiscal Year of Annual Report (in YYYY format)")
+year = input()
+for i in range (10):
+ try:
+ year_found = df.iloc[i][0].year
+ break
+ except:
+ pass
+
+if int(year) != int(year_found):
+ print(f"Input Year {year} does not Match Year Found in Input CSV {year_found}")
+ print(f"Would you like to update the year to {year_found}? (y/n)")
+ if input().upper() == 'Y':
+ print(f"Updating Year to {year_found}")
+ year = year_found
+
+output_file_path = f'../data/investments_fy{year}.csv'
+
+"""
+Processing Dataframe
+"""
+starting_index = -1
+for i in range(10):
+ try:
+ if 'Account or Security' == df.iloc[i][0]:
+ starting_index = i + 2
+ break
+ except:
+ pass
+if starting_index < 0:
+ print("Could Not Find Entry 'Account or Security', Terminating the Script")
+ quit()
+
+df = pd.read_excel(input_file_path, parse_dates=True, skiprows=starting_index).dropna(how='all')
+
+## Setting up the Operating Pool Dataframe
+bank_i = df[df['Account or Security'].str.contains("9-200100", na=False)].index
+op_i = df[df['Account or Security'].str.contains("Operating Funds Pool", na=False)].index
+op_df = df.loc[bank_i[0]:op_i[1]-1]
+op_df.insert(6, 'Bank', pd.NA)
+op_df.insert(7, 'Asset Type', pd.NA)
+op_df.insert(8, 'Company', pd.NA)
+op_df.insert(9, 'Industry', pd.NA)
+op_df.insert(10, 'Private Placement', False)
+op_df.insert(11, 'Ticker', pd.NA)
+op_df.insert(11, 'Info', object)
+
+## Adding the bank and Asset Type to the Dataframe
+bank_name = pd.NA
+asset_type = pd.NA
+for i in op_df.index:
+ if np.isnan(op_df.loc[i]["Quantity"]):
+ if "9-200100" in df.loc[i]["Account or Security"]:
+ bank_name = df.loc[i]["Account or Security"]
+ else:
+ asset_type = df.loc[i]["Account or Security"]
+ op_df.at[i,'Bank'] = bank_name
+ op_df.at[i,'Asset Type'] = asset_type
+op_df.head()
+
+## Corporate Bonds Dataframe
+cb_df = op_df[op_df['Asset Type'].str.contains("Corporate Bonds", na=False)]
+
+## Clean Company Names
+company_names = set()
+for i in cb_df.index:
+ # Set the Company to be cleaned
+ cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security']
+ if not np.isnan(cb_df.at[i,'Quantity']):
+ # clean private placement
+ for prefix in ["PVTPL", "PVPTL", "PVYPL", "PVT PL", "PVPTL"]:
+ if prefix in cb_df.loc[i]["Company"]:
+ cb_df.at[i,'Private Placement'] = True
+ cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip()
+ for end in [" CAP", " INC", " FDG", " CORP", " CO", " LLC", " CR", " SR", " A/S", " LP", " ASA", " LTD", ]:
+ if end in cb_df.at[i, "Company"]:
+ cb_df.at[i, "Company"] = cb_df.at[i, 'Company'].split(end)[0].strip()+" "+end
+ for token in ['%']:
+ if token in cb_df.at[i, "Company"]:
+ # get everything before the token, then get everything before the last space
+ cb_df.at[i, "Company"] = cb_df.at[i, "Company"].split(token)[0].rsplit(' ', 1)[0].strip()
+ for key, value in COMPANY_NAMES.items():
+ if key in cb_df.at[i, "Company"]:
+ cb_df.at[i, "Company"] = value
+ company_names.add(cb_df.at[i, "Company"])
+ else:
+ cb_df.drop(i, axis=0)
+
+
+## Get Company Tickers
+# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063
+def lookup_ticker(company_name):
+ yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
+ user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+ params = {"q": company_name, "quotes_count": 1, "country": "United States"}
+
+ res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})
+ data = res.json()
+
+ ticker = data['quotes'][0]['symbol']
+ return ticker
+
+def get_ticker(name):
+ try:
+ # try to get the ticker
+ ticker = lookup_ticker(name)
+ except:
+ try:
+ # shorten the name and try again
+ short_name = name.split(' ')[0]
+ ticker = lookup_ticker(short_name)
+ except:
+ # no ticker could be found, probably a private company, check by hand to make sure
+ ticker = 'NO_TICKER_FOUND'
+ return (name, ticker)
+
+company_name_to_ticker = dict()
+with concurrent.futures.ThreadPoolExecutor() as executor:
+ futures = [executor.submit(get_ticker, name) for name in company_names]
+ for future in concurrent.futures.as_completed(futures):
+ name, ticker = future.result()
+ company_name_to_ticker[name] = ticker
+
+## Match Company Name to Ticker in Dataframe
+for i in cb_df.index:
+ try:
+ cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']]
+ except:
+ assert cb_df.at[i,'Company'] == 'Corporate Bonds', f"Expected Cororate Bonds, got {cb_df.at[i,'Company']}"
+
+## Get Company Info
+def get_info_from_ticker(ticker):
+ search_results = yf.Tickers(ticker)
+ return search_results.tickers[ticker].info
+
+def get_info(name):
+ try:
+ ticker = company_name_to_ticker[name]
+ info = get_info_from_ticker(ticker)
+ except:
+ info = 'No Info Found'
+ return (name, info)
+
+## use parallelization to speed up this process
+company_info_dict = dict()
+with concurrent.futures.ThreadPoolExecutor() as executor:
+ futures = [executor.submit(get_info, name) for name in company_names]
+ for future in concurrent.futures.as_completed(futures):
+ name, info = future.result()
+ company_info_dict[name] = info
+
+"""
+Output Results to File
+"""
+cb_df.to_csv(output_file_path)
+print(f"Investment Processing are Done, Saving to File {output_file_path}")
+
+
+"""
+Investments Meta Information Output
+"""
+print("""Corporate Bond Totals""")
+print("Cost Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Cost Value"]))
+print("Market Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Market Value"])) \ No newline at end of file