From 87472cbe8f20612b21f26f8b1a8e8f3c2d55163d Mon Sep 17 00:00:00 2001 From: Gabriel Kosmacher <73120774+kennykos@users.noreply.github.com> Date: Thu, 29 Jun 2023 15:40:53 -0500 Subject: 6 turn UI security inventory 23 parsingipynb into python script (#5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * commit clean notebook * initilizing investments cleaning script * finishing up command line prompts, being processing data * investment_cleaning.py script is implemented * Investment Cleaning Script is Implemented --------- Co-authored-by: kennykos <“gkosmacher01@gmail.com”> --- scripts/investment_cleaning.py | 200 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 scripts/investment_cleaning.py (limited to 'scripts/investment_cleaning.py') diff --git a/scripts/investment_cleaning.py b/scripts/investment_cleaning.py new file mode 100644 index 0000000..fdb5a14 --- /dev/null +++ b/scripts/investment_cleaning.py @@ -0,0 +1,200 @@ +""" +This script is intended to clean CSV files resulting from FIOA request +for the University of Illinois Systems Office of Investments Annual Report +Appendix D. As such, the input CSV files are expected to have 6 lines of +header, and 6 columns: Account or Security, Coupon, Maturity Date, +Quantity, Cost Value, and Market Value. This script will clean this CSV +file and create an output CSV file called investment_fy +""" + +from constants import COMPANY_NAMES + +import pandas as pd +import numpy as np +import yfinance as yf +import requests +import concurrent.futures + +""" +Reading in Inputs from the Command Line +""" + +print("Welcome to the Investment Cleaning Scripts. Please answer the following prompts so we may process your data.\n") +print("Investment CSV File to be Cleaned (make sure the file is in the data directory)") +input_file = input() +input_file_path = "../data/" + input_file + +while 1: + try: + df = pd.read_excel(input_file_path, parse_dates=True).dropna(how='all') + break + except FileNotFoundError: + print(f"No such file or directory: '../data/{input_file}'") + print("Please enter a valid file path") + input_file = input() + input_file_path = "../data/" + input_file + +print("Fiscal Year of Annual Report (in YYYY format)") +year = input() +for i in range (10): + try: + year_found = df.iloc[i][0].year + break + except: + pass + +if int(year) != int(year_found): + print(f"Input Year {year} does not Match Year Found in Input CSV {year_found}") + print(f"Would you like to update the year to {year_found}? (y/n)") + if input().upper() == 'Y': + print(f"Updating Year to {year_found}") + year = year_found + +output_file_path = f'../data/investments_fy{year}.csv' + +""" +Processing Dataframe +""" +starting_index = -1 +for i in range(10): + try: + if 'Account or Security' == df.iloc[i][0]: + starting_index = i + 2 + break + except: + pass +if starting_index < 0: + print("Could Not Find Entry 'Account or Security', Terminating the Script") + quit() + +df = pd.read_excel(input_file_path, parse_dates=True, skiprows=starting_index).dropna(how='all') + +## Setting up the Operating Pool Dataframe +bank_i = df[df['Account or Security'].str.contains("9-200100", na=False)].index +op_i = df[df['Account or Security'].str.contains("Operating Funds Pool", na=False)].index +op_df = df.loc[bank_i[0]:op_i[1]-1] +op_df.insert(6, 'Bank', pd.NA) +op_df.insert(7, 'Asset Type', pd.NA) +op_df.insert(8, 'Company', pd.NA) +op_df.insert(9, 'Industry', pd.NA) +op_df.insert(10, 'Private Placement', False) +op_df.insert(11, 'Ticker', pd.NA) +op_df.insert(11, 'Info', object) + +## Adding the bank and Asset Type to the Dataframe +bank_name = pd.NA +asset_type = pd.NA +for i in op_df.index: + if np.isnan(op_df.loc[i]["Quantity"]): + if "9-200100" in df.loc[i]["Account or Security"]: + bank_name = df.loc[i]["Account or Security"] + else: + asset_type = df.loc[i]["Account or Security"] + op_df.at[i,'Bank'] = bank_name + op_df.at[i,'Asset Type'] = asset_type +op_df.head() + +## Corporate Bonds Dataframe +cb_df = op_df[op_df['Asset Type'].str.contains("Corporate Bonds", na=False)] + +## Clean Company Names +company_names = set() +for i in cb_df.index: + # Set the Company to be cleaned + cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security'] + if not np.isnan(cb_df.at[i,'Quantity']): + # clean private placement + for prefix in ["PVTPL", "PVPTL", "PVYPL", "PVT PL", "PVPTL"]: + if prefix in cb_df.loc[i]["Company"]: + cb_df.at[i,'Private Placement'] = True + cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip() + for end in [" CAP", " INC", " FDG", " CORP", " CO", " LLC", " CR", " SR", " A/S", " LP", " ASA", " LTD", ]: + if end in cb_df.at[i, "Company"]: + cb_df.at[i, "Company"] = cb_df.at[i, 'Company'].split(end)[0].strip()+" "+end + for token in ['%']: + if token in cb_df.at[i, "Company"]: + # get everything before the token, then get everything before the last space + cb_df.at[i, "Company"] = cb_df.at[i, "Company"].split(token)[0].rsplit(' ', 1)[0].strip() + for key, value in COMPANY_NAMES.items(): + if key in cb_df.at[i, "Company"]: + cb_df.at[i, "Company"] = value + company_names.add(cb_df.at[i, "Company"]) + else: + cb_df.drop(i, axis=0) + + +## Get Company Tickers +# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063 +def lookup_ticker(company_name): + yfinance = "https://query2.finance.yahoo.com/v1/finance/search" + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' + params = {"q": company_name, "quotes_count": 1, "country": "United States"} + + res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent}) + data = res.json() + + ticker = data['quotes'][0]['symbol'] + return ticker + +def get_ticker(name): + try: + # try to get the ticker + ticker = lookup_ticker(name) + except: + try: + # shorten the name and try again + short_name = name.split(' ')[0] + ticker = lookup_ticker(short_name) + except: + # no ticker could be found, probably a private company, check by hand to make sure + ticker = 'NO_TICKER_FOUND' + return (name, ticker) + +company_name_to_ticker = dict() +with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(get_ticker, name) for name in company_names] + for future in concurrent.futures.as_completed(futures): + name, ticker = future.result() + company_name_to_ticker[name] = ticker + +## Match Company Name to Ticker in Dataframe +for i in cb_df.index: + try: + cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']] + except: + assert cb_df.at[i,'Company'] == 'Corporate Bonds', f"Expected Cororate Bonds, got {cb_df.at[i,'Company']}" + +## Get Company Info +def get_info_from_ticker(ticker): + search_results = yf.Tickers(ticker) + return search_results.tickers[ticker].info + +def get_info(name): + try: + ticker = company_name_to_ticker[name] + info = get_info_from_ticker(ticker) + except: + info = 'No Info Found' + return (name, info) + +## use parallelization to speed up this process +company_info_dict = dict() +with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(get_info, name) for name in company_names] + for future in concurrent.futures.as_completed(futures): + name, info = future.result() + company_info_dict[name] = info + +""" +Output Results to File +""" +cb_df.to_csv(output_file_path) +print(f"Investment Processing are Done, Saving to File {output_file_path}") + + +""" +Investments Meta Information Output +""" +print("""Corporate Bond Totals""") +print("Cost Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Cost Value"])) +print("Market Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Market Value"])) \ No newline at end of file -- cgit v1.2.3