From 87472cbe8f20612b21f26f8b1a8e8f3c2d55163d Mon Sep 17 00:00:00 2001
From: Gabriel Kosmacher <73120774+kennykos@users.noreply.github.com>
Date: Thu, 29 Jun 2023 15:40:53 -0500
Subject: 6 turn UI security inventory 23 parsingipynb into python script (#5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* commit clean notebook

* initilizing investments cleaning script

* finishing up command line prompts, being processing data

* investment_cleaning.py script is implemented

* Investment Cleaning Script is Implemented

---------

Co-authored-by: kennykos <“gkosmacher01@gmail.com”>
---
 scripts/investment_cleaning.py | 200 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 scripts/investment_cleaning.py

(limited to 'scripts/investment_cleaning.py')
diff --git a/scripts/investment_cleaning.py b/scripts/investment_cleaning.py
new file mode 100644
index 0000000..fdb5a14
--- /dev/null
+++ b/scripts/investment_cleaning.py
@@ -0,0 +1,200 @@
+"""
+This script is intended to clean CSV files resulting from FIOA request
+for the University of Illinois Systems Office of Investments Annual Report
+Appendix D. As such, the input CSV files are expected to have 6 lines of
+header, and 6 columns: Account or Security, Coupon, Maturity Date,
+Quantity, Cost Value, and Market Value. This script will clean this CSV
+file and create an output CSV file called investment_fy<YEAR>
+"""
+
+from constants import COMPANY_NAMES 
+
+import pandas as pd
+import numpy as np
+import yfinance as yf
+import requests
+import concurrent.futures
+
+"""
+Reading in Inputs from the Command Line
+"""
+
+print("Welcome to the Investment Cleaning Scripts. Please answer the following prompts so we may process your data.\n")
+print("Investment CSV File to be Cleaned (make sure the file is in the data directory)")
+input_file = input()
+input_file_path = "../data/" + input_file
+
+while 1:
+    try:
+        df = pd.read_excel(input_file_path, parse_dates=True).dropna(how='all')
+        break
+    except FileNotFoundError:
+        print(f"No such file or directory: '../data/{input_file}'")
+        print("Please enter a valid file path")
+        input_file = input()
+        input_file_path = "../data/" + input_file
+
+print("Fiscal Year of Annual Report (in YYYY format)")
+year = input()
+for i in range (10):
+    try:
+        year_found = df.iloc[i][0].year
+        break
+    except:
+        pass
+
+if int(year) != int(year_found):
+    print(f"Input Year {year} does not Match Year Found in Input CSV {year_found}")
+    print(f"Would you like to update the year to {year_found}? (y/n)")
+    if input().upper() == 'Y':
+        print(f"Updating Year to {year_found}")
+        year = year_found
+
+output_file_path = f'../data/investments_fy{year}.csv'
+
+"""
+Processing Dataframe
+"""
+starting_index = -1
+for i in range(10):
+    try:
+        if 'Account or Security' == df.iloc[i][0]:
+            starting_index = i + 2
+            break
+    except:
+        pass
+if starting_index < 0:
+    print("Could Not Find Entry 'Account or Security', Terminating the Script")
+    quit()
+
+df = pd.read_excel(input_file_path, parse_dates=True, skiprows=starting_index).dropna(how='all')
+
+## Setting up the Operating Pool Dataframe
+bank_i = df[df['Account or Security'].str.contains("9-200100", na=False)].index
+op_i = df[df['Account or Security'].str.contains("Operating Funds Pool", na=False)].index
+op_df = df.loc[bank_i[0]:op_i[1]-1]
+op_df.insert(6, 'Bank', pd.NA)
+op_df.insert(7, 'Asset Type', pd.NA)
+op_df.insert(8, 'Company', pd.NA)
+op_df.insert(9, 'Industry', pd.NA)
+op_df.insert(10, 'Private Placement', False)
+op_df.insert(11, 'Ticker', pd.NA)
+op_df.insert(11, 'Info', object)
+
+## Adding the bank and Asset Type to the Dataframe
+bank_name = pd.NA
+asset_type = pd.NA
+for i in op_df.index:
+    if np.isnan(op_df.loc[i]["Quantity"]):
+        if "9-200100" in df.loc[i]["Account or Security"]:
+            bank_name = df.loc[i]["Account or Security"]
+        else:
+            asset_type = df.loc[i]["Account or Security"]
+    op_df.at[i,'Bank'] = bank_name
+    op_df.at[i,'Asset Type'] = asset_type
+op_df.head()
+
+## Corporate Bonds Dataframe
+cb_df = op_df[op_df['Asset Type'].str.contains("Corporate Bonds", na=False)]
+
+## Clean Company Names
+company_names = set()
+for i in cb_df.index:
+    # Set the Company to be cleaned
+    cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security']
+    if not np.isnan(cb_df.at[i,'Quantity']):
+        # clean private placement
+        for prefix in ["PVTPL", "PVPTL", "PVYPL", "PVT PL", "PVPTL"]:
+            if prefix in cb_df.loc[i]["Company"]:
+                cb_df.at[i,'Private Placement'] = True
+                cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip()
+        for end in [" CAP", " INC", " FDG", " CORP", " CO", " LLC", " CR", " SR", " A/S", " LP", " ASA", " LTD", ]:
+            if end in cb_df.at[i, "Company"]:
+                cb_df.at[i, "Company"] = cb_df.at[i, 'Company'].split(end)[0].strip()+" "+end
+        for token in ['%']:
+            if token in cb_df.at[i, "Company"]:
+                # get everything before the token, then get everything before the last space
+                cb_df.at[i, "Company"] = cb_df.at[i, "Company"].split(token)[0].rsplit(' ', 1)[0].strip()
+        for key, value in COMPANY_NAMES.items():
+            if key in cb_df.at[i, "Company"]:
+                cb_df.at[i, "Company"] = value
+        company_names.add(cb_df.at[i, "Company"])
+    else:
+        cb_df.drop(i, axis=0)
+
+
+## Get Company Tickers
+# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063
+def lookup_ticker(company_name):
+    yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+    params = {"q": company_name, "quotes_count": 1, "country": "United States"}
+
+    res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})
+    data = res.json()
+
+    ticker = data['quotes'][0]['symbol']
+    return ticker
+
+def get_ticker(name):
+    try:
+        # try to get the ticker
+        ticker = lookup_ticker(name)
+    except:
+        try:
+            # shorten the name and try again
+            short_name = name.split(' ')[0]
+            ticker = lookup_ticker(short_name)
+        except:
+            # no ticker could be found, probably a private company, check by hand to make sure
+            ticker = 'NO_TICKER_FOUND'
+    return (name, ticker)
+
+company_name_to_ticker = dict()
+with concurrent.futures.ThreadPoolExecutor() as executor:
+    futures = [executor.submit(get_ticker, name) for name in company_names]
+    for future in concurrent.futures.as_completed(futures):
+        name, ticker = future.result()
+        company_name_to_ticker[name] = ticker
+
+## Match Company Name to Ticker in Dataframe
+for i in cb_df.index:
+    try:
+        cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']]
+    except:
+        assert cb_df.at[i,'Company'] == 'Corporate Bonds', f"Expected Cororate Bonds, got {cb_df.at[i,'Company']}"
+
+## Get Company Info
+def get_info_from_ticker(ticker):
+    search_results = yf.Tickers(ticker)
+    return search_results.tickers[ticker].info
+
+def get_info(name):
+    try:
+        ticker = company_name_to_ticker[name]
+        info = get_info_from_ticker(ticker)
+    except:
+        info = 'No Info Found'
+    return (name, info)
+
+## use parallelization to speed up this process
+company_info_dict = dict()
+with concurrent.futures.ThreadPoolExecutor() as executor:
+    futures = [executor.submit(get_info, name) for name in company_names]
+    for future in concurrent.futures.as_completed(futures):
+        name, info = future.result()
+        company_info_dict[name] = info
+
+"""
+Output Results to File
+"""
+cb_df.to_csv(output_file_path)
+print(f"Investment Processing are Done, Saving to File {output_file_path}")
+
+
+"""
+Investments Meta Information Output
+"""
+print("""Corporate Bond Totals""")
+print("Cost Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Cost Value"]))
+print("Market Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Market Value"]))
\ No newline at end of file
-- 
cgit v1.2.3