scripts/investment_cleaning.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200

"""
This script is intended to clean CSV files resulting from FIOA request
for the University of Illinois Systems Office of Investments Annual Report
Appendix D. As such, the input CSV files are expected to have 6 lines of
header, and 6 columns: Account or Security, Coupon, Maturity Date,
Quantity, Cost Value, and Market Value. This script will clean this CSV
file and create an output CSV file called investment_fy<YEAR>
"""

from constants import COMPANY_NAMES 

import pandas as pd
import numpy as np
import yfinance as yf
import requests
import concurrent.futures

"""
Reading in Inputs from the Command Line
"""

print("Welcome to the Investment Cleaning Scripts. Please answer the following prompts so we may process your data.\n")
print("Investment CSV File to be Cleaned (make sure the file is in the data directory)")
input_file = input()
input_file_path = "../data/" + input_file

while 1:
    try:
        df = pd.read_excel(input_file_path, parse_dates=True).dropna(how='all')
        break
    except FileNotFoundError:
        print(f"No such file or directory: '../data/{input_file}'")
        print("Please enter a valid file path")
        input_file = input()
        input_file_path = "../data/" + input_file

print("Fiscal Year of Annual Report (in YYYY format)")
year = input()
for i in range (10):
    try:
        year_found = df.iloc[i][0].year
        break
    except:
        pass

if int(year) != int(year_found):
    print(f"Input Year {year} does not Match Year Found in Input CSV {year_found}")
    print(f"Would you like to update the year to {year_found}? (y/n)")
    if input().upper() == 'Y':
        print(f"Updating Year to {year_found}")
        year = year_found

output_file_path = f'../data/investments_fy{year}.csv'

"""
Processing Dataframe
"""
starting_index = -1
for i in range(10):
    try:
        if 'Account or Security' == df.iloc[i][0]:
            starting_index = i + 2
            break
    except:
        pass
if starting_index < 0:
    print("Could Not Find Entry 'Account or Security', Terminating the Script")
    quit()

df = pd.read_excel(input_file_path, parse_dates=True, skiprows=starting_index).dropna(how='all')

## Setting up the Operating Pool Dataframe
bank_i = df[df['Account or Security'].str.contains("9-200100", na=False)].index
op_i = df[df['Account or Security'].str.contains("Operating Funds Pool", na=False)].index
op_df = df.loc[bank_i[0]:op_i[1]-1]
op_df.insert(6, 'Bank', pd.NA)
op_df.insert(7, 'Asset Type', pd.NA)
op_df.insert(8, 'Company', pd.NA)
op_df.insert(9, 'Industry', pd.NA)
op_df.insert(10, 'Private Placement', False)
op_df.insert(11, 'Ticker', pd.NA)
op_df.insert(11, 'Info', object)

## Adding the bank and Asset Type to the Dataframe
bank_name = pd.NA
asset_type = pd.NA
for i in op_df.index:
    if np.isnan(op_df.loc[i]["Quantity"]):
        if "9-200100" in df.loc[i]["Account or Security"]:
            bank_name = df.loc[i]["Account or Security"]
        else:
            asset_type = df.loc[i]["Account or Security"]
    op_df.at[i,'Bank'] = bank_name
    op_df.at[i,'Asset Type'] = asset_type
op_df.head()

## Corporate Bonds Dataframe
cb_df = op_df[op_df['Asset Type'].str.contains("Corporate Bonds", na=False)]

## Clean Company Names
company_names = set()
for i in cb_df.index:
    # Set the Company to be cleaned
    cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security']
    if not np.isnan(cb_df.at[i,'Quantity']):
        # clean private placement
        for prefix in ["PVTPL", "PVPTL", "PVYPL", "PVT PL", "PVPTL"]:
            if prefix in cb_df.loc[i]["Company"]:
                cb_df.at[i,'Private Placement'] = True
                cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip()
        for end in [" CAP", " INC", " FDG", " CORP", " CO", " LLC", " CR", " SR", " A/S", " LP", " ASA", " LTD", ]:
            if end in cb_df.at[i, "Company"]:
                cb_df.at[i, "Company"] = cb_df.at[i, 'Company'].split(end)[0].strip()+" "+end
        for token in ['%']:
            if token in cb_df.at[i, "Company"]:
                # get everything before the token, then get everything before the last space
                cb_df.at[i, "Company"] = cb_df.at[i, "Company"].split(token)[0].rsplit(' ', 1)[0].strip()
        for key, value in COMPANY_NAMES.items():
            if key in cb_df.at[i, "Company"]:
                cb_df.at[i, "Company"] = value
        company_names.add(cb_df.at[i, "Company"])
    else:
        cb_df.drop(i, axis=0)


## Get Company Tickers
# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063
def lookup_ticker(company_name):
    yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    params = {"q": company_name, "quotes_count": 1, "country": "United States"}

    res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})
    data = res.json()

    ticker = data['quotes'][0]['symbol']
    return ticker

def get_ticker(name):
    try:
        # try to get the ticker
        ticker = lookup_ticker(name)
    except:
        try:
            # shorten the name and try again
            short_name = name.split(' ')[0]
            ticker = lookup_ticker(short_name)
        except:
            # no ticker could be found, probably a private company, check by hand to make sure
            ticker = 'NO_TICKER_FOUND'
    return (name, ticker)

company_name_to_ticker = dict()
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_ticker, name) for name in company_names]
    for future in concurrent.futures.as_completed(futures):
        name, ticker = future.result()
        company_name_to_ticker[name] = ticker

## Match Company Name to Ticker in Dataframe
for i in cb_df.index:
    try:
        cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']]
    except:
        assert cb_df.at[i,'Company'] == 'Corporate Bonds', f"Expected Cororate Bonds, got {cb_df.at[i,'Company']}"

## Get Company Info
def get_info_from_ticker(ticker):
    search_results = yf.Tickers(ticker)
    return search_results.tickers[ticker].info

def get_info(name):
    try:
        ticker = company_name_to_ticker[name]
        info = get_info_from_ticker(ticker)
    except:
        info = 'No Info Found'
    return (name, info)

## use parallelization to speed up this process
company_info_dict = dict()
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_info, name) for name in company_names]
    for future in concurrent.futures.as_completed(futures):
        name, info = future.result()
        company_info_dict[name] = info

"""
Output Results to File
"""
cb_df.to_csv(output_file_path)
print(f"Investment Processing are Done, Saving to File {output_file_path}")


"""
Investments Meta Information Output
"""
print("""Corporate Bond Totals""")
print("Cost Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Cost Value"]))
print("Market Value\t",'${:,.2f}'.format(cb_df.sum(numeric_only=True)["Market Value"]))