diff options
Diffstat (limited to 'scripts-2023')
| -rw-r--r-- | scripts-2023/.DS_Store | bin | 0 -> 6148 bytes | |||
| -rw-r--r-- | scripts-2023/01-clean.ipynb | 225 | ||||
| -rw-r--r-- | scripts-2023/02-populate-tickers.ipynb | 384 | ||||
| -rw-r--r-- | scripts-2023/README.md | 10 |
4 files changed, 619 insertions, 0 deletions
diff --git a/scripts-2023/.DS_Store b/scripts-2023/.DS_Store Binary files differnew file mode 100644 index 0000000..91f6d6d --- /dev/null +++ b/scripts-2023/.DS_Store diff --git a/scripts-2023/01-clean.ipynb b/scripts-2023/01-clean.ipynb new file mode 100644 index 0000000..5c6ae37 --- /dev/null +++ b/scripts-2023/01-clean.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "76f1b610", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ef6781e", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the CSV file into a DataFrame\n", + "df = pd.read_csv('../data-2023/in/23-138.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de6cb72a", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename the column names\n", + "df = df.rename(columns={\n", + " 'University of Illinois System - Investment Office': 'Name',\n", + " 'Unnamed: 1': 'Coupon',\n", + " 'Unnamed: 2': 'Maturity Date',\n", + " 'Unnamed: 3': 'Quantity',\n", + " 'Unnamed: 4': 'Cost Val',\n", + " 'Unnamed: 5': 'Market Val'\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d909ab8", + "metadata": {}, + "outputs": [], + "source": [ + "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n", + "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n", + "df.reset_index(drop=True, inplace=True)\n", + "\n", + "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n", + "# because Agency Funds are unimportant to us.\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "383efe00", + "metadata": {}, + "outputs": [], + "source": [ + "# Asset Types are header rows in Account or Security column \n", + "def is_asset_type(name):\n", + " investment_types = [\n", + " 'Asset-Backed 
Securities',\n", + " 'Commercial Mortgage Backed Securities',\n", + " 'Corporate Bonds',\n", + " 'Municipal Bonds',\n", + " 'U.S. Agency Bonds',\n", + " 'International Government Bonds',\n", + " 'U.S. Treasury Securities',\n", + " 'Cash & Cash Equivalents',\n", + " ]\n", + " return name in investment_types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd54c34e", + "metadata": {}, + "outputs": [], + "source": [ + "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n", + "insert_position = df.columns.get_loc('Name') + 1\n", + "\n", + "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n", + "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n", + "\n", + "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n", + "df['Asset Type'] = df['Asset Type'].ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b1b000b", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n", + "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f774d484", + "metadata": {}, + "outputs": [], + "source": [ + "# Alright! 
Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n", + "\n", + "# Drop any rows with empty or null values\n", + "df = df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344a39dd", + "metadata": {}, + "outputs": [], + "source": [ + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5766ff23", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n", + "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06801e7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Next, let's clean unruly typed data. Investments should obey the schema :\n", + "# Account or Security (string)\n", + "# Coupon (Date format mm/dd/yyyy)\n", + "# Maturity Date (double) \n", + "# Quantity (double) \n", + "# Cost Val (double) \n", + "# Market Val (double)\n", + "\n", + "cnt_old_entries = len(df)\n", + "\n", + "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n", + "print(df.iloc[135])\n", + "\n", + "# Convert 'Maturity Date' to datetime format\n", + "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n", + "\n", + "\n", + "# Clean commas from numeric column values using regular expressions\n", + "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n", + "\n", + "for col in numeric_columns:\n", + " # clean commas to register values as numbers \n", + " df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n", + "\n", + "# Now, the unruly typed data becomes NaN or NaT\n", + "print(df.iloc[135])\n", + "\n", + "# Drop all unruly typed data \n", + "df = df.dropna()\n", + "\n", + "cnt_new_entries = len(df)\n", + "\n", + "print(f\"\\nSuccessfully 
cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a654e0", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n", + "df.to_csv('../data-2023/out/df_data_typed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bef9be05", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts-2023/02-populate-tickers.ipynb b/scripts-2023/02-populate-tickers.ipynb new file mode 100644 index 0000000..cef727a --- /dev/null +++ b/scripts-2023/02-populate-tickers.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ea6b210e", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import json\n", + "import re\n", + "import yfinance as yf\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50c15419", + "metadata": {}, + "outputs": [], + "source": [ + "# reload the CSV file into a DataFrame\n", + "df = pd.read_csv('../data-2023/out/df_data_typed.csv')\n", + "\n", + "# Reset the index to remove the 'Unnamed: 0' column from the DataFrame\n", + "df.reset_index(drop=True, inplace=True)\n", + "\n", + "# Add Year attribute\n", + "year = 2023\n", + "df['Year']= 2023" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb83f2b", + "metadata": {}, + "outputs": [], + "source": [ + "# filter 
only corporate bonds\n", + "df = df[df['Asset Type'] == 'Corporate Bonds']\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fe32a9d", + "metadata": {}, + "outputs": [], + "source": [ + "# sanity check uncleaned\n", + "for idx, name in enumerate(df['Name'], start=1):\n", + " print(f\"{idx}: {name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3876447c", + "metadata": {}, + "outputs": [], + "source": [ + "# TRIM UNNECESSARY PREFIXES\n", + "\n", + "# trim PVTPL, PFS, PASS, THRU, CMO, COML, MTG, SR, TR, SER, DUE, POOL, \n", + "df['Name'].replace( { r\"\\b(PVPTL|PVTPL|PVTPL1|PVYPL|PFS|PVT|PL|ESC)\\b\" : '' }, inplace= True, regex = True)\n", + "# trim words with hyphens \n", + "df['Name'].replace( { r'(\\d+(\\.\\d+)?%)(.*)' : '' }, inplace= True, regex = True)\n", + "# trim unnec suffixes \n", + "df['Name'].replace( { r\"\\b(INC|)\\b\" : '' }, inplace= True, regex = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99aed56a", + "metadata": {}, + "outputs": [], + "source": [ + "# here's them cleaned\n", + "for idx, name in enumerate(df['Name'], start=1):\n", + " print(f\"{idx}: {name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ba8784e", + "metadata": {}, + "outputs": [], + "source": [ + "# Add Tickers\n", + "df['Ticker'] = str('NaN')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c0f7585", + "metadata": {}, + "outputs": [], + "source": [ + "# WARNING: this only works for this dataset! 
the following tickers were hand-annotated and crosslisted with \n", + "# the original investments\n", + "company_dict = {\n", + " \"VMWARE\": \"VMW\",\n", + " \"ADVENTIST\": \"SDA\",\n", + " \"AQUARION\": \"ES\", # acquired by ES\n", + " \"ASCENSION\": \"ACE.NZ\",\n", + " \"COMMONSPIRIT\": \"US:US20268JAA16\",\n", + " \"DOWDUPONT\": \"DD\", \n", + " \"LACLEDE\": \"SPIR\",\n", + " \"LULWA\": \"DISSOLVED\",\n", + " \"ONCOR\": \"SRE\", # majority owner\n", + " \"PEACEHEALTH\": \"US:US70462GAA67\",\n", + " \"RABOBANK\": \"RABO.AS\",\n", + " \"GULFSTREAM\": \"GD\", # acquired by GD\n", + " \"PRICOA\": \"US:US74153XBH98\",\n", + " \"AVOLON\": \"AVOL\",\n", + " \"3.05%\": \"INVALID\",\n", + " \"LSEGA\": \"LSEG.L\", # london stock exchange?\n", + " \"GLAXOSMITHKLINE\": \"GSK\",\n", + " \"INTERCONTINENTAL\": \"IHG\",\n", + " \"JPMORGAN\": \"JPM\",\n", + " \"KANSAS\": \"EVRG\", # kansas city power now evergy\n", + " \"KLA\": \"KLAC\", # kla corp\n", + " \"LLOYDS\": \"LYG\",\n", + " \"NATWEST\": \"NWG\",\n", + " \"NXP\": \"NXPI\",\n", + " \"PEPSICO\": \"PEP\",\n", + " \"BAE\": \"BAESY\",\n", + " \"BMW\": \"BMWYY\",\n", + " \"KIA\": \"KRX\",\n", + " \"MACQUARIE\": \"MQG.AX\",\n", + " \"PACIFIC\": \"0593430D:US\", # not publically traded\n", + " \"VOLKSWAGEN\": \"VWAGY\",\n", + " \"S&P\": \"SPGI\",\n", + " \"TEXTRON\": \"TXT\",\n", + " \"COMCAST\": \"CMCSA\",\n", + " \"CES MU2 LLC\": \"???\",\n", + " \"CHEVRON\": \"CVX\",\n", + " \"CITIGROUP\": \"C\",\n", + " \"CITIZENS BK\": \"CFG\",\n", + " \"CONOCOPHILLIPS\": \"COP\",\n", + " \"DEERE JOHN\": \"DE\",\n", + " \"ECOLAB\": \"ECL\",\n", + " \"EMORY\": \"US:US29157TAD81\",\n", + " \"ENBRIDGE\": \"EMB\",\n", + " \"ENTERGY\": \"ETR\",\n", + " \"ERP\": \"???\",\n", + " \"FEDERAL RLTY INVT\": \"FRT\",\n", + " \"FISERV\": \"FI\",\n", + " \"FMC CORP\": \"FMC\",\n", + " \"GENERAL DYNAMICS\": \"GD\",\n", + " \"GENERAL MLS\": \"GIS\",\n", + " \"GENERAL MTRS\": \"GM\",\n", + " \"GEORGIA-PAC\": \"???\",\n", + " \"GETTY PAUL\": 
\"US374288AB41\",\n", + " \"GLOBAL PMTS\": \"GPN\",\n", + " \"HCP\": \"HCP\",\n", + " \"HEWLETT PACKARD\": \"HPQ\",\n", + " \"HYUNDAI\": \"HYMTF\",\n", + " \"KEY BANK\": \"KEY\",\n", + " \"MARTIN MARIETTA\": \"MLM\",\n", + " \"MITSUBISHI\": \"MSBHF\",\n", + " \"MONDELEZ INTERNATIONAL\": \"MDLZ\",\n", + " \"NATL BK CDA MEDIUM\": \"NTIOF\",\n", + " \"NOVARTIS\": \"NVS\",\n", + " \"ONE GAS\": \"OGS\",\n", + " \"PACCAR FINL\": \"PCAR\",\n", + " \"PARKER-HANNIFIN\": \"PH\",\n", + " \"SIMON PPTY\": \"SPG\",\n", + " \"WEINGARTEN\": \"WP4.F\",\n", + " \"AMERICAN HONDA\": \"HMC\",\n", + " \"BP\": \"BP\",\n", + " \"BPCE S A MEDIUM\": \"ENXTPA:BPCEGP\",\n", + " \"CANADIAN IMPERIAL\": \"CM\",\n", + " \"CARDINAL HEALTH\": \"CAH\",\n", + " \"CATERPILLAR\": \"CAT\",\n", + " \"COOPERATIEVE RABOBANK\": \"RABO.AS\",\n", + " \"TRANS-CDA\": \"TRP\",\n", + " \"BOSTON PPTYS\": \"BXP\",\n", + " \"BRISTOL MYERS SQUIBB\": \"BMY\",\n", + " \"CANADIAN NAT RES LTD\": \"CNQ\",\n", + " \"CANADIAN PAC RY\": \"CP\",\n", + " \"CAPITAL ONE\": \"COF\",\n", + " \"CENTERPOINT ENERGY\": \"CNP\",\n", + " \"CHARLES SCHWAB\": \"SCHW\",\n", + " \"CNH INDL\": \"CNHI\",\n", + " \"CONNECTICUT LIGHT & POWER\": \"CNTHO\",\n", + " \"DAIMLER TRUCKS\": \"DTRUY\",\n", + " \"DTE ELEC\": \"DTE\",\n", + " \"DUKE ENERGY\": \"DUK\",\n", + " \"DUKE RLTY\": \"DRE\",\n", + " \"EATON\": \"ETN\",\n", + " \"ENTERPRISE PRODS\": \"EPD\",\n", + " \"ERP OPER\": \"EPD\",\n", + " \"EXXON\": \"XOM\",\n", + " \"FIFTH THIRD\": \"FITB\",\n", + " \"GILEAD SCIENCES\": \"GILD\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e26ab487", + "metadata": {}, + "outputs": [], + "source": [ + "# # unpack company_dict from json file\n", + "# import json\n", + "\n", + "# # Opening JSON file\n", + "# company_dict = open('../data-2023/in/company_dict.json')\n", + " \n", + "# # returns JSON object as \n", + "# # a dictionary\n", + "# company_dict = json.load(company_dict)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "575c1340", + "metadata": {}, + "outputs": [], + "source": [ + "# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063\n", + "def getTicker(name):\n", + " try: \n", + " yfinance = \"https://query2.finance.yahoo.com/v1/finance/search\"\n", + " user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'\n", + " params = {\"q\": name, \"quotes_count\": 1, \"country\": \"United States\"}\n", + "\n", + " res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})\n", + " data = res.json()\n", + " ticker = data['quotes'][0]['symbol']\n", + " return ticker\n", + " except: \n", + " print(name)\n", + "\n", + "def getTickerWrapper(company_name):\n", + " words = company_name.split()\n", + "# print(words)\n", + " for i in range(len(words), 0, -1):\n", + " curr_name = ' '.join(words[:i])\n", + "# print(curr_name)\n", + " # check against company dict \n", + " if curr_name in company_dict:\n", + " return company_dict[curr_name]\n", + " ticker = getTicker(curr_name)\n", + " if ticker:\n", + " company_dict[curr_name] = ticker \n", + " return ticker \n", + " return \"Error\"" + ] + }, + { + "cell_type": "markdown", + "id": "7d949acf", + "metadata": {}, + "source": [ + "# How this works\n", + "The following three blocks are run iteratively in increments of 100 entries (i.e. df[:100], then df[101:200], etc.) for speed and human review. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5b47fc0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# 1st round - grab tickers for all in increments of 300 (0-300, 301-600, etc.)\n", + "for index, row in df[:100].iterrows():\n", + "# print(getTickerWrapper(row['Name']))\n", + "# row['Ticker'] = getTickerWrapper(row['Name'])\n", + " df.at[index, 'Ticker'] = getTickerWrapper(row['Name'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4809707", + "metadata": {}, + "outputs": [], + "source": [ + "# second round - fill any companies with Not Found in company_dict\n", + "for index, row in df[:100].iterrows():\n", + " if row['Ticker'] == \"Error\" and row['Ticker'] in company_dict:\n", + " df.at[index, 'Ticker'] = company_dict[row['Ticker']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4969275", + "metadata": {}, + "outputs": [], + "source": [ + "df[:100]" + ] + }, + { + "cell_type": "markdown", + "id": "c5c54ea9", + "metadata": {}, + "source": [ + "# End of Iterative Script\n", + "At this point, we are operating under the assumption that all increments have been run, i.e. the entire dataframe's tickers have been generated. Only run the next block once this is true, otherwise you will rewrite the entire existing dataframe CSV file and all unpopulated ticker values will be NaN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8cc57ca", + "metadata": {}, + "outputs": [], + "source": [ + "# At this point, inspect the file to see if there are any incorrect / erroneous tickers.\n", + "df.to_csv(\"../data-2023/out/df_with_ticker_0_to_100.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b8fc2ec", + "metadata": {}, + "source": [ + "# Graveyard: Do Not Run After this Point" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03d1fdc9", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Convert and write JSON object to file\n", + "with open(\"company_dict.json\", \"w\") as outfile: \n", + " json.dump(company_dict, outfile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac5f3b94", + "metadata": {}, + "outputs": [], + "source": [ + "# drop final error and ??? rows\n", + "df = df[df['Ticker'] != ]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts-2023/README.md b/scripts-2023/README.md new file mode 100644 index 0000000..0a29c18 --- /dev/null +++ b/scripts-2023/README.md @@ -0,0 +1,10 @@ +# Populating 2023 Data into a CSV + +This is the (crude) first draft of documentation for the 2023 data pipeline! + +How this works: + +- All the input/output dataframes that the scripts operate with are located in the directory data-2023/in/ and data-2023/out/. +- To follow the pipeline, drag/drop the new year's investment portfolio .CSV file into data-2023/in/. +- Follow 01-clean.ipynb and 02-populate-tickers.ipynb respectively. 
+- The final dataframe will be populated as 'data-2023/out/df_with_ticker_....csv'. Please replicate this process and compare your results with 'data-2023/out/master_df_with_ticker.csv', which is the final dataframe that I uploaded to MongoDB. |
