diff options
| author | Gabriel Kosmacher <73120774+kennykos@users.noreply.github.com> | 2023-02-15 16:12:31 -0600 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-02-15 16:12:31 -0600 |
| commit | 28c1eff12954a8c24d88fa4df036bd7221c919f3 (patch) | |
| tree | 9f0e68432200809698b0ec230d2125c309204f57 /ui_security_inventory_23_parsing.ipynb | |
| parent | eb4132862f19c157289f126e5a071863ebeedc2c (diff) | |
| parent | b6b578249e447614f7a9e07c7b2343e625d21101 (diff) | |
Merge pull request #1 from kennykos/main
Adding scripts to read & clean University of Illinois Investment Data, get yahoo! info for invested companies
Diffstat (limited to 'ui_security_inventory_23_parsing.ipynb')
| -rw-r--r-- | ui_security_inventory_23_parsing.ipynb | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/ui_security_inventory_23_parsing.ipynb b/ui_security_inventory_23_parsing.ipynb new file mode 100644 index 0000000..297203f --- /dev/null +++ b/ui_security_inventory_23_parsing.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "8ce6b023", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import yfinance as yf\n", + "import requests\n", + "import concurrent.futures\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8304939a", + "metadata": {}, + "outputs": [], + "source": [ + "file_path_23 = \"./data/23-138.Records.xlsx\"\n", + " # skip header rows so column names align, drop all NaN rows\n", + "df = pd.read_excel(file_path_23, parse_dates=True, skiprows=6).dropna(how='all')" + ] + }, + { + "cell_type": "markdown", + "id": "538233a8", + "metadata": {}, + "source": [ + "#### Setting up the Operating Pool DataFrame " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96dee5bc", + "metadata": {}, + "outputs": [], + "source": [ + "bank_i = df[df['Account or Security'].str.contains(\"9-200100\", na=False)].index\n", + "op_i = df[df['Account or Security'].str.contains(\"Operating Funds Pool\", na=False)].index\n", + "op_df = df.loc[bank_i[0]:op_i[1]-1]\n", + "op_df.insert(6, 'Bank', pd.NA)\n", + "op_df.insert(7, 'Asset Type', pd.NA)\n", + "op_df.insert(8, 'Company', pd.NA)\n", + "op_df.insert(9, 'Industry', pd.NA)\n", + "op_df.insert(10, 'Private Placement', False)\n", + "op_df.insert(11, 'Ticker', pd.NA)\n", + "op_df.insert(11, 'Info', object)\n", + "# op_df = op_df.insert(7, 'Bank', pd.NA)\n", + "op_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cc1f037f", + "metadata": {}, + "source": [ + "#### Add the Bank and Asset Type " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29c95d09", + "metadata": {}, + "outputs": [], + "source": [ + "bank_name = pd.NA\n", + "asset_type = pd.NA\n", + "for i in op_df.index:\n", + " if np.isnan(op_df.loc[i][\"Quantity\"]):\n", + " if \"9-200100\" in df.loc[i][\"Account or Security\"]:\n", + " bank_name = df.loc[i][\"Account or Security\"]\n", + " else:\n", + " asset_type = df.loc[i][\"Account or Security\"]\n", + " op_df.at[i,'Bank'] = bank_name\n", + " op_df.at[i,'Asset Type'] = asset_type\n", + "op_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ea40a3e", + "metadata": {}, + "outputs": [], + "source": [ + "all_asset_types = set([op_df.loc[i]['Asset Type'] for i in op_df.index]);all_asset_types" + ] + }, + { + "cell_type": "markdown", + "id": "274d0b39", + "metadata": {}, + "source": [ + "#### First, we just check the corperate bonds " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe124c92", + "metadata": {}, + "outputs": [], + "source": [ + "cb_df = op_df[op_df['Asset Type'].str.contains(\"Corporate Bonds\", na=False)]\n", + "print(\"Corperate Bond Totals\")\n", + "print(\"Cost Value\\t\",'${:,.2f}'.format(cb_df.sum(numeric_only=True)[\"Cost Value\"]))\n", + "print(\"Market Value\\t\",'${:,.2f}'.format(cb_df.sum(numeric_only=True)[\"Market Value\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "d4aba4d7", + "metadata": {}, + "source": [ + "Yup, you read that right" + ] + }, + { + "cell_type": "markdown", + "id": "9879b032", + "metadata": {}, + "source": [ + "##### get company name\n", + "+ Parse our 'PVTPL', which is an abreviation for privatly placed https://www.investopedia.com/terms/p/privateplacement.asp\n", + "+ Remove everything after and including the tokens ```['%']```\n", + "+ Remove everything after `[\" CAP\", \" INC\", \" FDG\", \" CORP\", \" CO\", \" LLC\", \" CR\"]`\n", + "+ Add the company names to a set\n", + "+ Map different semantic names to the same syntax for the same company" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d78a9733", + "metadata": {}, + "outputs": [], + "source": [ + "company_name_dict = {\n", + " \"AMERICAN EXPRESS\" : \"AMERICAN EXPRESS CO\",\n", + " 'AIG GLOBAL' : \"AMERICAN INTL GROUP INC\",\n", + " \"ANHEUSER-BUSCH\" : \"ANHEUSER-BUSCH CO\",\n", + " \"APTIV\" : \"APTIV CO\",\n", + " \"ASTRAZENECA\" : \"ASTRAZENECA PLC\",\n", + " \"AUSTRALIA & NEW\" : \"AUSTRALIA & NEW ZEALAND BKG GR\",\n", + " \"BAE SYS\" : \"BAE SYS PLC\",\n", + " \"BANCO SANTANDER\" : \"BANCO SANTANDER SA\",\n", + " \"BANK MONTREAL\" : \"BANK OF MONTREAL\",\n", + " \"BANK OF MONTREAL\" : \"BANK OF MONTREAL\",\n", + " \"BK MONTREAL\" : \"BANK OF MONTREAL\",\n", + " \"BANK NOVA SCOTIA\" : \"BANK OF NOVA SCOTIA\",\n", + " \"BANK OF NOVA SCOTIA\" : \"BANK OF NOVA SCOTIA\",\n", + " \"BANK AMER\" : \"BANK OF AMERICA CO\",\n", + " \"BAXTER INT\" : \"BAXTER INTERNATIONAL INC\",\n", + " \"BAYER US FIN\" : \"BAYER US FINANCE LLC\",\n", + " \"BB&T\" : \"BB&T CO\",\n", + " \"BLACKSTONE\" : \"BLACKSTONE\",\n", + " \"BMW\" : \"BMW\",\n", + " \"BNP PARIBAS\" : \"BNP PARIBAS\",\n", + " \"BRIGHTHOUSE\" : \"BRIGHTHOUSE\",\n", + " \"BRISTOL MYERS SQUIBB\" : \"BRISTOL MYERS SQUIBB CO\",\n", + " \"BRISTOL-MYERS SQUIBB\" : \"BRISTOL MYERS SQUIBB CO\",\n", + " \"CANADIAN IMPERIAL BK\" : \"CANADIAN IMPERIAL BK CO\",\n", + " \"CAPITAL ONE\" : \"CAPITAL ONE FINL CO\",\n", + " \"CATERPILLAR FINL\" : \"CATERPILLAR FINL\",\n", + " \"CENTERPOINT ENERGY\" : \"CENTERPOINT ENERGY INC\",\n", + " \"CHEVRON U S A\" : \"CHEVRON CO\",\n", + " \"CREDIT AGRICOLE\" : \"CREDIT AGRICOLE\",\n", + " \"CREDIT SUISSE\" : \"CREDIT SUISSE GROUP AG\",\n", + " \"CROWN CASTLE INTL\" : \"CROWN CASTLE INTL\",\n", + " \"DAIMLER\" : \"DAIMLER\",\n", + " \"DELTA AIR LINES\" : \"DELTA AIR LINES\",\n", + " \"DTE E\" : \"DTE ELEC\",\n", + " \"DUKE ENERGY\" : \"DUKE ENERGY CO\",\n", + " \"DOWDUPONT INC\" : \"DUPONT DE NEMOURS INC\",\n", + " \"ENTERGY\" : \"ENTERGY CO\",\n", + " \"EQUITABLE FINL LIFE\" : \"EQUITABLE FINL LIFE GLOBAL FDG\",\n", + " \"ESC CB LEHMAN BROS\" : \"ESC LEHMAN BROTH HLD INC\",\n", + " \"FIFTH THIRD BANCORP\" : \"FIFTH THIRD BANCORP\",\n", + " \"FLORIDA P\" : \"FLORIDA POWER & LIGHT CO\",\n", + " \"GENERAL MTRS\" : \"GENERAL MOTORS\",\n", + " \"GENERAL MOTORS\" : \"GENERAL MOTORS\",\n", + " \"HEWLETT PACKARD\" : \"HEWLETT PACKARD ENTERPRISE CO\",\n", + " \"HP INC\" : \"HEWLETT PACKARD ENTERPRISE CO\",\n", + " \"HUNTINGTON\" : \"HUNTINGTON NATL BK MD\",\n", + " \"JACKSON FINANCIAL INC\" : \"JACKSON NATIONAL LIFE GL\",\n", + " \"JPM CHASE\" : \"JPMORGAN CHASE & CO\",\n", + " \"KINDER MORGAN\" : \"KINDER MORGAN INC\",\n", + " \"LLOYDS BKG\" : \"LLOYDS BANKING GROUP PLC FORME\",\n", + " \"MACQUARIE\" : \"MACQUARIE BK LTD\",\n", + " \"MIZUHO\" : \"MIZUHO CO\",\n", + " \"MONDELEZ INT\" : \"MONDELEZ INTERNATIONAL INC\",\n", + " \"MORGAN STANLEY\" : \"MORGAN STANLEY\",\n", + " \"NATIONAL AUSTRALIA B\" : \"NATIONAL AUSTRALIA BANK\",\n", + " \"NATIONWIDE BLDG SOC\" : \"NATIONWIDE BLDG SOCIETY\",\n", + " \"NATIONAL BANK OF CANADA\" : \"NATIONAL BANK OF CANADA\",\n", + " \"NATL BK CDA\" : \"NATIONAL BANK OF CANADA\",\n", + " \"NATWEST M\" : \"NATWEST MARKETS PLC\",\n", + " \"NEXTERA ENERGY\" : \"NEXTERA ENERGY CAP\",\n", + " \"NORDEA BANK\" : \"NORDEA BANK\",\n", + " \"NORTHWESTERN\" : \"NORTHWESTERN MUT\",\n", + " \"NXP B V\" : \"NXP B V\",\n", + " \"PHILLIPS 66\" : \"PHILLIPS 66\",\n", + " \"PRINCIPAL LIFE GLOBAL\" : \"PRINCIPAL LIFE GLOBAL FDG\",\n", + " \"PROTECTIVE LIFE G\" : \"PROTECTIVE LIFE GLOBAL\",\n", + " \"PUBLIC SVC\" : \"PUBLIC SERVICE ELECTRIC & GAS\",\n", + " \"RABOBANK NEDERLAND\" : \"RABOBANK NEDERLAND\",\n", + " \"ROCHE H\" : \"ROCHE HOLDINGS INC\",\n", + " \"ROPER \" : \"ROPER TECHNOLOGIES INC\",\n", + " \"ROYAL BANK OF CANADA\" : \"ROYAL BANK OF CANADA\",\n", + " \"ROYAL BK CDA\" : \"ROYAL BANK OF CANADA\",\n", + " \"SCHLUMBERGER\" : \"SCHLUMBERGER\", \n", + " \"SIEMENS FINANCIERINGSMAA\" : \"SIEMENS FINANCIERINGSMAATSCHAP\",\n", + " \"SIMON PPTY GROUP\" : \"SIMON PPTY GROUP\",\n", + " \"STATE STR\" : \"STATE STREET CO\",\n", + " \"SUMITOMO MITSUI\" : \"SUMITOMO MITSUI BANKING\",\n", + " \"SWEDBANK AB\" : \"SWEDBANK AB\",\n", + " \"TORONTO DOMINION B\" : \"TORONTO DOMINION BANK\",\n", + " \"TOYOTA M\" : \"TOYOTA MOTOR\",\n", + " \"TRANS-CDA PIPELINES\" : \"TRANS-CDA PIPELINES\",\n", + " \"TRUIST \" : \"TRUIST BANK\",\n", + " \"TSMC \" : \"TSMC GLOBAL LTD\",\n", + " \"U S B\" : \"U S BANCORP\",\n", + " \"UBS \" : \"UBS AG LONDON\",\n", + " \"VENTAS REALTY\" : \"VENTAS REALTY LP\",\n", + " \"VOLKSWAGEN GROUP\" : \"VOLKSWAGEN GROUP\",\n", + " \"WESTPAC B\" : \"WESTPAC BANKING\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "659c0ba7", + "metadata": {}, + "outputs": [], + "source": [ + "company_names = set()\n", + "for i in cb_df.index:\n", + " # Set the Company to be cleaned\n", + " cb_df.at[i,'Company'] = cb_df.at[i,'Account or Security']\n", + " if not np.isnan(cb_df.at[i,'Quantity']):\n", + " # clean private placement\n", + " for prefix in [\"PVTPL\", \"PVPTL\", \"PVYPL\", \"PVT PL\", \"PVPTL\"]:\n", + " if prefix in cb_df.loc[i][\"Company\"]:\n", + " cb_df.at[i,'Private Placement'] = True\n", + " cb_df.at[i,'Company'] = cb_df.at[i,'Company'][6:].strip()\n", + " for end in [\" CAP\", \" INC\", \" FDG\", \" CORP\", \" CO\", \" LLC\", \" CR\", \" SR\", \" A/S\", \" LP\", \" ASA\", \" LTD\", ]:\n", + " if end in cb_df.at[i, \"Company\"]:\n", + " cb_df.at[i, \"Company\"] = cb_df.at[i, 'Company'].split(end)[0].strip()+\" \"+end\n", + " for token in ['%']:\n", + " if token in cb_df.at[i, \"Company\"]:\n", + " # get everythng before the token, then get eveything before the last space\n", + " cb_df.at[i, \"Company\"] = cb_df.at[i, \"Company\"].split(token)[0].rsplit(' ', 1)[0].strip()\n", + " for key, value in company_name_dict.items():\n", + " if key in cb_df.at[i, \"Company\"]:\n", + " cb_df.at[i, \"Company\"] = value\n", + " company_names.add(cb_df.at[i, \"Company\"])\n", + " else:\n", + " cb_df.drop(i, axis=0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e06dfe", + "metadata": {}, + "outputs": [], + "source": [ + "cb_df" + ] + }, + { + "cell_type": "markdown", + "id": "8b78d9cd", + "metadata": {}, + "source": [ + "##### get ticker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba37103f", + "metadata": {}, + "outputs": [], + "source": [ + "# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063\n", + "def getTicker(company_name):\n", + " yfinance = \"https://query2.finance.yahoo.com/v1/finance/search\"\n", + " user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'\n", + " params = {\"q\": company_name, \"quotes_count\": 1, \"country\": \"United States\"}\n", + "\n", + " res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})\n", + " data = res.json()\n", + "\n", + " company_code = data['quotes'][0]['symbol']\n", + " return company_code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc68a7ab", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "def get_ticker(name):\n", + " try:\n", + " # try to get the ticker\n", + " ticker = getTicker(name)\n", + " except:\n", + " try:\n", + " # shorten the name and try again\n", + " short_name = name.split(' ')[0]\n", + " ticker = getTicker(short_name)\n", + " except:\n", + " # no ticker could be found, probably a private company, check by hand to make sure\n", + " ticker = 'NO_TICKER_FOUND'\n", + " return (name, ticker)\n", + "\n", + "company_name_to_ticker = dict()\n", + "with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " futures = [executor.submit(get_ticker, name) for name in company_names]\n", + " for future in concurrent.futures.as_completed(futures):\n", + " name, ticker = future.result()\n", + " company_name_to_ticker[name] = ticker" + ] + }, + { + "cell_type": "markdown", + "id": "e3009168", + "metadata": {}, + "source": [ + "##### match company name to ticker in DF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8f64024", + "metadata": {}, + "outputs": [], + "source": [ + "for i in cb_df.index:\n", + " try:\n", + " cb_df.at[i,'Ticker'] = company_name_to_ticker[cb_df.at[i,'Company']]\n", + " except:\n", + " assert cb_df.at[i,'Company'] == 'Corporate Bonds', f\"Expected Cororate Bonds, got {cb_df.at[i,'Company']}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04e49491", + "metadata": {}, + "outputs": [], + "source": [ + "cb_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "2342f360", + "metadata": {}, + "source": [ + "## Get info from ticker " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fe3d0df", + "metadata": {}, + "outputs": [], + "source": [ + "def get_info_from_ticker(ticker):\n", + " # Search for the company on Yahoo Finance\n", + " search_results = yf.Tickers(ticker)\n", + " return search_results.tickers[ticker].info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ed7c317", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "def get_info(name):\n", + " try:\n", + " ticker = company_name_to_ticker[name]\n", + " info = get_info_from_ticker(ticker)\n", + " except:\n", + " info = 'No Info Found'\n", + " return (name, info)\n", + "\n", + "## use parallelization to speed up this process\n", + "company_info_dict = dict()\n", + "with concurrent.futures.ThreadPoolExecutor() as executor:\n", + " futures = [executor.submit(get_info, name) for name in company_names]\n", + " for future in concurrent.futures.as_completed(futures):\n", + " name, info = future.result()\n", + " company_info_dict[name] = info" + ] + }, + { + "cell_type": "markdown", + "id": "180e9785", + "metadata": {}, + "source": [ + "##### Link Info to Company, saved as a json dump in the dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570e70b3", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for i in cb_df.index:\n", + " if cb_df.at[i,'Company'] == 'Corporate Bonds':\n", + " continue\n", + " if company_info_dict[cb_df.at[i,'Company']] is None:\n", + " continue\n", + " info_dict = dict(company_info_dict[cb_df.at[i,'Company']])\n", + " json_str = json.dumps(my_dict)\n", + " cb_df.at[i,'Info'] = json_str\n", + "\n", + "# assert cb_df.at[i,'Company'] == 'Corporate Bonds', f\"Expected Cororate Bonds, got {cb_df.at[i,'Company']}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bd1606d", + "metadata": {}, + "outputs": [], + "source": [ + "cb_df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} |
