Add 2023 Scripts and Data (#6)HEAD main

* Add 2023 Scripts and Data * Update README.md
author: eeshabarua <61480751+eeshabarua@users.noreply.github.com> 2025-03-12 22:47:09 -0400
committer: GitHub <noreply@github.com> 2025-03-12 21:47:09 -0500
commit: f612a123f135f93da662ca0543998dc90a0c50d6 (patch)
tree: ccd8fa57d72e983c02a50caabe6121eb6c2e373c /scripts-2023/02-populate-tickers.ipynb
parent: 87472cbe8f20612b21f26f8b1a8e8f3c2d55163d (diff)
1 files changed, 384 insertions, 0 deletions
diff --git a/scripts-2023/02-populate-tickers.ipynb b/scripts-2023/02-populate-tickers.ipynb
new file mode 100644
index 0000000..cef727a
--- /dev/null
+++ b/scripts-2023/02-populate-tickers.ipynb
@@ -0,0 +1,384 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea6b210e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import json\n",
+    "import re\n",
+    "import yfinance as yf\n",
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50c15419",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reload the CSV file into a DataFrame\n",
+    "df = pd.read_csv('../data-2023/out/df_data_typed.csv')\n",
+    "\n",
+    "# Reset the index to remove the 'Unnamed: 0' column from the DataFrame\n",
+    "df.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "# Add Year attribute\n",
+    "year = 2023\n",
+    "df['Year']= 2023"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cb83f2b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# filter only corporate bonds\n",
+    "df = df[df['Asset Type'] == 'Corporate Bonds']\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fe32a9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sanity check uncleaned\n",
+    "for idx, name in enumerate(df['Name'], start=1):\n",
+    "    print(f\"{idx}: {name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3876447c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TRIM UNNECESSARY PREFIXES\n",
+    "\n",
+    "# trim PVTPL, PFS, PASS, THRU, CMO, COML, MTG, SR, TR, SER, DUE, POOL, \n",
+    "df['Name'].replace( { r\"\\b(PVPTL|PVTPL|PVTPL1|PVYPL|PFS|PVT|PL|ESC)\\b\" : '' }, inplace= True, regex = True)\n",
+    "# trim words with hyphens \n",
+    "df['Name'].replace( { r'(\\d+(\\.\\d+)?%)(.*)' : '' }, inplace= True, regex = True)\n",
+    "# trim unnec suffixes \n",
+    "df['Name'].replace( { r\"\\b(INC|)\\b\" : '' }, inplace= True, regex = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99aed56a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# here's them cleaned\n",
+    "for idx, name in enumerate(df['Name'], start=1):\n",
+    "    print(f\"{idx}: {name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ba8784e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add Tickers\n",
+    "df['Ticker'] = str('NaN')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c0f7585",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# WARNING: this only works for this dataset! the following tickers were hand-annotated and crosslisted with \n",
+    "# the original investments\n",
+    "company_dict = {\n",
+    "    \"VMWARE\": \"VMW\",\n",
+    "    \"ADVENTIST\": \"SDA\",\n",
+    "    \"AQUARION\": \"ES\", # acquired by ES\n",
+    "    \"ASCENSION\": \"ACE.NZ\",\n",
+    "    \"COMMONSPIRIT\": \"US:US20268JAA16\",\n",
+    "    \"DOWDUPONT\": \"DD\", \n",
+    "    \"LACLEDE\": \"SPIR\",\n",
+    "    \"LULWA\": \"DISSOLVED\",\n",
+    "    \"ONCOR\": \"SRE\", # majority owner\n",
+    "    \"PEACEHEALTH\": \"US:US70462GAA67\",\n",
+    "    \"RABOBANK\": \"RABO.AS\",\n",
+    "    \"GULFSTREAM\": \"GD\", # acquired by GD\n",
+    "    \"PRICOA\": \"US:US74153XBH98\",\n",
+    "    \"AVOLON\": \"AVOL\",\n",
+    "    \"3.05%\": \"INVALID\",\n",
+    "    \"LSEGA\": \"LSEG.L\", # london stock exchange?\n",
+    "    \"GLAXOSMITHKLINE\": \"GSK\",\n",
+    "    \"INTERCONTINENTAL\": \"IHG\",\n",
+    "    \"JPMORGAN\": \"JPM\",\n",
+    "    \"KANSAS\": \"EVRG\", # kansas city power now evergy\n",
+    "    \"KLA\": \"KLAC\", # kla corp\n",
+    "    \"LLOYDS\": \"LYG\",\n",
+    "    \"NATWEST\": \"NWG\",\n",
+    "    \"NXP\": \"NXPI\",\n",
+    "    \"PEPSICO\": \"PEP\",\n",
+    "    \"BAE\": \"BAESY\",\n",
+    "    \"BMW\": \"BMWYY\",\n",
+    "    \"KIA\": \"KRX\",\n",
+    "    \"MACQUARIE\": \"MQG.AX\",\n",
+    "    \"PACIFIC\": \"0593430D:US\", # not publically traded\n",
+    "    \"VOLKSWAGEN\": \"VWAGY\",\n",
+    "    \"S&P\": \"SPGI\",\n",
+    "    \"TEXTRON\": \"TXT\",\n",
+    "    \"COMCAST\": \"CMCSA\",\n",
+    "    \"CES MU2 LLC\": \"???\",\n",
+    "    \"CHEVRON\": \"CVX\",\n",
+    "    \"CITIGROUP\": \"C\",\n",
+    "    \"CITIZENS BK\": \"CFG\",\n",
+    "    \"CONOCOPHILLIPS\": \"COP\",\n",
+    "    \"DEERE JOHN\": \"DE\",\n",
+    "    \"ECOLAB\": \"ECL\",\n",
+    "    \"EMORY\": \"US:US29157TAD81\",\n",
+    "    \"ENBRIDGE\": \"EMB\",\n",
+    "    \"ENTERGY\": \"ETR\",\n",
+    "    \"ERP\": \"???\",\n",
+    "    \"FEDERAL RLTY INVT\": \"FRT\",\n",
+    "    \"FISERV\": \"FI\",\n",
+    "    \"FMC CORP\": \"FMC\",\n",
+    "    \"GENERAL DYNAMICS\": \"GD\",\n",
+    "    \"GENERAL MLS\": \"GIS\",\n",
+    "    \"GENERAL MTRS\": \"GM\",\n",
+    "    \"GEORGIA-PAC\": \"???\",\n",
+    "    \"GETTY PAUL\": \"US374288AB41\",\n",
+    "    \"GLOBAL PMTS\": \"GPN\",\n",
+    "    \"HCP\": \"HCP\",\n",
+    "    \"HEWLETT PACKARD\": \"HPQ\",\n",
+    "    \"HYUNDAI\": \"HYMTF\",\n",
+    "    \"KEY BANK\": \"KEY\",\n",
+    "    \"MARTIN MARIETTA\": \"MLM\",\n",
+    "    \"MITSUBISHI\": \"MSBHF\",\n",
+    "    \"MONDELEZ INTERNATIONAL\": \"MDLZ\",\n",
+    "    \"NATL BK CDA MEDIUM\": \"NTIOF\",\n",
+    "    \"NOVARTIS\": \"NVS\",\n",
+    "    \"ONE GAS\": \"OGS\",\n",
+    "    \"PACCAR FINL\": \"PCAR\",\n",
+    "    \"PARKER-HANNIFIN\": \"PH\",\n",
+    "    \"SIMON PPTY\": \"SPG\",\n",
+    "    \"WEINGARTEN\": \"WP4.F\",\n",
+    "    \"AMERICAN HONDA\": \"HMC\",\n",
+    "    \"BP\": \"BP\",\n",
+    "    \"BPCE S A MEDIUM\": \"ENXTPA:BPCEGP\",\n",
+    "    \"CANADIAN IMPERIAL\": \"CM\",\n",
+    "    \"CARDINAL HEALTH\": \"CAH\",\n",
+    "    \"CATERPILLAR\": \"CAT\",\n",
+    "    \"COOPERATIEVE RABOBANK\": \"RABO.AS\",\n",
+    "    \"TRANS-CDA\": \"TRP\",\n",
+    "    \"BOSTON PPTYS\": \"BXP\",\n",
+    "    \"BRISTOL MYERS SQUIBB\": \"BMY\",\n",
+    "    \"CANADIAN NAT RES LTD\": \"CNQ\",\n",
+    "    \"CANADIAN PAC RY\": \"CP\",\n",
+    "    \"CAPITAL ONE\": \"COF\",\n",
+    "    \"CENTERPOINT ENERGY\": \"CNP\",\n",
+    "    \"CHARLES SCHWAB\": \"SCHW\",\n",
+    "    \"CNH INDL\": \"CNHI\",\n",
+    "    \"CONNECTICUT LIGHT & POWER\": \"CNTHO\",\n",
+    "    \"DAIMLER TRUCKS\": \"DTRUY\",\n",
+    "    \"DTE ELEC\": \"DTE\",\n",
+    "    \"DUKE ENERGY\": \"DUK\",\n",
+    "    \"DUKE RLTY\": \"DRE\",\n",
+    "    \"EATON\": \"ETN\",\n",
+    "    \"ENTERPRISE PRODS\": \"EPD\",\n",
+    "    \"ERP OPER\": \"EPD\",\n",
+    "    \"EXXON\": \"XOM\",\n",
+    "    \"FIFTH THIRD\": \"FITB\",\n",
+    "    \"GILEAD SCIENCES\": \"GILD\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e26ab487",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # unpack company_dict from json file\n",
+    "# import json\n",
+    "\n",
+    "# # Opening JSON file\n",
+    "# company_dict = open('../data-2023/in/company_dict.json')\n",
+    " \n",
+    "# # returns JSON object as \n",
+    "# # a dictionary\n",
+    "# company_dict = json.load(company_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "575c1340",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# taken from https://gist.github.com/bruhbruhroblox/dd9d981c8c37983f61e423a45085e063\n",
+    "def getTicker(name):\n",
+    "    try: \n",
+    "        yfinance = \"https://query2.finance.yahoo.com/v1/finance/search\"\n",
+    "        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'\n",
+    "        params = {\"q\": name, \"quotes_count\": 1, \"country\": \"United States\"}\n",
+    "\n",
+    "        res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent})\n",
+    "        data = res.json()\n",
+    "        ticker = data['quotes'][0]['symbol']\n",
+    "        return ticker\n",
+    "    except: \n",
+    "        print(name)\n",
+    "\n",
+    "def getTickerWrapper(company_name):\n",
+    "    words = company_name.split()\n",
+    "#     print(words)\n",
+    "    for i in range(len(words), 0, -1):\n",
+    "        curr_name = ' '.join(words[:i])\n",
+    "#         print(curr_name)\n",
+    "        # check against company dict \n",
+    "        if curr_name in company_dict:\n",
+    "            return company_dict[curr_name]\n",
+    "        ticker = getTicker(curr_name)\n",
+    "        if ticker:\n",
+    "            company_dict[curr_name] = ticker \n",
+    "            return ticker \n",
+    "    return \"Error\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7d949acf",
+   "metadata": {},
+   "source": [
+    "# How this works\n",
+    "The following three blocks are run iteratively in increments of 100 entries (i.e. df[:100], then df[101:200], etc.) for speed and human review. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5b47fc0",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# 1st round - grab tickers for all in increments of 300 (0-300, 301-600, etc.)\n",
+    "for index, row in df[:100].iterrows():\n",
+    "#     print(getTickerWrapper(row['Name']))\n",
+    "#     row['Ticker'] = getTickerWrapper(row['Name'])\n",
+    "    df.at[index, 'Ticker'] = getTickerWrapper(row['Name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4809707",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# second round - fill any companies with Not Found in company_dict\n",
+    "for index, row in df[:100].iterrows():\n",
+    "    if row['Ticker'] == \"Error\" and row['Ticker'] in company_dict:\n",
+    "        df.at[index, 'Ticker'] = company_dict[row['Ticker']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4969275",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[:100]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5c54ea9",
+   "metadata": {},
+   "source": [
+    "# End of Iterative Script\n",
+    "At this point, we are operating under the assumption that all increments have been run, i.e. the entire dataframe's tickers have been generated. Only run the next block once this is true, otherwise you will rewrite the entire existing dataframe CSV file and all unpopulated ticker values will be NaN."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8cc57ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# At this point, inspect the file to see if there are any incorrect / erroneous tickers.\n",
+    "df.to_csv(\"../data-2023/out/df_with_ticker_0_to_100.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b8fc2ec",
+   "metadata": {},
+   "source": [
+    "# Graveyard: Do Not Run After this Point"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03d1fdc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Convert and write JSON object to file\n",
+    "with open(\"company_dict.json\", \"w\") as outfile: \n",
+    "    json.dump(company_dict, outfile)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac5f3b94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop final error and ??? rows\n",
+    "df = df[df['Ticker'] != ]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
author	eeshabarua <61480751+eeshabarua@users.noreply.github.com>	2025-03-12 22:47:09 -0400
committer	GitHub <noreply@github.com>	2025-03-12 21:47:09 -0500
commit	f612a123f135f93da662ca0543998dc90a0c50d6 (patch)
tree	ccd8fa57d72e983c02a50caabe6121eb6c2e373c /scripts-2023/02-populate-tickers.ipynb
parent	87472cbe8f20612b21f26f8b1a8e8f3c2d55163d (diff)