From f612a123f135f93da662ca0543998dc90a0c50d6 Mon Sep 17 00:00:00 2001 From: eeshabarua <61480751+eeshabarua@users.noreply.github.com> Date: Wed, 12 Mar 2025 22:47:09 -0400 Subject: Add 2023 Scripts and Data (#6) * Add 2023 Scripts and Data * Update README.md --- scripts-2023/01-clean.ipynb | 225 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 scripts-2023/01-clean.ipynb (limited to 'scripts-2023/01-clean.ipynb') diff --git a/scripts-2023/01-clean.ipynb b/scripts-2023/01-clean.ipynb new file mode 100644 index 0000000..5c6ae37 --- /dev/null +++ b/scripts-2023/01-clean.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "76f1b610", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ef6781e", + "metadata": {}, + "outputs": [], + "source": [ + "# Read the CSV file into a DataFrame\n", + "df = pd.read_csv('../data-2023/in/23-138.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de6cb72a", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename the column names\n", + "df = df.rename(columns={\n", + " 'University of Illinois System - Investment Office': 'Name',\n", + " 'Unnamed: 1': 'Coupon',\n", + " 'Unnamed: 2': 'Maturity Date',\n", + " 'Unnamed: 3': 'Quantity',\n", + " 'Unnamed: 4': 'Cost Val',\n", + " 'Unnamed: 5': 'Market Val'\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d909ab8", + "metadata": {}, + "outputs": [], + "source": [ + "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n", + "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n", + "df.reset_index(drop=True, inplace=True)\n", + "\n", + "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n", + "# because Agency Funds are unimportant to us.\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "383efe00", + "metadata": {}, + "outputs": [], + "source": [ + "# Asset Types are header rows in Account or Security column \n", + "def is_asset_type(name):\n", + " investment_types = [\n", + " 'Asset-Backed Securities',\n", + " 'Commercial Mortgage Backed Securities',\n", + " 'Corporate Bonds',\n", + " 'Municipal Bonds',\n", + " 'U.S. Agency Bonds',\n", + " 'International Government Bonds',\n", + " 'U.S. Treasury Securities',\n", + " 'Cash & Cash Equivalents',\n", + " ]\n", + " return name in investment_types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd54c34e", + "metadata": {}, + "outputs": [], + "source": [ + "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n", + "insert_position = df.columns.get_loc('Name') + 1\n", + "\n", + "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n", + "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n", + "\n", + "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n", + "df['Asset Type'] = df['Asset Type'].ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b1b000b", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n", + "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f774d484", + "metadata": {}, + "outputs": [], + "source": [ + "# Alright! Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n", + "\n", + "# Drop any rows with empty or null values\n", + "df = df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344a39dd", + "metadata": {}, + "outputs": [], + "source": [ + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5766ff23", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n", + "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06801e7f", + "metadata": {}, + "outputs": [], + "source": [ + "# Next, let's clean unruly typed data. Investments should obey the schema :\n", + "# Account or Security (string)\n", + "# Coupon (Date format mm/dd/yyyy)\n", + "# Maturity Date (double) \n", + "# Quantity (double) \n", + "# Cost Val (double) \n", + "# Market Val (double)\n", + "\n", + "cnt_old_entries = len(df)\n", + "\n", + "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n", + "print(df.iloc[135])\n", + "\n", + "# Convert 'Maturity Date' to datetime format\n", + "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n", + "\n", + "\n", + "# Clean commas from numeric column values using regular expressions\n", + "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n", + "\n", + "for col in numeric_columns:\n", + " # clean commas to register values as numbers \n", + " df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n", + "\n", + "# Now, the unruly typed data becomes NaN or NaT\n", + "print(df.iloc[135])\n", + "\n", + "# Drop all unruly typed data \n", + "df = df.dropna()\n", + "\n", + "cnt_new_entries = len(df)\n", + "\n", + "print(f\"\\nSuccessfully cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a654e0", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n", + "df.to_csv('../data-2023/out/df_data_typed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bef9be05", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- cgit v1.2.3