1 files changed, 225 insertions, 0 deletions
diff --git a/scripts-2023/01-clean.ipynb b/scripts-2023/01-clean.ipynb
new file mode 100644
index 0000000..5c6ae37
--- /dev/null
+++ b/scripts-2023/01-clean.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76f1b610",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ef6781e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the CSV file into a DataFrame\n",
+    "df = pd.read_csv('../data-2023/in/23-138.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de6cb72a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rename the column names\n",
+    "df = df.rename(columns={\n",
+    "    'University of Illinois System - Investment Office': 'Name',\n",
+    "    'Unnamed: 1': 'Coupon',\n",
+    "    'Unnamed: 2': 'Maturity Date',\n",
+    "    'Unnamed: 3': 'Quantity',\n",
+    "    'Unnamed: 4': 'Cost Val',\n",
+    "    'Unnamed: 5': 'Market Val'\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d909ab8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n",
+    "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n",
+    "df.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n",
+    "# because Agency Funds are unimportant to us.\n",
+    "print(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "383efe00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Asset Types are header rows in Account or Security column \n",
+    "def is_asset_type(name):\n",
+    "    investment_types = [\n",
+    "        'Asset-Backed Securities',\n",
+    "        'Commercial Mortgage Backed Securities',\n",
+    "        'Corporate Bonds',\n",
+    "        'Municipal Bonds',\n",
+    "        'U.S. Agency Bonds',\n",
+    "        'International Government Bonds',\n",
+    "        'U.S. Treasury Securities',\n",
+    "        'Cash & Cash Equivalents',\n",
+    "    ]\n",
+    "    return name in investment_types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd54c34e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n",
+    "insert_position = df.columns.get_loc('Name') + 1\n",
+    "\n",
+    "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n",
+    "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n",
+    "\n",
+    "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n",
+    "df['Asset Type'] = df['Asset Type'].ffill()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b1b000b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n",
+    "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f774d484",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Alright! Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n",
+    "\n",
+    "# Drop any rows with empty or null values\n",
+    "df = df.dropna()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "344a39dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5766ff23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n",
+    "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06801e7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Next, let's clean unruly typed data. Investments should obey the schema :\n",
+    "# Account or Security (string)\n",
+    "# Coupon (Date format mm/dd/yyyy)\n",
+    "# Maturity Date (double) \n",
+    "# Quantity (double) \n",
+    "# Cost Val (double) \n",
+    "# Market Val (double)\n",
+    "\n",
+    "cnt_old_entries = len(df)\n",
+    "\n",
+    "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n",
+    "print(df.iloc[135])\n",
+    "\n",
+    "# Convert 'Maturity Date' to datetime format\n",
+    "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n",
+    "\n",
+    "\n",
+    "# Clean commas from numeric column values using regular expressions\n",
+    "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n",
+    "\n",
+    "for col in numeric_columns:\n",
+    "    # clean commas to register values as numbers \n",
+    "    df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n",
+    "\n",
+    "# Now, the unruly typed data becomes NaN or NaT\n",
+    "print(df.iloc[135])\n",
+    "\n",
+    "# Drop all unruly typed data \n",
+    "df = df.dropna()\n",
+    "\n",
+    "cnt_new_entries = len(df)\n",
+    "\n",
+    "print(f\"\\nSuccessfully cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79a654e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n",
+    "df.to_csv('../data-2023/out/df_data_typed.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bef9be05",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}