summaryrefslogtreecommitdiff
path: root/scripts-2023/01-clean.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'scripts-2023/01-clean.ipynb')
-rw-r--r--scripts-2023/01-clean.ipynb225
1 files changed, 225 insertions, 0 deletions
diff --git a/scripts-2023/01-clean.ipynb b/scripts-2023/01-clean.ipynb
new file mode 100644
index 0000000..5c6ae37
--- /dev/null
+++ b/scripts-2023/01-clean.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "76f1b610",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5ef6781e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the CSV file into a DataFrame\n",
+ "df = pd.read_csv('../data-2023/in/23-138.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de6cb72a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename the column names\n",
+ "df = df.rename(columns={\n",
+ " 'University of Illinois System - Investment Office': 'Name',\n",
+ " 'Unnamed: 1': 'Coupon',\n",
+ " 'Unnamed: 2': 'Maturity Date',\n",
+ " 'Unnamed: 3': 'Quantity',\n",
+ " 'Unnamed: 4': 'Cost Val',\n",
+ " 'Unnamed: 5': 'Market Val'\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d909ab8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n",
+ "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n",
+ "df.reset_index(drop=True, inplace=True)\n",
+ "\n",
+ "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n",
+ "# because Agency Funds are unimportant to us.\n",
+ "print(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "383efe00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Asset Types are header rows in Account or Security column \n",
+ "def is_asset_type(name):\n",
+ " investment_types = [\n",
+ " 'Asset-Backed Securities',\n",
+ " 'Commercial Mortgage Backed Securities',\n",
+ " 'Corporate Bonds',\n",
+ " 'Municipal Bonds',\n",
+ " 'U.S. Agency Bonds',\n",
+ " 'International Government Bonds',\n",
+ " 'U.S. Treasury Securities',\n",
+ " 'Cash & Cash Equivalents',\n",
+ " ]\n",
+ " return name in investment_types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dd54c34e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n",
+ "insert_position = df.columns.get_loc('Name') + 1\n",
+ "\n",
+ "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n",
+ "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n",
+ "\n",
+ "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n",
+ "df['Asset Type'] = df['Asset Type'].ffill()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b1b000b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n",
+ "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f774d484",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Alright! Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n",
+ "\n",
+ "# Drop any rows with empty or null values\n",
+ "df = df.dropna()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "344a39dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5766ff23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n",
+ "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "06801e7f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Next, let's clean unruly typed data. Investments should obey the schema :\n",
+ "# Account or Security (string)\n",
+ "# Coupon (Date format mm/dd/yyyy)\n",
+ "# Maturity Date (double) \n",
+ "# Quantity (double) \n",
+ "# Cost Val (double) \n",
+ "# Market Val (double)\n",
+ "\n",
+ "cnt_old_entries = len(df)\n",
+ "\n",
+ "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n",
+ "print(df.iloc[135])\n",
+ "\n",
+ "# Convert 'Maturity Date' to datetime format\n",
+ "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n",
+ "\n",
+ "\n",
+ "# Clean commas from numeric column values using regular expressions\n",
+ "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n",
+ "\n",
+ "for col in numeric_columns:\n",
+ " # clean commas to register values as numbers \n",
+ " df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n",
+ "\n",
+ "# Now, the unruly typed data becomes NaN or NaT\n",
+ "print(df.iloc[135])\n",
+ "\n",
+ "# Drop all unruly typed data \n",
+ "df = df.dropna()\n",
+ "\n",
+ "cnt_new_entries = len(df)\n",
+ "\n",
+ "print(f\"\\nSuccessfully cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79a654e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n",
+ "df.to_csv('../data-2023/out/df_data_typed.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bef9be05",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}