{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "76f1b610", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "5ef6781e", "metadata": {}, "outputs": [], "source": [ "# Read the CSV file into a DataFrame\n", "df = pd.read_csv('../data-2023/in/23-138.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "de6cb72a", "metadata": {}, "outputs": [], "source": [ "# Rename the column names\n", "df = df.rename(columns={\n", " 'University of Illinois System - Investment Office': 'Name',\n", " 'Unnamed: 1': 'Coupon',\n", " 'Unnamed: 2': 'Maturity Date',\n", " 'Unnamed: 3': 'Quantity',\n", " 'Unnamed: 4': 'Cost Val',\n", " 'Unnamed: 5': 'Market Val'\n", "})" ] }, { "cell_type": "code", "execution_count": null, "id": "6d909ab8", "metadata": {}, "outputs": [], "source": [ "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n", "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n", "df.reset_index(drop=True, inplace=True)\n", "\n", "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n", "# because Agency Funds are unimportant to us.\n", "print(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "383efe00", "metadata": {}, "outputs": [], "source": [ "# Asset Types are header rows in Account or Security column \n", "def is_asset_type(name):\n", " investment_types = [\n", " 'Asset-Backed Securities',\n", " 'Commercial Mortgage Backed Securities',\n", " 'Corporate Bonds',\n", " 'Municipal Bonds',\n", " 'U.S. Agency Bonds',\n", " 'International Government Bonds',\n", " 'U.S. Treasury Securities',\n", " 'Cash & Cash Equivalents',\n", " ]\n", " return name in investment_types" ] }, { "cell_type": "code", "execution_count": null, "id": "dd54c34e", "metadata": {}, "outputs": [], "source": [ "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n", "insert_position = df.columns.get_loc('Name') + 1\n", "\n", "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n", "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n", "\n", "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n", "df['Asset Type'] = df['Asset Type'].ffill()" ] }, { "cell_type": "code", "execution_count": null, "id": "6b1b000b", "metadata": {}, "outputs": [], "source": [ "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n", "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "f774d484", "metadata": {}, "outputs": [], "source": [ "# Alright! Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n", "\n", "# Drop any rows with empty or null values\n", "df = df.dropna()" ] }, { "cell_type": "code", "execution_count": null, "id": "344a39dd", "metadata": {}, "outputs": [], "source": [ "print(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "5766ff23", "metadata": {}, "outputs": [], "source": [ "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n", "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "06801e7f", "metadata": {}, "outputs": [], "source": [ "# Next, let's clean unruly typed data. Investments should obey the schema :\n", "# Account or Security (string)\n", "# Coupon (Date format mm/dd/yyyy)\n", "# Maturity Date (double) \n", "# Quantity (double) \n", "# Cost Val (double) \n", "# Market Val (double)\n", "\n", "cnt_old_entries = len(df)\n", "\n", "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n", "print(df.iloc[135])\n", "\n", "# Convert 'Maturity Date' to datetime format\n", "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n", "\n", "\n", "# Clean commas from numeric column values using regular expressions\n", "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n", "\n", "for col in numeric_columns:\n", " # clean commas to register values as numbers \n", " df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n", "\n", "# Now, the unruly typed data becomes NaN or NaT\n", "print(df.iloc[135])\n", "\n", "# Drop all unruly typed data \n", "df = df.dropna()\n", "\n", "cnt_new_entries = len(df)\n", "\n", "print(f\"\\nSuccessfully cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "79a654e0", "metadata": {}, "outputs": [], "source": [ "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n", "df.to_csv('../data-2023/out/df_data_typed.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "bef9be05", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }