{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76f1b610",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ef6781e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CSV file into a DataFrame\n",
    "df = pd.read_csv('../data-2023/in/23-138.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de6cb72a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the column names\n",
    "df = df.rename(columns={\n",
    "    'University of Illinois System - Investment Office': 'Name',\n",
    "    'Unnamed: 1': 'Coupon',\n",
    "    'Unnamed: 2': 'Maturity Date',\n",
    "    'Unnamed: 3': 'Quantity',\n",
    "    'Unnamed: 4': 'Cost Val',\n",
    "    'Unnamed: 5': 'Market Val'\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d909ab8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reset the index and drop rows before '9-200100-Neuberger Intermediate'\n",
    "index_to_reset = df.index[df['Name'] == '9-200100-Neuberger Intermediate'][0]\n",
    "df.reset_index(drop=True, inplace=True)\n",
    "\n",
    "# Now your DataFrame should have the index reset, and rows before '9-200100-Neuberger Intermediate' dropped\n",
    "# because Agency Funds are unimportant to us.\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "383efe00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Asset Types are header rows in Account or Security column \n",
    "def is_asset_type(name):\n",
    "    investment_types = [\n",
    "        'Asset-Backed Securities',\n",
    "        'Commercial Mortgage Backed Securities',\n",
    "        'Corporate Bonds',\n",
    "        'Municipal Bonds',\n",
    "        'U.S. Agency Bonds',\n",
    "        'International Government Bonds',\n",
    "        'U.S. Treasury Securities',\n",
    "        'Cash & Cash Equivalents',\n",
    "    ]\n",
    "    return name in investment_types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd54c34e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Determine the position where you want to insert the 'Asset_Type' column (e.g., after 'Investment_Type')\n",
    "insert_position = df.columns.get_loc('Name') + 1\n",
    "\n",
    "# Create a new column 'Asset Type' and fill it with the corresponding row headers\n",
    "df['Asset Type'] = df['Name'].where(df['Name'].apply(is_asset_type))\n",
    "\n",
    "# Forward fill the 'Asset Type' column to propagate the labels down to the rows\n",
    "df['Asset Type'] = df['Asset Type'].ffill()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b1b000b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: let's export what we currently have a to .csv and check that Asset Type is right \n",
    "df.to_csv('../data-2023/out/asset_type_sanity_check.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f774d484",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alright! Let's drop all the rows with null values (this includes header rows for investment data + other fund data)\n",
    "\n",
    "# Drop any rows with empty or null values\n",
    "df = df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "344a39dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5766ff23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: let's export what we currently have a to .csv and check that we only have investment data \n",
    "df.to_csv('../data-2023/out/investment_data_sanity_check.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06801e7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Next, let's clean unruly typed data. Investments should obey the schema :\n",
    "# Account or Security (string)\n",
    "# Coupon (Date format mm/dd/yyyy)\n",
    "# Maturity Date (double) \n",
    "# Quantity (double) \n",
    "# Cost Val (double) \n",
    "# Market Val (double)\n",
    "\n",
    "cnt_old_entries = len(df)\n",
    "\n",
    "# for example, this bond has switched its Maturity Date and Date Coupon (this is actually one of a kind outlier)\n",
    "print(df.iloc[135])\n",
    "\n",
    "# Convert 'Maturity Date' to datetime format\n",
    "df['Maturity Date'] = pd.to_datetime(df['Maturity Date'], errors='coerce')\n",
    "\n",
    "\n",
    "# Clean commas from numeric column values using regular expressions\n",
    "numeric_columns = ['Coupon', 'Quantity', 'Cost Val', 'Market Val']\n",
    "\n",
    "for col in numeric_columns:\n",
    "    # clean commas to register values as numbers \n",
    "    df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', '', regex=True), errors='coerce')\n",
    "\n",
    "# Now, the unruly typed data becomes NaN or NaT\n",
    "print(df.iloc[135])\n",
    "\n",
    "# Drop all unruly typed data \n",
    "df = df.dropna()\n",
    "\n",
    "cnt_new_entries = len(df)\n",
    "\n",
    "print(f\"\\nSuccessfully cleaned {cnt_old_entries - cnt_new_entries} unruly typed values.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79a654e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: let's export what we currently have a to .csv and check that all our values are the right type. \n",
    "df.to_csv('../data-2023/out/df_data_typed.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bef9be05",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}