{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "12b9c4db", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T13:56:17.829502Z", "start_time": "2023-03-13T13:56:03.465027Z" }, "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: http://mirrors.aliyun.com/pypi/simple/\n", "Collecting tiktoken\n", " Downloading http://mirrors.aliyun.com/pypi/packages/5c/76/03b8286cd264f9f5550229fe21f72abc89d431a9a3c887fc365763acc5a4/tiktoken-0.3.0-cp39-cp39-macosx_10_9_x86_64.whl (735 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m735.4/735.4 kB\u001b[0m \u001b[31m256.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests>=2.26.0 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from tiktoken) (2.28.1)\n", "Requirement already satisfied: regex>=2022.1.18 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from tiktoken) (2022.7.9)\n", "Collecting blobfile>=2\n", " Downloading http://mirrors.aliyun.com/pypi/packages/c1/35/6b92aa0d86f26f0a8ab6959dd29ac4c7e96d5c1d948d4347bba12e07695a/blobfile-2.0.1-py3-none-any.whl (73 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m214.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: urllib3<3,>=1.25.3 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from blobfile>=2->tiktoken) (1.26.11)\n", "Requirement already satisfied: filelock~=3.0 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from blobfile>=2->tiktoken) (3.6.0)\n", "Collecting pycryptodomex~=3.8\n", " Downloading http://mirrors.aliyun.com/pypi/packages/78/db/ec162a8fa1c7c8e03488616a01de59bb752b985f1c507ffb127b40b9d456/pycryptodomex-3.17-cp35-abi3-macosx_10_9_x86_64.whl (1.6 MB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m272.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: lxml~=4.9 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from blobfile>=2->tiktoken) (4.9.1)\n", "Requirement already satisfied: certifi>=2017.4.17 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2022.9.24)\n", "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /Users/chunhuizhang/opt/anaconda3/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.3)\n", "Installing collected packages: pycryptodomex, blobfile, tiktoken\n", "Successfully installed blobfile-2.0.1 pycryptodomex-3.17 tiktoken-0.3.0\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [
"# Explicit %pip magic installs into the running kernel's environment.\n",
"# Pinned to the version recorded in this cell's output so that re-runs are reproducible.\n",
"%pip install tiktoken==0.3.0" ] },
{ "cell_type": "markdown", "id": "f56bbe1c", "metadata": {}, "source": [ "## 认识数据集" ] },
{ "cell_type": "code", "execution_count": 84, "id": "76150440", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:51:31.301676Z", "start_time": "2023-03-13T14:51:31.297972Z" } }, "outputs": [], "source": [ "# imports\n", "import pandas as pd\n", "import tiktoken\n", "import openai\n", "from openai.embeddings_utils import get_embedding\n", "import numpy as np" ] },
{ "cell_type": "code", "execution_count": 2, "id": "fa311d89", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T13:57:09.514327Z", "start_time": "2023-03-13T13:57:09.510859Z" } }, "outputs": [], "source": [ "# embedding model parameters\n", "embedding_model = \"text-embedding-ada-002\"\n", "embedding_encoding = \"cl100k_base\" # this is the encoding for text-embedding-ada-002\n", "max_tokens = 8191 # the maximum for 
text-embedding-ada-002 is 8191" ] },
{ "cell_type": "code", "execution_count": 85, "id": "63c73803", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:52:36.612466Z", "start_time": "2023-03-13T14:52:36.609031Z" } }, "outputs": [], "source": [ "input_file = './data/fine_food_reviews_1k.csv'" ] },
{ "cell_type": "code", "execution_count": 88, "id": "b4a220f1", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:54:11.245064Z", "start_time": "2023-03-13T14:54:11.210401Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1000, 6)\n", "(762, 6)\n" ] } ], "source": [
"# Load the reviews, keep only the columns used below, order them by Time and drop incomplete rows.\n",
"raw = pd.read_csv(input_file, index_col=0)\n",
"reviews = (\n",
"    raw[[\"Time\", \"ProductId\", \"UserId\", \"Score\", \"Summary\", \"Text\"]]\n",
"    .sort_values('Time')\n",
"    .dropna()\n",
")\n",
"print(reviews.shape)\n",
"# Rows sharing the same Summary and Text count as duplicates; keep the last one in Time order.\n",
"deduped = reviews.drop_duplicates(subset=['Summary', 'Text'], keep='last')\n",
"print(deduped.shape)\n",
"# Title and body merged into the single text field that gets embedded later.\n",
"df = deduped.assign(\n",
"    Combined='Title: ' + deduped.Summary.str.strip() + '; Content: ' + deduped.Text.str.strip()\n",
")" ] },
{ "cell_type": "code", "execution_count": 89, "id": "a6c7bbd8", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:55:05.038778Z", "start_time": "2023-03-13T14:55:04.891622Z" } }, "outputs": [ { "data": { "text/plain": [ "100" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [
"top_n = 100  # how many rows from the end of the Time-ordered frame are kept\n",
"encoding = tiktoken.get_encoding(embedding_encoding)\n",
"# Count tokens per review so that texts longer than max_tokens can be left out.\n",
"df[\"n_tokens\"] = df.Combined.map(lambda text: len(encoding.encode(text)))\n",
"# .copy() gives an independent frame, so columns added in later cells do not write into a slice.\n",
"df = df[df.n_tokens <= max_tokens].tail(top_n).copy()\n",
"len(df)" ] },
{ "cell_type": "code", "execution_count": 90, "id": "262994d4", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:56:38.163617Z", "start_time": "2023-03-13T14:55:27.448765Z" } }, "outputs": [], "source": [
"import os\n",
"\n",
"# Read the API key from the environment; a secret must never be stored in the notebook.\n",
"# NOTE(review): the key that used to be hardcoded in this cell is exposed in history and should be revoked.\n",
"openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n",
"# get_embedding is called once per review text.\n",
"df['embedding'] = df.Combined.apply(lambda text: get_embedding(text, engine=embedding_model))" ] },
{ "cell_type": "markdown", "id": "9510f2b6", "metadata": {}, "source": [ "## embedding" ] },
{ "cell_type": "markdown", "id": "e918cedc", "metadata": {}, "source": [ "- dimension\n", "- norm" ] },
{ "cell_type": "code", "execution_count": 91, "id": "92261327", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:57:45.646492Z", "start_time": "2023-03-13T14:57:45.620730Z" } }, "outputs": [], "source": [
"# Length and norm (np.linalg.norm with default arguments) of every embedding vector.\n",
"df['embed_len'] = df.embedding.map(len)\n",
"df['embed_norm'] = df.embedding.map(np.linalg.norm)" ] },
{ "cell_type": "code", "execution_count": 92, "id": "eaba5054", "metadata": { "ExecuteTime": { "end_time": "2023-03-13T14:57:46.858110Z", "start_time": "2023-03-13T14:57:46.795263Z" } }, "outputs": [ { "data": { "text/html": [ "
| \n", " | Time | \n", "ProductId | \n", "UserId | \n", "Score | \n", "Summary | \n", "Text | \n", "Combined | \n", "n_tokens | \n", "embedding | \n", "embed_len | \n", "embed_norm | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 650 | \n", "1351209600 | \n", "B0051O6P36 | \n", "A1VC6419THHIET | \n", "5 | \n", "Good for all cats. | \n", "I just got these treats last week and they're ... | \n", "Title: Good for all cats.; Content: I just got... | \n", "81 | \n", "[-0.02040177956223488, -0.022390257567167282, ... | \n", "1536 | \n", "1.0 | \n", "
| 651 | \n", "1351209600 | \n", "B001EO5RSQ | \n", "A33W5JAFGHYRQZ | \n", "5 | \n", "Love this Cereal! | \n", "There is nothing else like this on the market.... | \n", "Title: Love this Cereal!; Content: There is no... | \n", "55 | \n", "[-0.012976857833564281, -0.008588296361267567,... | \n", "1536 | \n", "1.0 | \n", "
| 652 | \n", "1351209600 | \n", "B0045H264C | \n", "A3IYSIAKYOMKTO | \n", "5 | \n", "Wild Honey | \n", "This really is unfiltered honey made from wild... | \n", "Title: Wild Honey; Content: This really is unf... | \n", "107 | \n", "[0.002022168133407831, -0.010228604078292847, ... | \n", "1536 | \n", "1.0 | \n", "
| 679 | \n", "1351209600 | \n", "B000UBD88A | \n", "AWRFQYLG7LQKJ | \n", "2 | \n", "Not very strong | \n", "Not as strong as the regular dark coffee. Dis... | \n", "Title: Not very strong; Content: Not as strong... | \n", "45 | \n", "[-0.0016124029643833637, -0.026590621098876, 0... | \n", "1536 | \n", "1.0 | \n", "
| 654 | \n", "1351209600 | \n", "B001XWRMAU | \n", "A1KWVBDHBG50VZ | \n", "5 | \n", "Outstanding product!..... | \n", "Great flavor.....lotsa "heat"....I use... | \n", "Title: Outstanding product!.....; Content: Gre... | \n", "43 | \n", "[-0.00573874544352293, 0.007031316868960857, 0... | \n", "1536 | \n", "1.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 623 | \n", "1351209600 | \n", "B0000CFXYA | \n", "A3GS4GWPIBV0NT | \n", "1 | \n", "Strange inflammation response | \n", "Truthfully wasn't crazy about the taste of the... | \n", "Title: Strange inflammation response; Content:... | \n", "110 | \n", "[0.00011091353371739388, -0.00466986745595932,... | \n", "1536 | \n", "1.0 | \n", "
| 624 | \n", "1351209600 | \n", "B0001BH5YM | \n", "A1BZ3HMAKK0NC | \n", "5 | \n", "My favorite and only MUSTARD | \n", "You've just got to experience this mustard... ... | \n", "Title: My favorite and only MUSTARD; Content:... | \n", "80 | \n", "[-0.020869314670562744, -0.013138455338776112,... | \n", "1536 | \n", "1.0 | \n", "
| 625 | \n", "1351209600 | \n", "B0009ET7TC | \n", "A2FSDQY5AI6TNX | \n", "5 | \n", "My furbabies LOVE these! | \n", "Shake the container and they come running. Eve... | \n", "Title: My furbabies LOVE these!; Content: Shak... | \n", "47 | \n", "[-0.009749102406203747, -0.0068712360225617886... | \n", "1536 | \n", "1.0 | \n", "
| 619 | \n", "1351209600 | \n", "B007PA32L2 | \n", "A15FF2P7RPKH6G | \n", "5 | \n", "got this for the daughter | \n", "all i have heard since she got a kuerig is why... | \n", "Title: got this for the daughter; Content: all... | \n", "50 | \n", "[-0.005320307798683643, 0.0009131018887273967,... | \n", "1536 | \n", "1.0 | \n", "
| 999 | \n", "1351209600 | \n", "B001EQ5GEO | \n", "A3VYU0VO6DYV6I | \n", "5 | \n", "I love Maui Coffee! | \n", "My first experience with Maui Coffee was bring... | \n", "Title: I love Maui Coffee!; Content: My first ... | \n", "118 | \n", "[-0.006057822611182928, -0.015015840530395508,... | \n", "1536 | \n", "1.0 | \n", "
100 rows × 11 columns
\n", "