Diffstat (limited to 'llm')
 -rw-r--r--  llm/tutorials/07_whisper.ipynb      | 451
 -rw-r--r--  llm/tutorials/data/video/video.mp3  | bin 0 -> 23392778 bytes
 2 files changed, 451 insertions, 0 deletions
diff --git a/llm/tutorials/07_whisper.ipynb b/llm/tutorials/07_whisper.ipynb
new file mode 100644
index 0000000..7be994f
--- /dev/null
+++ b/llm/tutorials/07_whisper.ipynb
@@ -0,0 +1,451 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2a9ef54d",
+ "metadata": {},
+ "source": [
+ "## summary"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a9a3c07",
+ "metadata": {},
+ "source": [
+ "- 一种多模态\n",
+ " - audio => text\n",
+ " - model:\n",
+ " - AudioEncoder\n",
+ " - TextDecoder\n",
+ " - 典型的 A Transformer sequence-to-sequence model\n",
+ "- 安装\n",
+ " ```\n",
+ " # https://github.com/openai/whisper\n",
+ " pip install -U openai-whisper\n",
+ "\n",
+ " ```\n",
+ "- 模型下载地址\n",
+ " - `~/.cache/whisper/xx.pt`\n"
+ ]
+ },
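+ {
+ "cell_type": "markdown",
+ "id": "3fa2b1c0",
+ "metadata": {},
+ "source": [
+ "A quick post-install sanity check (a minimal sketch): `whisper.available_models()` lists the model names that `whisper.load_model()` accepts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d7e6f50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import whisper\n",
+ "\n",
+ "# names accepted by whisper.load_model(), e.g. tiny/base/small/medium/large\n",
+ "print(whisper.available_models())"
+ ]
+ },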
+ {
+ "cell_type": "markdown",
+ "id": "448be8f5",
+ "metadata": {},
+ "source": [
+ "## ffmpeg: video => audio"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d912e69",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "sudo apt update && sudo apt install ffmpeg\n",
+ "ffmpeg -i sample.avi -q:a 0 -map a sample.mp3\n",
+ "```\n",
+ "\n",
+ "- `-i`: input\n",
+ "- `-q:a 0` for variable bit rate \n",
+ " - https://trac.ffmpeg.org/wiki/Encode/MP3\n",
+ "- `-map a`: exclude video/subtitles and only grab audio"
+ ]
+ },
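+ {
+ "cell_type": "markdown",
+ "id": "b2c3d4e5",
+ "metadata": {},
+ "source": [
+ "The same extraction can also be driven from Python; a minimal sketch, assuming `ffmpeg` is on `PATH` and `sample.avi` exists:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a1b2c3d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess\n",
+ "\n",
+ "# mirror of the shell command above: keep only the audio stream (-map a)\n",
+ "# and encode it as highest-quality VBR mp3 (-q:a 0)\n",
+ "subprocess.run(\n",
+ "    [\"ffmpeg\", \"-i\", \"sample.avi\", \"-q:a\", \"0\", \"-map\", \"a\", \"sample.mp3\"],\n",
+ "    check=True,\n",
+ ")"
+ ]
+ },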
+ {
+ "cell_type": "markdown",
+ "id": "39349968",
+ "metadata": {},
+ "source": [
+ "## cli "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d23a9c96",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "whisper --language Chinese --model large video.mp3 -o output_dir\n",
+ "```\n",
+ "\n",
+ "- 参数\n",
+ " - `--task {transcribe,translate}`:默认 `transcribe`(也就是转录,asr)\n",
+ " - `--language Chinese`:按哪种语言进行转录;\n",
+ " - `--model {base, medium, large}`\n",
+ " - `--device device`:default cuda,默认运行在一张卡上;\n",
+ " - `-o output_dir`:生成的文本文件保存到的文件夹\n",
+ " - `xx.json`\n",
+ " - `xx.srt`\n",
+ " - `xx.tsv`\n",
+ " - `xx.txt`\n",
+ " - `xx.vtt`"
+ ]
+ },
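+ {
+ "cell_type": "markdown",
+ "id": "c4d5e6f7",
+ "metadata": {},
+ "source": [
+ "A rough Python equivalent of the CLI call above (a sketch; `model.transcribe()` chunks long audio into 30-second windows internally, so no manual padding/trimming is needed):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e6f7a8b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import whisper\n",
+ "\n",
+ "model = whisper.load_model(\"large\")\n",
+ "# language/task mirror --language/--task on the CLI\n",
+ "result = model.transcribe(\"video.mp3\", language=\"zh\", task=\"transcribe\")\n",
+ "print(result[\"text\"])"
+ ]
+ },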
+ {
+ "cell_type": "markdown",
+ "id": "beab9dfe",
+ "metadata": {},
+ "source": [
+ "## python script"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "16478037",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:48:00.439627Z",
+ "start_time": "2023-04-12T14:47:59.175697Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import whisper\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "64ebdc86",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:48:06.010807Z",
+ "start_time": "2023-04-12T14:48:06.006368Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def get_params(model):\n",
+ " model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n",
+ " params = sum([np.prod(p.size()) for p in model_parameters])\n",
+ " return params"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ad4922bd",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:48:57.558262Z",
+ "start_time": "2023-04-12T14:48:33.388568Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "base\t 71825920\n",
+ "medium\t 762321920\n",
+ "large\t 1541384960\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Whisper(\n",
+ " (encoder): AudioEncoder(\n",
+ " (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))\n",
+ " (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))\n",
+ " (blocks): ModuleList(\n",
+ " (0-31): 32 x ResidualAttentionBlock(\n",
+ " (attn): MultiHeadAttention(\n",
+ " (query): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (key): Linear(in_features=1280, out_features=1280, bias=False)\n",
+ " (value): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (out): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " )\n",
+ " (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " (mlp): Sequential(\n",
+ " (0): Linear(in_features=1280, out_features=5120, bias=True)\n",
+ " (1): GELU(approximate='none')\n",
+ " (2): Linear(in_features=5120, out_features=1280, bias=True)\n",
+ " )\n",
+ " (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " )\n",
+ " )\n",
+ " (ln_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " )\n",
+ " (decoder): TextDecoder(\n",
+ " (token_embedding): Embedding(51865, 1280)\n",
+ " (blocks): ModuleList(\n",
+ " (0-31): 32 x ResidualAttentionBlock(\n",
+ " (attn): MultiHeadAttention(\n",
+ " (query): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (key): Linear(in_features=1280, out_features=1280, bias=False)\n",
+ " (value): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (out): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " )\n",
+ " (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " (cross_attn): MultiHeadAttention(\n",
+ " (query): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (key): Linear(in_features=1280, out_features=1280, bias=False)\n",
+ " (value): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " (out): Linear(in_features=1280, out_features=1280, bias=True)\n",
+ " )\n",
+ " (cross_attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " (mlp): Sequential(\n",
+ " (0): Linear(in_features=1280, out_features=5120, bias=True)\n",
+ " (1): GELU(approximate='none')\n",
+ " (2): Linear(in_features=5120, out_features=1280, bias=True)\n",
+ " )\n",
+ " (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " )\n",
+ " )\n",
+ " (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
+ " )\n",
+ ")"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = whisper.load_model(\"base\")\n",
+ "print(\"base\\t\", get_params(model))\n",
+ "model = whisper.load_model(\"medium\")\n",
+ "print(\"medium\\t\", get_params(model))\n",
+ "model = whisper.load_model(\"large\")\n",
+ "print(\"large\\t\", get_params(model))\n",
+ "model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "33f6d3fb",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:50:40.837557Z",
+ "start_time": "2023-04-12T14:50:39.391533Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(21241974,)\n",
+ "[ 0.00305176 0.00213623 0.00241089 ... -0.00765991 -0.00476074\n",
+ " -0.00314331] <class 'numpy.ndarray'>\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load audio and pad/trim it to fit 30 seconds\n",
+ "audio = whisper.load_audio(\"./data/video/video.mp3\")\n",
+ "print(audio.shape)\n",
+ "print(audio, type(audio))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "617edeef",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T13:57:12.715017Z",
+ "start_time": "2023-04-12T13:57:12.705721Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "22.12705625"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "21241974/16000/60"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8f4426c0",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:51:39.541036Z",
+ "start_time": "2023-04-12T14:51:39.499299Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(480000,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load audio and pad/trim it to fit 30 seconds\n",
+ "audio = whisper.pad_or_trim(audio)\n",
+ "print(audio.shape)"
+ ]
+ },
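+ {
+ "cell_type": "markdown",
+ "id": "9f8e7d6c",
+ "metadata": {},
+ "source": [
+ "What the model actually consumes is a log-Mel spectrogram of the padded audio; a quick shape check (80 mel bins, and 480000 samples / 160-sample hop = 3000 frames):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1e2f3a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# expected shape: (80, 3000) for a 30-second window\n",
+ "print(whisper.log_mel_spectrogram(audio).shape)"
+ ]
+ },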
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "b692376f",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:52:04.084340Z",
+ "start_time": "2023-04-12T14:52:04.074789Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "device(type='cuda', index=0)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.device"
+ ]
+ },
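+ {
+ "cell_type": "markdown",
+ "id": "7b8c9d0e",
+ "metadata": {},
+ "source": [
+ "`whisper.load_model()` also takes a `device` argument if you want to pin the model rather than rely on the default; a sketch:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2f3a4b5c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# pin the model to a specific device instead of the default\n",
+ "cpu_model = whisper.load_model(\"base\", device=\"cpu\")\n",
+ "print(cpu_model.device)"
+ ]
+ },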
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "5cb9c565",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-04-12T14:52:52.836425Z",
+ "start_time": "2023-04-12T14:52:46.740159Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Detected language: zh\n",
+ "好,B站的朋友们大家晚上好今天给大家开启一个插入一个新的系列就是面向小白的深度学习软硬件的装机指南然后这个系列呢我在上一期里面带着大家去实际开箱了一下我周末配置好的一个深度学习服务器然后大致的一个硬件一个配置的情况给大家展示了一把那这一节呢\n"
+ ]
+ }
+ ],
+ "source": [
+ "# make log-Mel spectrogram and move to the same device as the model\n",
+ "mel = whisper.log_mel_spectrogram(audio).to(model.device)\n",
+ "\n",
+ "# detect the spoken language\n",
+ "_, probs = model.detect_language(mel)\n",
+ "# print(probs)\n",
+ "print(f\"Detected language: {max(probs, key=probs.get)}\")\n",
+ "\n",
+ "# decode the audio\n",
+ "options = whisper.DecodingOptions()\n",
+ "# inference\n",
+ "result = whisper.decode(model, mel, options)\n",
+ "\n",
+ "# print the recognized text\n",
+ "print(result.text)"
+ ]
+ },
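+ {
+ "cell_type": "markdown",
+ "id": "6d7e8f90",
+ "metadata": {},
+ "source": [
+ "The low-level `whisper.decode()` above only sees the padded/trimmed 30-second window. To cover the whole file with timestamps, a sketch using `model.transcribe()` on the model loaded above; each entry of `result[\"segments\"]` carries `start`, `end` and `text`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e5f6a7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# transcribe the whole file (not just the first 30 seconds)\n",
+ "result = model.transcribe(\"./data/video/video.mp3\", language=\"zh\")\n",
+ "for seg in result[\"segments\"]:\n",
+ "    print(f\"[{seg['start']:7.2f} -> {seg['end']:7.2f}] {seg['text']}\")"
+ ]
+ },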
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e754b3b",
+ "metadata": {
+ "ExecuteTime": {
+ "start_time": "2023-04-12T14:53:59.945Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Detected language: zh\n",
+ "好,B站的朋友们大家晚上好今天给大家开启一个插入一个新的系列就是面向小白的深度学习软硬件的装机指南然后这个系列呢我在上一期里面带着大家去实际开箱了一下我周末配置好的一个深度学习服务器然后大致的一个硬件一个配置的情况给大家展示了一把那这一节呢\n"
+ ]
+ }
+ ],
+ "source": [
+ "import whisper\n",
+ "model = whisper.load_model(\"large\")\n",
+ "audio = whisper.load_audio(\"./data/video/video.mp3\")\n",
+ "# load audio and pad/trim it to fit 30 seconds\n",
+ "audio = whisper.pad_or_trim(audio)\n",
+ "# make log-Mel spectrogram and move to the same device as the model\n",
+ "mel = whisper.log_mel_spectrogram(audio).to(model.device)\n",
+ "\n",
+ "# detect the spoken language\n",
+ "_, probs = model.detect_language(mel)\n",
+ "# print(probs)\n",
+ "print(f\"Detected language: {max(probs, key=probs.get)}\")\n",
+ "\n",
+ "# decode the audio\n",
+ "# default task:transcribe\n",
+ "options = whisper.DecodingOptions()\n",
+ "result = whisper.decode(model, mel, options)\n",
+ "# print the recognized text\n",
+ "print(result.text)\n",
+ "\n",
+ "options = whisper.DecodingOptions(task='translate')\n",
+ "result = whisper.decode(model, mel, options)\n",
+ "# print the recognized text\n",
+ "print(result.text)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {
+ "height": "calc(100% - 180px)",
+ "left": "10px",
+ "top": "150px",
+ "width": "295px"
+ },
+ "toc_section_display": true,
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/llm/tutorials/data/video/video.mp3 b/llm/tutorials/data/video/video.mp3
new file mode 100644
index 0000000..312d929
--- /dev/null
+++ b/llm/tutorials/data/video/video.mp3
Binary files differ