author    lanchunhui <zch921005@126.com>  2023-04-12 22:56:13 +0800
committer lanchunhui <zch921005@126.com>  2023-04-12 22:56:13 +0800
commit    d3d0753f748bf567f5fc47cd432056dc57eb19c9 (patch)
tree      7889f01129e1884250434fe432604d551b0ece4a
parent    b3912af9173faf8b213e777866aeb546de391826 (diff)
whisper
-rw-r--r--  llm/tutorials/07_whisper.ipynb      451
-rw-r--r--  llm/tutorials/data/video/video.mp3  bin  0 -> 23392778 bytes
2 files changed, 451 insertions, 0 deletions
diff --git a/llm/tutorials/07_whisper.ipynb b/llm/tutorials/07_whisper.ipynb
new file mode 100644
index 0000000..7be994f
--- /dev/null
+++ b/llm/tutorials/07_whisper.ipynb
@@ -0,0 +1,451 @@

## summary

- a multimodal model
    - audio => text
    - model:
        - AudioEncoder
        - TextDecoder
    - a classic Transformer sequence-to-sequence model (a forward-pass sketch follows this list)
- installation

    ```
    # https://github.com/openai/whisper
    pip install -U openai-whisper
    ```

- model download location
    - downloaded checkpoints are cached at `~/.cache/whisper/xx.pt`
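To make the AudioEncoder/TextDecoder split concrete, here is a minimal forward-pass sketch. It uses `model.embed_audio` and `model.logits`, which in current openai-whisper are thin wrappers around the encoder and decoder; the zero spectrogram is a dummy input purely for illustration, and this is not the decoding loop whisper actually runs:

```python
import torch
import whisper
from whisper.tokenizer import get_tokenizer

model = whisper.load_model("base", device="cpu")  # CPU keeps the sketch runnable without a GPU
tokenizer = get_tokenizer(model.is_multilingual)

with torch.no_grad():
    # AudioEncoder: a 30 s log-Mel spectrogram (80 mel bins x 3000 frames) -> audio features
    mel = torch.zeros(1, 80, 3000)  # dummy input, stands in for whisper.log_mel_spectrogram(audio)
    audio_features = model.embed_audio(mel)

    # TextDecoder: (tokens generated so far, audio features) -> logits over the vocabulary
    tokens = torch.tensor([[tokenizer.sot]])  # start-of-transcript token
    logits = model.logits(tokens, audio_features)

print(audio_features.shape)  # (1, 1500, 512) for the base model
print(logits.shape)          # (1, 1, 51865): a next-token distribution
```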
## ffmpeg: video => audio

```
sudo apt update && sudo apt install ffmpeg
ffmpeg -i sample.avi -q:a 0 -map a sample.mp3
```

- `-i`: the input file
- `-q:a 0`: variable bit rate at the highest quality setting
    - https://trac.ffmpeg.org/wiki/Encode/MP3
- `-map a`: exclude video/subtitles and grab only the audio stream (a scripted version is sketched below)
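The same conversion is easy to drive from Python when batch-processing videos. A minimal sketch using only the standard library; `extract_audio` is our own helper name, not part of ffmpeg or whisper:

```python
import subprocess

def extract_audio(video_path: str, audio_path: str) -> None:
    """Extract the audio track of a video into an MP3 via ffmpeg."""
    subprocess.run(
        ["ffmpeg", "-i", video_path,
         "-q:a", "0",   # variable bit rate, highest quality
         "-map", "a",   # audio stream only
         audio_path],
        check=True,  # raise CalledProcessError if ffmpeg exits non-zero
    )

extract_audio("sample.avi", "sample.mp3")
```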
## cli

```
whisper --language Chinese --model large video.mp3 -o output_dir
```

- parameters
    - `--task {transcribe,translate}`: defaults to `transcribe` (i.e. plain ASR)
    - `--language Chinese`: the language to transcribe in
    - `--model {base, medium, large}`
    - `--device device`: defaults to cuda, i.e. runs on a single GPU
    - `-o output_dir`: the directory the generated text files are written to (a Python equivalent is sketched after this list)
        - `xx.json`
        - `xx.srt`
        - `xx.tsv`
        - `xx.txt`
        - `xx.vtt`
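The CLI call maps onto the Python API roughly as follows. This is a simplified sketch: it writes only the plain-text output, whereas the real CLI emits all five formats; `output_dir` is a placeholder mirroring the `-o` flag:

```python
import os
import whisper

model = whisper.load_model("large")
# language="zh" mirrors --language Chinese; task="transcribe" is the default anyway
result = model.transcribe("video.mp3", language="zh", task="transcribe")

output_dir = "output_dir"  # placeholder for the -o flag
os.makedirs(output_dir, exist_ok=True)
# write the txt output only; the CLI additionally produces json/srt/tsv/vtt
with open(os.path.join(output_dir, "video.txt"), "w", encoding="utf-8") as f:
    f.write(result["text"])
```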
## python script

```python
import whisper
import numpy as np
```

```python
def get_params(model):
    # count a model's trainable parameters
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params
```

```python
model = whisper.load_model("base")
print("base\t", get_params(model))
model = whisper.load_model("medium")
print("medium\t", get_params(model))
model = whisper.load_model("large")
print("large\t", get_params(model))
model
```

Output:

```
base	 71825920
medium	 762321920
large	 1541384960
```

```
Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TextDecoder(
    (token_embedding): Embedding(51865, 1280)
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (cross_attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (cross_attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
)
```

```python
# load the audio as a 16 kHz mono float32 waveform
audio = whisper.load_audio("./data/video/video.mp3")
print(audio.shape)
print(audio, type(audio))
```

Output:

```
(21241974,)
[ 0.00305176  0.00213623  0.00241089 ... -0.00765991 -0.00476074
 -0.00314331] <class 'numpy.ndarray'>
```

```python
# 21241974 samples at 16 kHz: roughly 22 minutes of audio
21241974 / 16000 / 60
```

Output:

```
22.12705625
```

```python
# pad/trim the waveform to exactly 30 seconds (16000 * 30 = 480000 samples)
audio = whisper.pad_or_trim(audio)
print(audio.shape)
```

Output:

```
(480000,)
```

```python
model.device
```

Output:

```
device(type='cuda', index=0)
```

```python
# make a log-Mel spectrogram and move it to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
# print(probs)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio (this 30-second window only)
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)
```

Output:

```
Detected language: zh
好,B站的朋友们大家晚上好今天给大家开启一个插入一个新的系列就是面向小白的深度学习软硬件的装机指南然后这个系列呢我在上一期里面带着大家去实际开箱了一下我周末配置好的一个深度学习服务器然后大致的一个硬件一个配置的情况给大家展示了一把那这一节呢
```

Finally, the whole pipeline end to end, transcribing and then translating the same window:

```python
import whisper

model = whisper.load_model("large")
audio = whisper.load_audio("./data/video/video.mp3")
# pad/trim the waveform to fit 30 seconds
audio = whisper.pad_or_trim(audio)
# make a log-Mel spectrogram and move it to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
# print(probs)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio; the default task is transcribe (ASR)
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)
print(result.text)

# task='translate' produces an English translation of the speech instead
options = whisper.DecodingOptions(task='translate')
result = whisper.decode(model, mel, options)
print(result.text)
```

Output:

```
Detected language: zh
好,B站的朋友们大家晚上好今天给大家开启一个插入一个新的系列就是面向小白的深度学习软硬件的装机指南然后这个系列呢我在上一期里面带着大家去实际开箱了一下我周末配置好的一个深度学习服务器然后大致的一个硬件一个配置的情况给大家展示了一把那这一节呢
```
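Note that `whisper.decode` above only sees the single 30-second window produced by `pad_or_trim`, so the printed text covers just the opening of the ~22-minute file. The high-level `transcribe` API slides over the whole file in 30-second chunks and returns timestamped segments; a minimal sketch (the segment printing is illustrative formatting, not part of the API):

```python
import whisper

model = whisper.load_model("large")
# transcribe() internally iterates over the full file in 30 s windows
result = model.transcribe("./data/video/video.mp3", language="zh")

# each segment carries start/end timestamps (in seconds) and its text
for seg in result["segments"]:
    print(f"[{seg['start']:8.2f} -> {seg['end']:8.2f}] {seg['text']}")
```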
diff --git a/llm/tutorials/data/video/video.mp3 b/llm/tutorials/data/video/video.mp3
new file mode 100644
index 0000000..312d929
Binary files /dev/null and b/llm/tutorials/data/video/video.mp3 differ