Add files via upload

datawhalechina · Aug 5, 2024 · f2ca736 · f2ca736
1 parent b6279eb
commit f2ca736
Showing 1 changed file with 315 additions and 0 deletions.
diff --git a/competition/科大讯飞AI开发者大赛2024/机器翻译质量评估挑战赛_baseline.ipynb b/competition/科大讯飞AI开发者大赛2024/机器翻译质量评估挑战赛_baseline.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "6d43e6f2-d352-415e-a6fa-8ff13dd3fbd7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "device = \"cuda\" # the device to load the model onto\n",
+    "\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    \"/home/lyz/hf-models/Qwen/Qwen1.5-1.8B-Chat/\",\n",
+    "    torch_dtype=\"auto\",\n",
+    "    device_map=\"auto\"\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"/home/lyz/hf-models/Qwen/Qwen1.5-1.8B-Chat/\")\n",
+    "\n",
+    "prompt = \"Give me a short introduction to large language model.\"\n",
+    "messages = [\n",
+    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "    {\"role\": \"user\", \"content\": prompt}\n",
+    "]\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True\n",
+    ")\n",
+    "model_inputs = tokenizer([text], return_tensors=\"pt\").to(device)\n",
+    "\n",
+    "generated_ids = model.generate(\n",
+    "    model_inputs.input_ids,\n",
+    "    max_new_tokens=512\n",
+    ")\n",
+    "generated_ids = [\n",
+    "    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
+    "]\n",
+    "\n",
+    "response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "8ae01e82-9cd4-4851-860d-307b5342b4df",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"\\n\\nLarge Language Models (LLMs) are artificial intelligence systems that have been developed to perform complex natural language processing tasks with unprecedented accuracy and scalability. They are designed to understand, generate, and generate human-like text based on input data, which can include structured or unstructured text such as sentences, paragraphs, or even entire documents.\\n\\nThe idea behind LLMs is to create a neural network architecture that allows them to learn from vast amounts of text data and adapt their responses to changing language patterns and contexts. These models are typically trained using supervised learning algorithms, which involve feeding the LLM large datasets of labeled text examples, allowing it to identify patterns and relationships between words and phrases in the training data.\\n\\nOnce trained, an LLM can be fine-tuned on specific domains or applications, such as question-answering, language translation, sentiment analysis, or even language generation for creative writing or poetry. The resulting LLM can produce coherent and grammatically correct responses to questions, written passages, or even spoken utterances, often surpassing human-level performance in certain tasks.\\n\\nOne of the key advantages of LLMs is their ability to handle a wide range of languages and dialects, thanks to their ability to encode and process multiple languages simultaneously. This makes them particularly useful in multilingual environments, where the same task may require understanding multiple languages at once. Additionally, some modern LLMs can generate code snippets, technical reports, or research papers, demonstrating their ability to work with complex computational processes and mathematical expressions.\\n\\nDespite their impressive capabilities, LLMs still face several challenges in practice. One major challenge is the sheer volume and diversity of available text data, making it difficult to train these models with sufficient quality and quantity. Another challenge is the lack of interpretability, especially when dealing with tasks such as language generation or understanding, where the model's output can be subjective or context-dependent.\\n\\nTo address these challenges, researchers and developers are exploring various techniques, such as attention mechanisms, self-attention layers, and transformer architectures, to enhance the model's capacity for reasoning and understanding of language. Additionally, there is ongoing research into transfer learning, where pre-trained LLMs can be fine-tuned on smaller datasets or specialized domains to improve their performance.\\n\\nOverall, Large Language Models represent a significant advancement in the field of AI, offering new possibilities for natural language processing and communication across diverse languages and platforms. As these models continue to evolve and mature, we can expect to see increasingly sophisticated and sophisticated applications in fields ranging from business to education, healthcare, and entertainment\""
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "94287f31-a1f4-4077-a328-f3225ae43ba3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import string\n",
+    "import numpy as np\n",
+    "\n",
+    "data = pd.read_csv('dataset/test.txt', sep='\\t', header=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "6210aa79-3b6f-4748-97f9-8f9a9afdb632",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>在变电所和供电系统的设计和运行中，基于如下用途必须进行短路电流的计算：</td>\n",
+       "      <td>In the design and operation of substations and...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>The Prostate Testing for Cancer and Treatment ...</td>\n",
+       "      <td>前列腺癌检测与治疗（ProtecT）研究比较了PSA检测出前列腺癌的男性患者的前列腺切除术与...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Particles with nonzero electric charge interac...</td>\n",
+       "      <td>电电荷为非零的粒子通过交换光子（电磁力的载体）相互作用。</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>中国的一个租船人,租了一条10万吨的美国船东的油轮,从上海装货去美国。</td>\n",
+       "      <td>A Chinese charterer chartered a 100000 ton tan...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>为了节省成本，运营商在5G建网初期都会选择NSA。</td>\n",
+       "      <td>In order to save costs, operators will choose ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>495</th>\n",
+       "      <td>在周五黄昏巨浪冲击海岸时，有数百人聚集在海滩庆祝节日，很多人被巨浪冲走，巨浪所到之处一切尽毁。</td>\n",
+       "      <td>Hundreds of people gathered on the beach to ce...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>496</th>\n",
+       "      <td>我不确定。但在我们解决之前不要做任何事。</td>\n",
+       "      <td>Well, I'm not sure. But don't do anything unti...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>497</th>\n",
+       "      <td>在本法规定的范围以外需要悬挂国微或者使用国徽图案的，由全国人民代表大会常务委员会办公厅或者国...</td>\n",
+       "      <td>If it is necessary to hang the national emblem...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>498</th>\n",
+       "      <td>贸易的吹鼓手们对此负有一定的责任。</td>\n",
+       "      <td>The trade drummers are responsible for this.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>499</th>\n",
+       "      <td>苹果 。</td>\n",
+       "      <td>Apples .</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>500 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                     0  \\\n",
+       "0                  在变电所和供电系统的设计和运行中，基于如下用途必须进行短路电流的计算：   \n",
+       "1    The Prostate Testing for Cancer and Treatment ...   \n",
+       "2    Particles with nonzero electric charge interac...   \n",
+       "3                  中国的一个租船人,租了一条10万吨的美国船东的油轮,从上海装货去美国。   \n",
+       "4                            为了节省成本，运营商在5G建网初期都会选择NSA。   \n",
+       "..                                                 ...   \n",
+       "495    在周五黄昏巨浪冲击海岸时，有数百人聚集在海滩庆祝节日，很多人被巨浪冲走，巨浪所到之处一切尽毁。   \n",
+       "496                               我不确定。但在我们解决之前不要做任何事。   \n",
+       "497  在本法规定的范围以外需要悬挂国微或者使用国徽图案的，由全国人民代表大会常务委员会办公厅或者国...   \n",
+       "498                                  贸易的吹鼓手们对此负有一定的责任。   \n",
+       "499                                               苹果 。   \n",
+       "\n",
+       "                                                     1  \n",
+       "0    In the design and operation of substations and...  \n",
+       "1    前列腺癌检测与治疗（ProtecT）研究比较了PSA检测出前列腺癌的男性患者的前列腺切除术与...  \n",
+       "2                         电电荷为非零的粒子通过交换光子（电磁力的载体）相互作用。  \n",
+       "3    A Chinese charterer chartered a 100000 ton tan...  \n",
+       "4    In order to save costs, operators will choose ...  \n",
+       "..                                                 ...  \n",
+       "495  Hundreds of people gathered on the beach to ce...  \n",
+       "496  Well, I'm not sure. But don't do anything unti...  \n",
+       "497  If it is necessary to hang the national emblem...  \n",
+       "498       The trade drummers are responsible for this.  \n",
+       "499                                           Apples .  \n",
+       "\n",
+       "[500 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "e595f5be-6e65-406e-a9f4-a0016ad686be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0.86921406 0.8665448 ]\n",
+      " [0.899487   0.8784326 ]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "sentences_1 = [\"样例数据-1\", \"样例数据-2\"]\n",
+    "sentences_2 = [\"样例数据-3\", \"样例数据-4\"]\n",
+    "bge_model = SentenceTransformer('/home/lyz/hf-models/bge-small-zh-v1.5/')\n",
+    "embeddings_1 = bge_model.encode(sentences_1, normalize_embeddings=True)\n",
+    "embeddings_2 = bge_model.encode(sentences_2, normalize_embeddings=True)\n",
+    "similarity = embeddings_1 @ embeddings_2.T\n",
+    "print(similarity)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "2df95120-a710-4441-975e-554579e6413a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for row in data.iterrows():\n",
+    "    if row[1][0][0].lower() in string.ascii_letters:\n",
+    "        s1 = row[1][0]\n",
+    "        s2 = row[1][1]\n",
+    "    else:\n",
+    "        s1 = row[1][1]\n",
+    "        s2 = row[1][0]\n",
+    "\n",
+    "    messages = [\n",
+    "        {\"role\": \"user\", \"content\": f\"将英文翻译为中文，不要有其他输出：{s1}\"},\n",
+    "    ]\n",
+    "    text = tokenizer.apply_chat_template(\n",
+    "        messages,\n",
+    "        tokenize=False,\n",
+    "        add_generation_prompt=True\n",
+    "    )\n",
+    "    model_inputs = tokenizer([text], return_tensors=\"pt\").to(device)\n",
+    "    \n",
+    "    generated_ids = model.generate(\n",
+    "        model_inputs.input_ids,\n",
+    "        max_new_tokens=512\n",
+    "    )\n",
+    "    generated_ids = [\n",
+    "        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
+    "    ]\n",
+    "    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
+    "\n",
+    "    embeddings_1 = bge_model.encode([response, s2], normalize_embeddings=True)\n",
+    "    score = np.dot(embeddings_1[0], embeddings_1[1])\n",
+    "    score = int(score * 100)\n",
+    "\n",
+    "    with open('a.csv', 'a') as up:\n",
+    "        up.write(f'{score}\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cc48c08-54d5-4576-9e03-212f86d4ebc7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py3.11",
+   "language": "python",
+   "name": "py3.11"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}