
Commit

Merge remote-tracking branch 'origin/main' into fix-dsv2
lzhangzz committed Jan 14, 2025
2 parents 6f603bc + 8b7812b commit fc62c92
Showing 6 changed files with 27 additions and 24 deletions.
5 changes: 4 additions & 1 deletion lmdeploy/model.py
@@ -443,11 +443,12 @@ def match(cls, model_path: str) -> Optional[str]:
             model_path (str): the model path used for matching.
         """
         path = model_path.lower()
-        if all([c not in path for c in ['internlm2', '8k']]) and \
+        if all([c not in path for c in ['internlm3', 'internlm2', '8k']]) and \
                 all([c in path for c in ['internlm', 'chat']]):
             return 'internlm'


+@MODELS.register_module(name='internlm3')
 @MODELS.register_module(name='internlm2')
 class InternLM2Chat7B(InternLMChat7B):
     """Chat template and generation parameters of InternLM2-Chat-7B."""
@@ -490,6 +491,8 @@ def match(cls, model_path: str) -> Optional[str]:
         path = model_path.lower()
         if 'internlm2' in path and ('chat' in path or 'math' in path):
             return 'internlm2'
+        if 'internlm3' in path and ('instruct' in path):
+            return 'internlm3'

     def messages2prompt(self,
                         messages,
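
For reference, a minimal sketch of how the updated chat-template matching is meant to behave. The helper below simply mirrors the branches shown in the two hunks above; it is an illustration, not lmdeploy's actual API, and the example paths are only indicative.

from typing import Optional


def match_chat_template(model_path: str) -> Optional[str]:
    """Illustrative re-implementation of the `match` logic in the diff."""
    path = model_path.lower()
    if 'internlm3' in path and 'instruct' in path:
        return 'internlm3'
    if 'internlm2' in path and ('chat' in path or 'math' in path):
        return 'internlm2'
    if all(c not in path for c in ['internlm3', 'internlm2', '8k']) and \
            all(c in path for c in ['internlm', 'chat']):
        return 'internlm'
    return None


# e.g. '.../internlm3-8b-instruct' -> 'internlm3'
#      '.../internlm2-chat-7b'     -> 'internlm2'
#      '.../internlm-chat-7b'      -> 'internlm'
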
1 change: 0 additions & 1 deletion lmdeploy/serve/async_engine.py
@@ -574,7 +574,6 @@ def stream_infer(
                            **kwargs)

     async def _get_prompt_input(self,
-                                session_id: int,
                                 prompt: str,
                                 do_preprocess: bool,
                                 sequence_start: bool,
39 changes: 20 additions & 19 deletions lmdeploy/serve/utils.py
@@ -31,25 +31,25 @@ async def _async_get_logits(
         logits = [None] * len(input_ids)

         async def _proc(i):
-            async for out in self.generate(
-                    messages=None,
-                    input_ids=input_ids[i],
-                    step=0 if steps is None else steps[i],
-                    session_id=i,
-                    # `max_new_tokens=0` means we don't need engine to
-                    # generate tokens and `output_logits=all` requests engine
-                    # to output logits of all input tokens
-                    gen_config=GenerationConfig(max_new_tokens=0,
-                                                output_logits='all'),
-                    stream_response=False,
-                    sequence_start=sequence_start,
-                    sequence_end=sequence_end):
-                # In the last iteration, the yielded `out` is an empty response
-                # indicating the finish_reason, which should be ignored here
-                if out.finish_reason is None:
-                    # Try not to return in async for loop. Otherwise, there
-                    # will be `GeneratorExit` exception
-                    logits[i] = out.logits
+            async with self.model_inst(session_id=i) as inst:
+                input_len = len(input_ids[i])
+                # TODO(lvhan): Fix the ugly code later on
+                max_new_tokens = 1 if self.backend == 'turbomind' else 0
+                gen_config = GenerationConfig(max_new_tokens=max_new_tokens,
+                                              output_logits='all')
+                async with self.safe_run(inst,
+                                         session_id=i,
+                                         input_ids=input_ids[i],
+                                         gen_config=gen_config,
+                                         stream_output=False,
+                                         sequence_start=sequence_start,
+                                         sequence_end=sequence_end,
+                                         step=steps[i] if steps else 0) as gen:
+                    async for outputs in gen:
+                        pass
+                logits[i] = outputs.logits[:input_len, :]
+                if sequence_end and self.backend == 'pytorch':
+                    await inst.async_end(session_id=i)

         tasks = [_proc(i) for i in range(len(input_ids))]
         await asyncio.gather(*tasks)
@@ -211,4 +211,5 @@ def _get_ppl(self,
             loss = flat_loss_matrix.sum()
             target_count = target_mask.sum()
             result.append(loss.item() / target_count.item())
+        logger.info(f'ppl result: {result}')
         return result
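
The trailing hunk only adds a log statement, but for context: the value appended per prompt in `_get_ppl` is the token-averaged cross-entropy over that prompt, computed from the per-token logits gathered by `_async_get_logits` above. A standalone sketch under that assumption (not lmdeploy's actual implementation; logits are assumed to have shape [seq_len, vocab_size]):

import torch
import torch.nn.functional as F


def avg_token_loss(logits: torch.Tensor, token_ids: torch.Tensor) -> float:
    """Token-averaged cross-entropy of a prompt under its own logits.

    `token_ids` is a 1-D LongTensor of the prompt's token ids.
    """
    targets = token_ids[1:]               # each position predicts the next token
    preds = logits[:-1].float()           # last position has no target, drop it
    loss = F.cross_entropy(preds, targets, reduction='sum')
    return loss.item() / targets.numel()  # mirrors `loss / target_count` above


# Perplexity, if desired, is simply exp() of this value.
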
2 changes: 1 addition & 1 deletion lmdeploy/turbomind/deploy/source_model/llama.py
@@ -191,7 +191,7 @@ def model_info(self):

         return dict(
             size_per_head=head_dim,
-            rotary_embedding=hidden_units // attn_head_num,
+            rotary_embedding=head_dim,
             num_layer=num_layer,
             norm_eps=norm_eps,
             head_num=attn_head_num,
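
The one-line llama.py change matters for checkpoints whose config declares `head_dim` explicitly rather than implying it as hidden_size / num_attention_heads; when the two disagree, the rotary dimension must follow the declared `head_dim`. A hypothetical illustration (the numbers below are made up for the example, not taken from any particular model):

# Hypothetical config values, for illustration only.
hidden_units = 2048
attn_head_num = 32
head_dim = 128                           # declared explicitly in the config

derived = hidden_units // attn_head_num  # 64: the old rotary_embedding value
assert derived != head_dim               # the two disagree for such models
rotary_embedding = head_dim              # 128: the value used after this fix
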
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -13,6 +13,8 @@
     InternLMForCausalLM='llama',
     # internlm2
     InternLM2ForCausalLM='internlm2',
+    # internlm3
+    InternLM3ForCausalLM='llama',
     # llama, llama2, alpaca, vicuna, codellama, ultracm, yi,
     # deepseek-coder, deepseek-llm
     LlamaForCausalLM='llama',
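
The supported_models.py hunk extends the architecture-to-source-model mapping; the registry key is the `architectures` entry from the checkpoint's config.json. A minimal sketch of the lookup, assuming a simplified copy of the mapping from the hunk above (illustration only, not lmdeploy's code):

import json
import os


# Simplified copy of the mapping shown above (illustration only).
SUPPORTED_ARCHS = dict(
    InternLMForCausalLM='llama',
    InternLM2ForCausalLM='internlm2',
    InternLM3ForCausalLM='llama',
    LlamaForCausalLM='llama',
)


def turbomind_source_model(model_dir: str) -> str:
    """Map a checkpoint's declared architecture to a turbomind source model."""
    with open(os.path.join(model_dir, 'config.json')) as f:
        arch = json.load(f)['architectures'][0]
    return SUPPORTED_ARCHS[arch]  # e.g. 'InternLM3ForCausalLM' -> 'llama'
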
2 changes: 0 additions & 2 deletions tests/test_lmdeploy/test_auto_backend.py
@@ -33,8 +33,6 @@ def models(self):
             ('tiiuae/falcon-7b-instruct', True, False),
             ('01-ai/Yi-34B-Chat', True, True),
             ('codellama/CodeLlama-7b-Instruct-hf', True, True),
-            ('mistralai/Mistral-7B-Instruct-v0.1', True, True),
-            ('mistralai/Mixtral-8x7B-Instruct-v0.1', True, True),
             ('Qwen/Qwen-7B-Chat', True, True),
             ('Qwen/Qwen-VL-Chat', False, True),
             ('Qwen/Qwen1.5-4B-Chat', True, True),
