fix[lang]: fix encoding of string literals (#3091)

this commit fixes bad runtime encoding of unicode strings. `parse_Str` used the utility function `string_to_bytes`, which rejects characters with value larger than 255 and otherwise produces the latin-1 encoding of the string (one byte per character, equal to its code point). the issue is that characters in the range 128-255 are encoded as different bytes in utf-8 (two bytes each) than in latin-1 (a single byte), resulting in different values at runtime than at compile-time. this can be seen from differing compile-vs-runtime behavior of `keccak256` (this example was provided in GH issue 3088):

```vyper
@external
@view
def compile_hash() -> bytes32:
    return keccak256("è")

@external
@view
def runtime_hash() -> bytes32:
    s: String[1] = "è"
    return keccak256(s)
```

this commit fixes and simplifies `parse_Str` by using python's `str.encode()` builtin, which encodes using utf-8 by default. it also increases strictness of string validation to reject characters with code points of 128 and above (previously only 256 and above were rejected), since in utf-8 these are encoded as multibyte sequences, which we reject in vyper (see more discussion in GH issue 2338).
vyperlang · Jan 12, 2025 · 43259f8 · 43259f8
1 parent 9b5523e
commit 43259f8
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 49 deletions.
diff --git a/tests/functional/syntax/test_string.py b/tests/functional/syntax/test_string.py
@@ -1,7 +1,7 @@
 import pytest
 
 from vyper import compiler
-from vyper.exceptions import StructureException
+from vyper.exceptions import InvalidLiteral, StructureException
 
 valid_list = [
     """
@@ -11,25 +11,13 @@ def foo() -> String[10]:
     """,
     """
 @external
-def foo():
-    x: String[11] = "¡très bien!"
-    """,
-    """
-@external
 def foo() -> bool:
-    x: String[15] = "¡très bien!"
+    x: String[15] = "tres bien!"
     y: String[15] = "test"
     return x != y
     """,
     """
 @external
-def foo() -> bool:
-    x: String[15] = "¡très bien!"
-    y: String[12] = "test"
-    return x != y
-    """,
-    """
-@external
 def test() -> String[100]:
     return "hello world!"
     """,
@@ -46,13 +34,36 @@ def test_string_success(good_code):
         """
 @external
 def foo():
+    # invalid type annotation - should be String[N]
     a: String = "abc"
     """,
         StructureException,
-    )
+    ),
+    (
+        """
+@external
+@view
+def compile_hash() -> bytes32:
+    # GH issue #3088 - ord("è") == 232
+    return keccak256("è")
+    """,
+        InvalidLiteral,
+    ),
+    (
+        """
+@external
+def foo() -> bool:
+    # ord("¡") == 161
+    x: String[15] = "¡très bien!"
+    y: String[12] = "test"
+    return x != y
+    """,
+        InvalidLiteral,
+    ),
 ]
 
 
 @pytest.mark.parametrize("bad_code,exc", invalid_list)
-def test_string_fail(assert_compile_failed, get_contract, bad_code, exc):
-    assert_compile_failed(lambda: get_contract(bad_code), exc)
+def test_string_fail(get_contract, bad_code, exc):
+    with pytest.raises(exc):
+        compiler.compile_code(bad_code)
diff --git a/vyper/ast/nodes.py b/vyper/ast/nodes.py
@@ -873,7 +873,10 @@ class Str(Constant):
 
     def validate(self):
         for c in self.value:
-            if ord(c) >= 256:
+            # in utf-8, bytes in the 128 and up range deviate from latin1 and
+            # can be control bytes, allowing multi-byte characters.
+            # reject them here.
+            if ord(c) >= 128:
                 raise InvalidLiteral(f"'{c}' is not an allowed string literal character", self)
 
 

diff --git a/vyper/codegen/expr.py b/vyper/codegen/expr.py
@@ -61,13 +61,7 @@
 from vyper.semantics.types.bytestrings import _BytestringT
 from vyper.semantics.types.function import ContractFunctionT, MemberFunctionT
 from vyper.semantics.types.shortcuts import BYTES32_T, UINT256_T
-from vyper.utils import (
-    DECIMAL_DIVISOR,
-    bytes_to_int,
-    is_checksum_encoded,
-    string_to_bytes,
-    vyper_warn,
-)
+from vyper.utils import DECIMAL_DIVISOR, bytes_to_int, is_checksum_encoded, vyper_warn
 
 ENVIRONMENT_VARIABLES = {"block", "msg", "tx", "chain"}
 
@@ -135,24 +129,21 @@ def parse_Hex(self):
 
     # String literals
     def parse_Str(self):
-        bytez, bytez_length = string_to_bytes(self.expr.value)
-        typ = StringT(bytez_length)
-        return self._make_bytelike(typ, bytez, bytez_length)
+        bytez = self.expr.value.encode("utf-8")
+        return self._make_bytelike(StringT, bytez)
 
     # Byte literals
     def parse_Bytes(self):
-        return self._parse_bytes()
+        return self._make_bytelike(BytesT, self.expr.value)
 
     def parse_HexBytes(self):
-        return self._parse_bytes()
-
-    def _parse_bytes(self):
-        bytez = self.expr.value
-        bytez_length = len(self.expr.value)
-        typ = BytesT(bytez_length)
-        return self._make_bytelike(typ, bytez, bytez_length)
+        # HexBytes already has value as bytes
+        assert isinstance(self.expr.value, bytes)
+        return self._make_bytelike(BytesT, self.expr.value)
 
-    def _make_bytelike(self, btype, bytez, bytez_length):
+    def _make_bytelike(self, typeclass, bytez):
+        bytez_length = len(bytez)
+        btype = typeclass(bytez_length)
         placeholder = self.context.new_internal_variable(btype)
         seq = []
         seq.append(["mstore", placeholder, bytez_length])

diff --git a/vyper/utils.py b/vyper/utils.py
@@ -11,7 +11,7 @@
 import warnings
 from typing import Generic, List, TypeVar, Union
 
-from vyper.exceptions import CompilerPanic, DecimalOverrideException, InvalidLiteral, VyperException
+from vyper.exceptions import CompilerPanic, DecimalOverrideException, VyperException
 
 _T = TypeVar("_T")
 
@@ -310,17 +310,6 @@ def round_towards_zero(d: decimal.Decimal) -> int:
     return int(d.to_integral_exact(decimal.ROUND_DOWN))
 
 
-# Converts string to bytes
-def string_to_bytes(str):
-    bytez = b""
-    for c in str:
-        if ord(c) >= 256:
-            raise InvalidLiteral(f"Cannot insert special character {c} into byte array")
-        bytez += bytes([ord(c)])
-    bytez_length = len(bytez)
-    return bytez, bytez_length
-
-
 # Converts a provided hex string to an integer
 def hex_to_int(inp):
     if inp[:2] == "0x":