Skip to content

Commit

Permalink
fix[lang]: fix encoding of string literals (#3091)
Browse files Browse the repository at this point in the history
this commit fixes bad runtime encoding of unicode strings. `parse_Str`
used the utility function `string_to_bytes`, which rejects characters
with value larger than 255 and otherwise emits one byte per character
(i.e. the latin-1 encoding of the string). the issue is that characters
in the range 128-255 are encoded differently in utf-8 (as two-byte
sequences) than in latin-1, resulting in different values at runtime
than at compile-time.

this can be seen from differing compile-vs-runtime behavior of
`keccak256` (this example was provided in GH issue 3088):

```vyper
@external
@view
def compile_hash() -> bytes32:
    return keccak256("è")

@external
@view
def runtime_hash() -> bytes32:
    s: String[1] = "è"
    return keccak256(s)
```

this commit fixes and simplifies `parse_Str` by using python's
`str.encode()` builtin, which encodes using utf-8 by default. it also
increases strictness of string validation to reject characters in the
range 128-255, since in utf-8 these are encoded as multibyte sequences,
which we reject in vyper (see more discussion in GH issue 2338).
  • Loading branch information
charles-cooper authored Jan 12, 2025
1 parent 9b5523e commit 43259f8
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 49 deletions.
45 changes: 28 additions & 17 deletions tests/functional/syntax/test_string.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

from vyper import compiler
from vyper.exceptions import StructureException
from vyper.exceptions import InvalidLiteral, StructureException

valid_list = [
"""
Expand All @@ -11,25 +11,13 @@ def foo() -> String[10]:
""",
"""
@external
def foo():
x: String[11] = "¡très bien!"
""",
"""
@external
def foo() -> bool:
x: String[15] = "¡très bien!"
x: String[15] = "tres bien!"
y: String[15] = "test"
return x != y
""",
"""
@external
def foo() -> bool:
x: String[15] = "¡très bien!"
y: String[12] = "test"
return x != y
""",
"""
@external
def test() -> String[100]:
return "hello world!"
""",
Expand All @@ -46,13 +34,36 @@ def test_string_success(good_code):
"""
@external
def foo():
# invalid type annotation - should be String[N]
a: String = "abc"
""",
StructureException,
)
),
(
"""
@external
@view
def compile_hash() -> bytes32:
# GH issue #3088 - ord("è") == 232
return keccak256("è")
""",
InvalidLiteral,
),
(
"""
@external
def foo() -> bool:
# ord("¡") == 161
x: String[15] = "¡très bien!"
y: String[12] = "test"
return x != y
""",
InvalidLiteral,
),
]


@pytest.mark.parametrize("bad_code,exc", invalid_list)
def test_string_fail(assert_compile_failed, get_contract, bad_code, exc):
assert_compile_failed(lambda: get_contract(bad_code), exc)
def test_string_fail(get_contract, bad_code, exc):
with pytest.raises(exc):
compiler.compile_code(bad_code)
5 changes: 4 additions & 1 deletion vyper/ast/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,10 @@ class Str(Constant):

def validate(self):
for c in self.value:
if ord(c) >= 256:
# in utf-8, bytes in the 128 and up range deviate from latin1 and
# can be control bytes, allowing multi-byte characters.
# reject them here.
if ord(c) >= 128:
raise InvalidLiteral(f"'{c}' is not an allowed string literal character", self)


Expand Down
29 changes: 10 additions & 19 deletions vyper/codegen/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,7 @@
from vyper.semantics.types.bytestrings import _BytestringT
from vyper.semantics.types.function import ContractFunctionT, MemberFunctionT
from vyper.semantics.types.shortcuts import BYTES32_T, UINT256_T
from vyper.utils import (
DECIMAL_DIVISOR,
bytes_to_int,
is_checksum_encoded,
string_to_bytes,
vyper_warn,
)
from vyper.utils import DECIMAL_DIVISOR, bytes_to_int, is_checksum_encoded, vyper_warn

ENVIRONMENT_VARIABLES = {"block", "msg", "tx", "chain"}

Expand Down Expand Up @@ -135,24 +129,21 @@ def parse_Hex(self):

# String literals
def parse_Str(self):
bytez, bytez_length = string_to_bytes(self.expr.value)
typ = StringT(bytez_length)
return self._make_bytelike(typ, bytez, bytez_length)
bytez = self.expr.value.encode("utf-8")
return self._make_bytelike(StringT, bytez)

# Byte literals
def parse_Bytes(self):
return self._parse_bytes()
return self._make_bytelike(BytesT, self.expr.value)

def parse_HexBytes(self):
return self._parse_bytes()

def _parse_bytes(self):
bytez = self.expr.value
bytez_length = len(self.expr.value)
typ = BytesT(bytez_length)
return self._make_bytelike(typ, bytez, bytez_length)
# HexBytes already has value as bytes
assert isinstance(self.expr.value, bytes)
return self._make_bytelike(BytesT, self.expr.value)

def _make_bytelike(self, btype, bytez, bytez_length):
def _make_bytelike(self, typeclass, bytez):
bytez_length = len(bytez)
btype = typeclass(bytez_length)
placeholder = self.context.new_internal_variable(btype)
seq = []
seq.append(["mstore", placeholder, bytez_length])
Expand Down
13 changes: 1 addition & 12 deletions vyper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import warnings
from typing import Generic, List, TypeVar, Union

from vyper.exceptions import CompilerPanic, DecimalOverrideException, InvalidLiteral, VyperException
from vyper.exceptions import CompilerPanic, DecimalOverrideException, VyperException

_T = TypeVar("_T")

Expand Down Expand Up @@ -310,17 +310,6 @@ def round_towards_zero(d: decimal.Decimal) -> int:
return int(d.to_integral_exact(decimal.ROUND_DOWN))


# Converts string to bytes
def string_to_bytes(str):
bytez = b""
for c in str:
if ord(c) >= 256:
raise InvalidLiteral(f"Cannot insert special character {c} into byte array")
bytez += bytes([ord(c)])
bytez_length = len(bytez)
return bytez, bytez_length


# Converts a provided hex string to an integer
def hex_to_int(inp):
if inp[:2] == "0x":
Expand Down

0 comments on commit 43259f8

Please sign in to comment.