feat/3103: Ensure consistency in HexBytes coercion (#3200)

* Refactor: Replace hexbytes dependency with custom HexBytes implementation

* Removed the hexbytes library and integrated a custom HexBytes class to ensure compatibility with the codebase.
* Updated imports across multiple files to use the new HexBytes class.
* Added tests for the HexBytes class to validate its functionality and ensure proper behavior with various input types.

* Update hexbytes error handling test to reject lists as input type

* Remove TypeError test for unsupported list input in HexBytes error handling

* Refactor: Improve formatting of hex method in HexBytes class for better readability

* Refactor: Clean up comments and improve readability in hex method of HexBytes class

* Refactor: Rename methods in HexBytes class for clarity and consistency

* Updated method names from `to_bytes` to `_to_bytes` and `hexstr_to_bytes` to `_hexstr_to_bytes` to indicate their private nature.
* Adjusted method calls within the class to reflect the new names, enhancing code readability and maintainability.

* Removed support for bool and int types in the HexBytes constructor to streamline input handling, and introduced a new fromhex method that creates HexBytes from hex strings for improved clarity.

* Remove hexbytes dependency from lockfile and related configurations

* Enhance hex method in HexBytes class to support custom separators and bytes per separator. This improves flexibility in hex encoding output while maintaining the existing functionality.

* Refactor hex method in HexBytes class to improve parameter handling and readability. Updated the method signature to clarify the use of custom separators and bytes per separator, ensuring consistent behavior with existing functionality.

* Update hex method in HexBytes class to remove unnecessary noqa comments, enhancing code clarity and consistency.
This commit is contained in:
Menna
2025-10-20 22:22:06 +02:00
committed by GitHub
parent fe567414dc
commit e2ef7c1ec8
11 changed files with 172 additions and 26 deletions

View File

@@ -4,7 +4,6 @@ import dataclasses
from datetime import date, datetime, time # noqa: I251
from typing import Any, Callable, List, Protocol, IO, Union, Dict
from uuid import UUID
from hexbytes import HexBytes
from enum import Enum
try:
@@ -18,7 +17,7 @@ from dlt.common.pendulum import pendulum
from dlt.common.arithmetics import Decimal
from dlt.common.wei import Wei
from dlt.common.utils import map_nested_values_in_place
from dlt.common.libs.hexbytes import HexBytes
TPuaDecoders = List[Callable[[Any], Any]]

View File

@@ -0,0 +1,88 @@
from typing import (
TYPE_CHECKING,
Union,
cast,
overload,
)
if TYPE_CHECKING:
from typing import (
SupportsIndex,
)
# Input types accepted by the HexBytes constructor.
BytesLike = Union[bytearray, bytes, str, memoryview]

# Both spellings of the hex prefix are accepted on input; output always uses the lowercase one.
HEX_PREFIX_LOWER = "0x"
HEX_PREFIX_UPPER = "0X"


class HexBytes(bytes):
    """A `bytes` subclass whose hex representation is always 0x-prefixed.

    In-house replacement for the `hexbytes` library, kept so that hex output is
    consistent across the codebase:

    1. `hex()` always prepends the "0x" prefix to the hex string.
    2. `__repr__` shows the 0x-prefixed hex string.
    """

    def __new__(cls, val: BytesLike) -> "HexBytes":
        """Create an instance from bytes-like data or from a hex string.

        Args:
            val: bytes, bytearray, memoryview, or a hex string with an optional 0x/0X prefix

        Raises:
            TypeError: if `val` is an int or bool
            ValueError: if `val` is a str that is not valid hex
        """
        bytesval = HexBytes._to_bytes(val)
        return cast(HexBytes, super().__new__(cls, bytesval))  # type: ignore  # https://github.com/python/typeshed/issues/2630  # noqa: E501

    def hex(  # noqa: A003
        self, sep: Union[str, bytes, None] = None, bytes_per_sep: "SupportsIndex" = 1
    ) -> str:
        """Output hex-encoded bytes, with an "0x" prefix.

        Everything following the "0x" is output exactly like :meth:`bytes.hex`.

        Args:
            sep: optional separator forwarded to :meth:`bytes.hex`
            bytes_per_sep: forwarded to :meth:`bytes.hex`; only used when `sep` is given

        Returns:
            The 0x-prefixed hex string ("0x" alone for empty bytes)
        """
        # `bytes.hex` must not receive `sep=None`, so the no-separator case takes no arguments
        digits = super().hex() if sep is None else super().hex(sep, bytes_per_sep)
        return HEX_PREFIX_LOWER + digits

    @overload
    def __getitem__(self, key: "SupportsIndex") -> int:  # noqa: F811
        ...

    @overload  # noqa: F811
    def __getitem__(self, key: slice) -> "HexBytes":  # noqa: F811
        ...

    def __getitem__(  # noqa: F811
        self, key: Union["SupportsIndex", slice]
    ) -> Union[int, "HexBytes"]:
        """Return an int for a single index and a HexBytes for a slice."""
        result = super().__getitem__(key)
        # Branch on `slice` rather than `int`: any other key is an index (possibly a non-int
        # object implementing `__index__`) and its int result must not be re-wrapped.
        if isinstance(key, slice):
            return self.__class__(cast(bytes, result))
        return cast(int, result)

    def __repr__(self) -> str:
        return f"HexBytes({self.hex()!r})"

    @staticmethod
    def _to_bytes(val: BytesLike) -> bytes:
        """Convert BytesLike input to bytes representation.

        Args:
            val: bytes, str (hex), bytearray, or memoryview

        Returns:
            bytes representation of the input

        Raises:
            TypeError: if `val` is an int or bool
            ValueError: if `val` is a str that is not valid hex
        """
        if isinstance(val, bytes):
            return val
        if isinstance(val, str):
            return HexBytes.fromhex(val)
        # `bytes(5)` would silently produce five NUL bytes; ints (bool included) are rejected
        if isinstance(val, int):
            raise TypeError(
                f"Cannot convert {type(val).__name__} to HexBytes:"
                " expected bytes, bytearray, memoryview or a hex string"
            )
        return bytes(val)

    @classmethod
    def fromhex(cls, hexstr: str) -> "HexBytes":
        """Create HexBytes from hex string, handling optional 0x prefix.

        Args:
            hexstr: Hex string with or without 0x/0X prefix

        Returns:
            HexBytes instance

        Raises:
            ValueError: if the remaining string is not valid hex
        """
        # Strip at most one prefix so malformed input such as "0x0X12" is not silently accepted
        if hexstr.startswith((HEX_PREFIX_LOWER, HEX_PREFIX_UPPER)):
            hexstr = hexstr[len(HEX_PREFIX_LOWER) :]
        return super(HexBytes, cls).__new__(cls, bytes.fromhex(hexstr))

View File

@@ -1,12 +1,12 @@
import datetime # noqa: 251
from typing import Any, Optional, Type
from hexbytes import HexBytes
from dlt.common.pendulum import pendulum
from dlt.common.wei import Wei
from dlt.common.data_types import TDataType
from dlt.common.time import parse_iso_like_datetime
from dlt.common.libs.hexbytes import HexBytes
_NOW_TS: float = pendulum.now().timestamp()

View File

@@ -35,7 +35,6 @@ dependencies = [
"simplejson>=3.17.5",
"PyYAML>=5.4.1",
"semver>=3.0.0",
"hexbytes>=0.2.2",
"tzdata>=2022.1",
"tomlkit>=0.11.3",
"pathvalidate>=2.5.2",

View File

@@ -2,7 +2,6 @@ import datetime # noqa: I251
import hashlib
from typing import Dict, List, Any, Sequence, Tuple, Literal, Union
import base64
from hexbytes import HexBytes
from copy import deepcopy
from string import ascii_lowercase
import random
@@ -22,6 +21,7 @@ from dlt.common.time import (
ensure_pendulum_time,
ensure_pendulum_date,
)
from dlt.common.libs.hexbytes import HexBytes
from dlt.common.schema import TColumnSchema, TTableSchemaColumns
from tests.utils import TPythonTableFormat, TestDataItemFormat, arrow_item_from_pandas

View File

@@ -0,0 +1,74 @@
import pytest
from dlt.common.libs.hexbytes import HexBytes
def test_hexbytes_from_bytes():
    """HexBytes built from raw bytes keeps the payload and is still a bytes object."""
    payload = b"binary string"
    wrapped = HexBytes(payload)
    # the instance must pass as both its own type and plain bytes
    for expected_type in (HexBytes, bytes):
        assert isinstance(wrapped, expected_type)
    assert wrapped == payload
    assert wrapped.hex() == "0x62696e61727920737472696e67"
def test_hexbytes_from_hex_string():
    """Hex strings decode to the same bytes with a 0x prefix, a 0X prefix, or no prefix."""
    hex_with_prefix = "0x62696e61727920737472696e67"
    hex_with_upper_prefix = "0X62696e61727920737472696e67"
    hex_without_prefix = "62696e61727920737472696e67"
    hex_bytes_with_prefix = HexBytes(hex_with_prefix)
    hex_bytes_with_upper_prefix = HexBytes(hex_with_upper_prefix)
    hex_bytes_without_prefix = HexBytes(hex_without_prefix)
    assert hex_bytes_with_prefix == hex_bytes_without_prefix
    # the uppercase prefix is accepted on input as well
    assert hex_bytes_with_upper_prefix == hex_bytes_without_prefix
    assert hex_bytes_with_prefix == b"binary string"
    # output always carries the lowercase prefix, regardless of how the input was spelled
    assert hex_bytes_with_prefix.hex() == "0x62696e61727920737472696e67"
    assert hex_bytes_with_upper_prefix.hex() == "0x62696e61727920737472696e67"
def test_hexbytes_indexing():
    """Single-index access yields ints; slicing yields HexBytes."""
    test_bytes = HexBytes(b"binary")
    # Test single item access
    first_byte = test_bytes[0]
    last_byte = test_bytes[-1]
    # assert the type explicitly: an int compares equal to ord() but a wrapped value would not
    assert isinstance(first_byte, int)
    assert isinstance(last_byte, int)
    assert first_byte == ord("b")
    assert last_byte == ord("y")
    # Test slicing
    middle_slice = test_bytes[1:3]
    assert isinstance(middle_slice, HexBytes)  # slices should return HexBytes
    assert middle_slice == b"in"
    prefix_slice = test_bytes[:2]
    suffix_slice = test_bytes[-2:]
    assert isinstance(prefix_slice, HexBytes)
    assert isinstance(suffix_slice, HexBytes)
    assert prefix_slice == b"bi"  # prefix slice
    assert suffix_slice == b"ry"  # suffix slice
def test_hexbytes_representation():
    """repr() shows the 0x-prefixed hex string; bytes() gives back the raw payload."""
    wrapped = HexBytes(b"test")
    expected_repr = "HexBytes('0x74657374')"
    assert repr(wrapped) == expected_repr
    # converting back to plain bytes must expose the original content
    raw = bytes(wrapped)
    assert raw == b"test"
def test_hexbytes_comparison():
    """Equality works between HexBytes instances and against plain bytes."""
    reference = HexBytes(b"test")
    twin = HexBytes(b"test")
    other = HexBytes(b"different")
    # HexBytes compared with HexBytes
    assert reference == twin
    assert reference != other
    # HexBytes compared with plain bytes
    assert reference == b"test"
    assert reference != b"different"
def test_hexbytes_hex_method():
    """hex() always carries the 0x prefix and forwards separator arguments to bytes.hex."""
    single_char = HexBytes(b"A")
    assert single_char.hex() == "0x41"  # Should always include 0x prefix
    # Test with empty bytes
    empty_bytes = HexBytes(b"")
    assert empty_bytes.hex() == "0x"
    # Test custom separator: the prefix is added once, in front of the separated digits
    three_bytes = HexBytes(b"\x01\x02\x03")
    assert three_bytes.hex(":") == "0x01:02:03"
    # Test bytes_per_sep: a positive value groups bytes counting from the right, like bytes.hex
    assert three_bytes.hex("-", 2) == "0x01-0203"

View File

@@ -3,7 +3,7 @@ from copy import copy
from typing import Any, Type
import pytest
import datetime # noqa: I251
from hexbytes import HexBytes
from dlt.common.libs.hexbytes import HexBytes
from enum import Enum
from pendulum.tz import UTC
@@ -32,10 +32,9 @@ def test_coerce_type_to_text() -> None:
# bytes to text (base64)
assert coerce_value("text", "binary", b"binary string") == "YmluYXJ5IHN0cmluZw=="
# HexBytes to text (hex with prefix)
assert coerce_value("text", "binary", HexBytes(b"binary string")) in [
"0x62696e61727920737472696e67",
"62696e61727920737472696e67",
]
assert (
coerce_value("text", "binary", HexBytes(b"binary string")) == "0x62696e61727920737472696e67"
)
# Str enum value
class StrEnum(Enum):

View File

@@ -1,5 +1,4 @@
from hexbytes import HexBytes
from dlt.common.libs.hexbytes import HexBytes
from dlt.common import pendulum, Decimal, Wei
from dlt.common.schema.utils import autodetect_sc_type
from dlt.common.schema.detections import (

View File

@@ -3,8 +3,7 @@ from pendulum import UTC
import pytest
from copy import deepcopy
from typing import Any, Iterator, List, Sequence
from hexbytes import HexBytes
from dlt.common.libs.hexbytes import HexBytes
from dlt.common import Wei, Decimal, pendulum, json
from dlt.common.configuration.container import Container
from dlt.common.destination.capabilities import DestinationCapabilitiesContext

View File

@@ -5,12 +5,12 @@ import os
import contextlib
from subprocess import CalledProcessError
from typing import List, Tuple, Optional
from hexbytes import HexBytes
import pytest
from unittest import mock
import re
from packaging.requirements import Requirement
from typing import Dict
from dlt.common.libs.hexbytes import HexBytes
# import that because O3 modules cannot be unloaded
import cryptography.hazmat.bindings._rust

11
uv.lock generated
View File

@@ -2048,7 +2048,6 @@ dependencies = [
{ name = "fsspec" },
{ name = "gitpython" },
{ name = "giturlparse" },
{ name = "hexbytes" },
{ name = "humanize" },
{ name = "jsonpath-ng" },
{ name = "orjson", version = "3.10.18", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' and sys_platform != 'emscripten'" },
@@ -2365,7 +2364,6 @@ requires-dist = [
{ name = "google-cloud-bigquery", marker = "extra == 'gcp'", specifier = ">=2.26.0" },
{ name = "grpcio", marker = "extra == 'bigquery'", specifier = ">=1.50.0" },
{ name = "grpcio", marker = "extra == 'gcp'", specifier = ">=1.50.0" },
{ name = "hexbytes", specifier = ">=0.2.2" },
{ name = "humanize", specifier = ">=4.4.0" },
{ name = "ibis-framework", marker = "python_full_version >= '3.10' and extra == 'workspace'", specifier = ">=10.5.0" },
{ name = "jsonpath-ng", specifier = ">=1.5.3" },
@@ -3749,15 +3747,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d0/9e/984486f2d0a0bd2b024bf4bc1c62688fcafa9e61991f041fb0e2def4a982/h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0", size = 60957, upload-time = "2025-02-01T11:02:26.481Z" },
]
[[package]]
name = "hexbytes"
version = "0.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c1/94/fbfd526e8964652eec6a7b74ae18d1426e225ab602553858531ec6567d05/hexbytes-0.3.1.tar.gz", hash = "sha256:a3fe35c6831ee8fafd048c4c086b986075fc14fd46258fa24ecb8d65745f9a9d", size = 6188, upload-time = "2023-06-08T20:36:59.73Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0b/9e/fdfe374c28d448a58563e7e43f569f8cf8cf600db092efac2e8ac2f86782/hexbytes-0.3.1-py3-none-any.whl", hash = "sha256:383595ad75026cf00abd570f44b368c6cdac0c6becfae5c39ff88829877f8a59", size = 5944, upload-time = "2023-06-08T20:36:58.066Z" },
]
[[package]]
name = "hpack"
version = "4.1.0"