temp script to compare wheels

2025-12-17 19:31:34 +00:00 · 2025-12-04 11:24:20 -05:00
parent 4f2de0659e
commit c31417a092
1 changed files with 420 additions and 0 deletions
--- a/scripts/compare_wheels.py
+++ b/scripts/compare_wheels.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""
+Compare two wheel files to verify build artifact equivalence.
+
+This script compares wheels built with different build systems (e.g., hatch vs setuptools)
+to ensure the build artifacts are functionally equivalent, while ignoring expected
+differences like timestamps and build tool signatures.
+
+Usage:
+    python compare_wheels.py <wheel1.whl> <wheel2.whl>
+
+Example:
+    python compare_wheels.py dist/dbt_core-1.0.0-py3-none-any.whl dist_hatch/dbt_core-1.0.0-py3-none-any.whl
+"""
+
+import argparse
+import difflib
+import hashlib
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+
+
+# ANSI color codes for terminal output
+class Colors:
+    """ANSI escape sequences used to colorize and style terminal output."""
+
+    # Text attributes
+    BOLD = "\033[1m"
+    RESET = "\033[0m"  # clears every attribute/color set before it
+
+    # Bright foreground colors
+    RED = "\033[91m"
+    GREEN = "\033[92m"
+    YELLOW = "\033[93m"
+    BLUE = "\033[94m"
+    CYAN = "\033[96m"
+
+
+def print_header(text: str) -> None:
+    """Print *text* as a bold blue section title framed by two 60-char rules."""
+    rule = "=" * 60
+    style = f"{Colors.BOLD}{Colors.BLUE}"
+    # The leading newline separates this section from whatever was printed before.
+    print(f"\n{style}{rule}{Colors.RESET}")
+    print(f"{style}{text}{Colors.RESET}")
+    print(f"{style}{rule}{Colors.RESET}")
+
+
+def print_success(text: str) -> None:
+    """Print *text* in green, prefixed with a check mark."""
+    print(Colors.GREEN, "✓ ", text, Colors.RESET, sep="")
+
+
+def print_warning(text: str) -> None:
+    """Print *text* in yellow, prefixed with a warning sign."""
+    print(Colors.YELLOW, "⚠ ", text, Colors.RESET, sep="")
+
+
+def print_error(text: str) -> None:
+    """Print *text* in red, prefixed with a cross mark."""
+    print(Colors.RED, "✗ ", text, Colors.RESET, sep="")
+
+
+def print_info(text: str) -> None:
+    """Print *text* in cyan, prefixed with an information sign."""
+    print(Colors.CYAN, "ℹ ", text, Colors.RESET, sep="")
+
+
+def file_hash(content: bytes) -> str:
+    """Return the hexadecimal SHA256 digest of *content*."""
+    digest = hashlib.sha256()
+    digest.update(content)
+    return digest.hexdigest()
+
+
+def normalize_record_file(content: str) -> set[str]:
+    """
+    Normalize RECORD file for comparison.
+
+    RECORD files contain hashes and sizes that may differ between builds,
+    so only the first column (the file path) of every row is kept.
+
+    RECORD is a CSV file: a path that contains a comma is written quoted.
+    The rows are therefore parsed with the ``csv`` module rather than split
+    on ``","``, which would truncate such paths. Blank lines are skipped,
+    whatever their line ending.
+
+    Args:
+        content: Decoded text of the RECORD file.
+
+    Returns:
+        The set of file paths listed in the RECORD file.
+    """
+    import csv
+    import io
+
+    # newline="" hands the raw line endings to the csv reader, as the csv
+    # documentation recommends, so "\r\n"-terminated rows parse cleanly.
+    rows = csv.reader(io.StringIO(content.strip(), newline=""))
+    # The reader yields an empty list for a blank line; skip those.
+    return {row[0] for row in rows if row}
+
+
+def normalize_wheel_file(content: str) -> dict[str, str]:
+    """
+    Normalize WHEEL metadata file for comparison.
+
+    Ignore fields that are expected to differ between build tools:
+    - Generator (build tool name)
+    - Build (build number, if present)
+
+    A key may appear more than once (a WHEEL file can carry several ``Tag``
+    lines). All values of a repeated key are kept, sorted, and joined with
+    ``", "`` so that no value is silently dropped and the order in which a
+    build tool emits them does not matter. A key that appears once maps to
+    its single value unchanged.
+
+    Args:
+        content: Decoded text of the WHEEL file.
+
+    Returns:
+        Mapping of field name to its (possibly joined) value, without the
+        ignored fields.
+    """
+    ignore_keys = {"Generator", "Build"}
+    collected: dict[str, list[str]] = {}
+    for line in content.strip().split("\n"):
+        key, separator, value = line.partition(":")
+        if not separator:
+            # Not a "Key: value" line.
+            continue
+        key = key.strip()
+        if key not in ignore_keys:
+            collected.setdefault(key, []).append(value.strip())
+    return {key: ", ".join(sorted(values)) for key, values in collected.items()}
+
+
+def normalize_metadata_file(content: str) -> dict[str, list[str]]:
+    """
+    Normalize METADATA file for comparison.
+
+    The file is a block of ``Key: value`` headers, optionally followed by a
+    blank line and a free-form description body.
+
+    Headers: every occurrence of a key contributes one entry to that key's
+    list; indented continuation lines are stripped and joined to the entry
+    with single spaces. Header lines that are neither indented nor contain
+    a colon are ignored.
+
+    Body: everything after the first blank line is stored under the
+    synthetic key ``"Description-Body"``, one entry per blank-line-separated
+    paragraph. Body lines are never interpreted as headers or continuations,
+    even when they contain a colon or start with whitespace.
+
+    Args:
+        content: Decoded text of the METADATA file. ``\\r\\n`` line endings
+            are accepted.
+
+    Returns:
+        Mapping of field name to the sorted list of its values.
+    """
+    result: dict[str, list[str]] = {}
+    current_key = None
+    current_value: list[str] = []
+    in_body = False
+
+    # Normalize CRLF first, otherwise the blank separator line would be
+    # "\r" and the end of the headers would never be detected.
+    for line in content.replace("\r\n", "\n").split("\n"):
+        if in_body:
+            if line == "":
+                # Paragraph break: store the paragraph collected so far.
+                result.setdefault(current_key, []).append(" ".join(current_value))
+                current_value = []
+            else:
+                current_value.append(line)
+        elif line.startswith((" ", "\t")):
+            # Continuation of previous field
+            if current_key:
+                current_value.append(line.strip())
+        elif ":" in line:
+            # Save previous field
+            if current_key:
+                result.setdefault(current_key, []).append(" ".join(current_value))
+
+            # Start new field
+            key, value = line.split(":", 1)
+            current_key = key.strip()
+            current_value = [value.strip()]
+        elif line == "":
+            # End of headers, start of description
+            if current_key:
+                result.setdefault(current_key, []).append(" ".join(current_value))
+            current_key = "Description-Body"
+            current_value = []
+            in_body = True
+
+    # Save last field (or last body paragraph)
+    if current_key:
+        result.setdefault(current_key, []).append(" ".join(current_value))
+
+    # Sort multi-value fields for consistent comparison
+    for key in result:
+        result[key] = sorted(result[key])
+
+    return result
+
+
+def extract_wheel(wheel_path: Path, extract_dir: Path) -> dict[str, bytes]:
+    """
+    Read every file stored in a wheel into memory.
+
+    Directory entries (archive members whose name ends with ``/``) are
+    skipped: some tools write them and others do not, and they carry no
+    content, so including them would only produce spurious file-list
+    differences.
+
+    Args:
+        wheel_path: Path to the ``.whl`` (zip) archive.
+        extract_dir: Unused. Nothing is written to disk; the parameter is
+            kept so existing callers keep working.
+
+    Returns:
+        Mapping of archive member name to its raw bytes.
+
+    Raises:
+        zipfile.BadZipFile: If *wheel_path* is not a valid zip archive.
+    """
+    with zipfile.ZipFile(wheel_path, "r") as zf:
+        return {
+            info.filename: zf.read(info) for info in zf.infolist() if not info.is_dir()
+        }
+
+
+def compare_file_lists(
+    files1: dict[str, bytes], files2: dict[str, bytes], name1: str, name2: str
+) -> tuple[set[str], set[str], set[str]]:
+    """
+    Partition the member names of two wheels.
+
+    ``name1`` and ``name2`` are accepted but not used.
+
+    Returns:
+        A 3-tuple ``(only_in_1, only_in_2, common)`` of name sets.
+    """
+    names_a = set(files1)
+    names_b = set(files2)
+    return (
+        names_a.difference(names_b),
+        names_b.difference(names_a),
+        names_a.intersection(names_b),
+    )
+
+
+def compare_python_files(content1: bytes, content2: bytes, filename: str) -> list[str]:
+    """
+    Diff two file contents as UTF-8 text, treating CRLF and LF as equal.
+
+    If either input is not valid UTF-8 the contents are compared as raw
+    bytes instead. ``filename`` is accepted but not used.
+
+    Returns:
+        An empty list when the contents match (after converting ``\\r\\n``
+        to ``\\n`` for text), ``["Binary content differs"]`` for differing
+        non-UTF-8 contents, otherwise the lines of a unified diff labelled
+        ``wheel1`` / ``wheel2``.
+    """
+    try:
+        decoded = [blob.decode("utf-8") for blob in (content1, content2)]
+    except UnicodeDecodeError:
+        # Not text: fall back to a byte-for-byte comparison.
+        return [] if content1 == content2 else ["Binary content differs"]
+
+    old_lines, new_lines = (text.replace("\r\n", "\n").split("\n") for text in decoded)
+    if old_lines == new_lines:
+        return []
+
+    delta = difflib.unified_diff(
+        old_lines, new_lines, fromfile="wheel1", tofile="wheel2", lineterm=""
+    )
+    return list(delta)
+
+
+def is_dist_info_file(filename: str) -> bool:
+    """Return True when *filename* contains a ``.dist-info/`` path component."""
+    return filename.find(".dist-info/") >= 0
+
+
+def get_dist_info_type(filename: str) -> str | None:
+    """
+    Classify a well-known metadata file of the wheel's ``.dist-info`` directory.
+
+    Only files located directly inside a top-level ``*.dist-info/`` directory
+    are classified. A package file that merely shares a name with a metadata
+    file (for example ``pkg/data/METADATA``) is not, so it gets the ordinary
+    content comparison instead of the lenient metadata comparison.
+
+    Args:
+        filename: Archive member name, using ``/`` as separator.
+
+    Returns:
+        ``"RECORD"``, ``"WHEEL"``, ``"METADATA"``, ``"entry_points"`` or
+        ``"top_level"`` for the corresponding file, otherwise ``None``.
+    """
+    parent, _, basename = filename.rpartition("/")
+    # A "/" left in the parent means the file is nested deeper than the
+    # top-level dist-info directory.
+    if "/" in parent or not parent.endswith(".dist-info"):
+        return None
+
+    kinds = {
+        "RECORD": "RECORD",
+        "WHEEL": "WHEEL",
+        "METADATA": "METADATA",
+        "entry_points.txt": "entry_points",
+        "top_level.txt": "top_level",
+    }
+    return kinds.get(basename)
+
+
+def compare_wheels(wheel1_path: Path, wheel2_path: Path, verbose: bool = False) -> bool:
+    """
+    Compare two wheel files and print a colorized report to stdout.
+
+    The wheels are considered different when any of the following holds:
+    - a file is present in only one of them;
+    - the normalized WHEEL files differ in any field;
+    - the normalized METADATA files differ in any field other than
+      ``Description-Body``;
+    - any other common file differs in content (text files that differ only
+      in CRLF vs LF line endings are reported but not counted as different);
+    - a RECORD, WHEEL or METADATA file cannot be parsed.
+
+    Differences between the file lists of the two RECORD files are printed
+    but do not affect the result.
+
+    Args:
+        wheel1_path: Path to the first wheel.
+        wheel2_path: Path to the second wheel.
+        verbose: When True, print up to 50 diff lines per differing file.
+
+    Returns:
+        True if wheels are equivalent, False otherwise (including when a
+        wheel is missing or is not a readable zip archive).
+    """
+    print_header("Comparing Wheels")
+    print_info(f"Wheel 1: {wheel1_path}")
+    print_info(f"Wheel 2: {wheel2_path}")
+
+    # Validate wheels exist
+    if not wheel1_path.exists():
+        print_error(f"Wheel 1 not found: {wheel1_path}")
+        return False
+    if not wheel2_path.exists():
+        print_error(f"Wheel 2 not found: {wheel2_path}")
+        return False
+
+    # Extract wheels
+    print_header("Extracting Wheels")
+    try:
+        # The scratch directories are removed when the ``with`` block exits,
+        # so repeated runs do not leave temporary directories behind.
+        with tempfile.TemporaryDirectory() as dir1, tempfile.TemporaryDirectory() as dir2:
+            files1 = extract_wheel(wheel1_path, Path(dir1))
+            files2 = extract_wheel(wheel2_path, Path(dir2))
+    except zipfile.BadZipFile as e:
+        # Report an unreadable archive like any other failure instead of
+        # aborting with a traceback.
+        print_error(f"Could not read wheel archive: {e}")
+        return False
+    print_success(f"Wheel 1 contains {len(files1)} files")
+    print_success(f"Wheel 2 contains {len(files2)} files")
+
+    # Compare file lists
+    print_header("Comparing File Lists")
+    only_in_1, only_in_2, common = compare_file_lists(
+        files1, files2, wheel1_path.name, wheel2_path.name
+    )
+
+    all_equivalent = True
+
+    if only_in_1:
+        print_error("Files only in wheel 1:")
+        for f in sorted(only_in_1):
+            print(f"    - {f}")
+        all_equivalent = False
+
+    if only_in_2:
+        print_error("Files only in wheel 2:")
+        for f in sorted(only_in_2):
+            print(f"    - {f}")
+        all_equivalent = False
+
+    if not only_in_1 and not only_in_2:
+        print_success(f"Both wheels contain the same {len(common)} files")
+
+    # Compare file contents
+    print_header("Comparing File Contents")
+
+    differing_files = []
+    identical_files = 0
+    # Text files whose bytes differ but whose lines match once CRLF is
+    # normalized; tracked so every common file is accounted for in the report.
+    line_ending_only = 0
+    metadata_diffs = []
+
+    for filename in sorted(common):
+        content1 = files1[filename]
+        content2 = files2[filename]
+
+        dist_info_type = get_dist_info_type(filename)
+
+        if dist_info_type == "RECORD":
+            # Special handling for RECORD files
+            try:
+                record1 = normalize_record_file(content1.decode("utf-8"))
+                record2 = normalize_record_file(content2.decode("utf-8"))
+                if record1 != record2:
+                    only_in_record1 = record1 - record2
+                    only_in_record2 = record2 - record1
+                    metadata_diffs.append(("RECORD", only_in_record1, only_in_record2))
+                else:
+                    identical_files += 1
+            except Exception as e:
+                differing_files.append((filename, [f"Error parsing RECORD: {e}"]))
+
+        elif dist_info_type == "WHEEL":
+            # Special handling for WHEEL metadata
+            try:
+                wheel1 = normalize_wheel_file(content1.decode("utf-8"))
+                wheel2 = normalize_wheel_file(content2.decode("utf-8"))
+                if wheel1 != wheel2:
+                    diff_keys = set(wheel1.keys()) ^ set(wheel2.keys())
+                    diff_values = {
+                        k for k in wheel1.keys() & wheel2.keys() if wheel1[k] != wheel2[k]
+                    }
+                    if diff_keys or diff_values:
+                        metadata_diffs.append(("WHEEL", wheel1, wheel2))
+                else:
+                    identical_files += 1
+                # Always show generator info
+                gen1 = "unknown"
+                gen2 = "unknown"
+                for line in content1.decode("utf-8").split("\n"):
+                    if line.startswith("Generator:"):
+                        gen1 = line.split(":", 1)[1].strip()
+                for line in content2.decode("utf-8").split("\n"):
+                    if line.startswith("Generator:"):
+                        gen2 = line.split(":", 1)[1].strip()
+                print_info(f"Wheel 1 Generator: {gen1}")
+                print_info(f"Wheel 2 Generator: {gen2}")
+            except Exception as e:
+                differing_files.append((filename, [f"Error parsing WHEEL: {e}"]))
+
+        elif dist_info_type == "METADATA":
+            # Special handling for METADATA
+            try:
+                meta1 = normalize_metadata_file(content1.decode("utf-8"))
+                meta2 = normalize_metadata_file(content2.decode("utf-8"))
+                if meta1 != meta2:
+                    metadata_diffs.append(("METADATA", meta1, meta2))
+                else:
+                    identical_files += 1
+            except Exception as e:
+                differing_files.append((filename, [f"Error parsing METADATA: {e}"]))
+
+        elif content1 == content2:
+            identical_files += 1
+
+        else:
+            # Content differs - try to show diff for text files
+            diff = compare_python_files(content1, content2, filename)
+            if diff:
+                differing_files.append((filename, diff))
+            else:
+                # Bytes differ but the normalized text does not.
+                line_ending_only += 1
+
+    print_success(f"{identical_files} files are identical")
+    if line_ending_only:
+        print_info(f"{line_ending_only} files differ only in line endings")
+
+    if metadata_diffs:
+        print_header("Metadata Differences (Expected)")
+        for meta_type, data1, data2 in metadata_diffs:
+            if meta_type == "RECORD":
+                # Informational only: does not change the verdict.
+                print_warning("RECORD file differences (file lists):")
+                if data1:
+                    print(f"    Only in wheel 1: {data1}")
+                if data2:
+                    print(f"    Only in wheel 2: {data2}")
+            elif meta_type == "WHEEL":
+                print_warning("WHEEL metadata differences (ignoring Generator):")
+                all_keys = set(data1.keys()) | set(data2.keys())
+                for key in sorted(all_keys):
+                    v1 = data1.get(key, "<missing>")
+                    v2 = data2.get(key, "<missing>")
+                    if v1 != v2:
+                        print(f"    {key}: '{v1}' vs '{v2}'")
+                        all_equivalent = False
+            elif meta_type == "METADATA":
+                print_warning("METADATA differences:")
+                all_keys = set(data1.keys()) | set(data2.keys())
+                for key in sorted(all_keys):
+                    v1 = data1.get(key, ["<missing>"])
+                    v2 = data2.get(key, ["<missing>"])
+                    if v1 != v2:
+                        print(f"    {key}:")
+                        print(f"        Wheel 1: {v1}")
+                        print(f"        Wheel 2: {v2}")
+                        # Only mark as non-equivalent for important fields
+                        if key not in {"Description-Body"}:
+                            all_equivalent = False
+
+    if differing_files:
+        print_header("Content Differences")
+        all_equivalent = False
+        for filename, diff in differing_files:
+            print_error(f"File differs: {filename}")
+            if verbose and diff:
+                print("    Diff:")
+                for line in diff[:50]:  # Limit output
+                    print(f"    {line}")
+                if len(diff) > 50:
+                    print(f"    ... ({len(diff) - 50} more lines)")
+
+    # Summary
+    print_header("Summary")
+    if all_equivalent:
+        print_success("Wheels are functionally equivalent!")
+        print_info("(Ignoring expected differences in Generator, timestamps, etc.)")
+        return True
+    else:
+        print_error("Wheels have meaningful differences!")
+        return False
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Parses the two wheel paths and the optional ``-v/--verbose`` flag, runs
+    the comparison, and exits with status 0 when the wheels are equivalent
+    and 1 otherwise.
+    """
+    cli = argparse.ArgumentParser(
+        description="Compare two wheel files for equivalence",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    cli.add_argument("wheel1", type=Path, help="Path to first wheel file")
+    cli.add_argument("wheel2", type=Path, help="Path to second wheel file")
+    cli.add_argument(
+        "-v", "--verbose", action="store_true", help="Show detailed diffs for differing files"
+    )
+    options = cli.parse_args()
+
+    if compare_wheels(options.wheel1, options.wheel2, options.verbose):
+        sys.exit(0)
+    sys.exit(1)
+
+
+# Run the comparison only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+    main()