bug: ArrayDistinct returns float/NaN instead of int/None when using Spark Connect #11860

@davidlghellin

### What happened?

When using `array.unique()` (`ArrayDistinct`) with the PySpark backend via Spark Connect,
the result contains floats and NaN instead of integers and None.

Actual result:

```python
[1.0, 3.0]           # floats instead of ints
[nan, 1.0, 3.0]      # NaN instead of None
```

Expected result:

```python
[1, 3]               # integers
[None, 1, 3]         # None for nulls
```

The same query executed directly via PySpark (without Ibis) returns the correct types.
This suggests the issue is in how Ibis processes the results from Spark Connect,
not in the Spark Connect backend itself.
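A minimal sketch of the failing expression (assuming a Spark Connect server is reachable through `SPARK_REMOTE`, as in the full script below):

```python
import ibis

# Assumes SPARK_REMOTE is exported, e.g. SPARK_REMOTE="sc://localhost:50051";
# ibis.pyspark.connect() then attaches to that Spark Connect server.
t = ibis.memtable({"a": [[1, 3, None, 3]]})
con = ibis.pyspark.connect()
print(con.execute(t.a.unique()))
# observed: [nan, 1.0, 3.0] -- expected: [None, 1, 3] (element order may vary)
```

The full reproduction script and its output follow under "Relevant log output".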

### What version of ibis are you using?

11

### What backend(s) are you using, if any?

pyspark

### Relevant log output

```python
#!/usr/bin/env python3
"""
Reproduction script for Ibis ArrayDistinct type conversion bug.

This script demonstrates that:
1. PySpark directly returns correct types (int, None)
2. Ibis returns incorrect types (float, NaN)

Usage:
    # Start Sail server first:
    # hatch run test-ibis:scripts/spark-tests/run-server.sh

    # Then run this script:
    SPARK_REMOTE="sc://localhost:50051" python scripts/ibis_array_distinct_bug.py
"""

import os
import sys

# Ensure SPARK_REMOTE is set in the environment so PySpark Connect picks it up
SPARK_REMOTE = os.environ.setdefault("SPARK_REMOTE", "sc://localhost:50051")
print(f"Using SPARK_REMOTE: {SPARK_REMOTE}")
print("=" * 70)

# Test data with nulls in arrays
TEST_DATA = [
    ([1, 3, 3],),
    ([1, 3, None, 3],),
    ([42, 42],),
    ([],),
    ([None],),
    (None,),
]

EXPECTED = [
    [1, 3],
    [None, 1, 3],  # or [1, 3, None] - order may vary
    [42],
    [],
    [None],
    None,
]


def test_pyspark_direct():
    """Test using PySpark directly - this works correctly."""
    print("\n" + "=" * 70)
    print("TEST 1: PySpark Direct (Expected: PASS)")
    print("=" * 70)

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array_distinct
    from pyspark.sql.types import ArrayType, LongType, StructField, StructType

    spark = SparkSession.builder.remote(SPARK_REMOTE).getOrCreate()

    schema = StructType([
        StructField("a", ArrayType(LongType(), containsNull=True), nullable=True)
    ])
    df = spark.createDataFrame(TEST_DATA, schema)

    print("\nInput DataFrame:")
    df.show()

    result = df.select(array_distinct("a").alias("unique"))
    print("Result DataFrame:")
    result.show()

    rows = result.collect()
    all_correct = True

    print("\nType checking:")
    for i, row in enumerate(rows):
        val = row["unique"]
        if val is None:
            print(f"  Row {i}: None (correct)")
            continue

        types_correct = True
        for elem in val:
            if elem is None:
                continue  # None is correct
            if not isinstance(elem, int):
                types_correct = False
                print(f"  Row {i}: {val} - WRONG TYPE: {type(elem).__name__} (expected int)")
                break

        if types_correct:
            print(f"  Row {i}: {val} - OK (int/None types)")
        else:
            all_correct = False

    spark.stop()

    if all_correct:
        print("\n✅ PySpark Direct: PASSED - All types correct (int, None)")
    else:
        print("\n❌ PySpark Direct: FAILED - Wrong types detected")

    return all_correct


def test_ibis_memtable():
    """Test using Ibis memtable - this shows the bug."""
    print("\n" + "=" * 70)
    print("TEST 2: Ibis memtable (Expected: FAIL - demonstrates bug)")
    print("=" * 70)

    import ibis
    import math

    # Create memtable with same data
    input_data = {"a": [[1, 3, 3], [1, 3, None, 3], [42, 42], [], [None], None]}
    t = ibis.memtable(input_data)

    print(f"\nTable schema: {t.schema()}")

    # Connect and execute (PySpark reads SPARK_REMOTE from the environment)
    con = ibis.pyspark.connect()
    result = con.execute(t.a.unique())

    print("\nResult:")
    print(result)

    all_correct = True
    print("\nType checking:")
    for i, val in enumerate(result):
        if val is None:
            print(f"  Row {i}: None (correct)")
            continue

        types_correct = True
        for elem in val:
            if elem is None:
                continue  # None is correct
            if isinstance(elem, float):
                if math.isnan(elem):
                    print(f"  Row {i}: {val} - WRONG: contains NaN (expected None)")
                else:
                    print(f"  Row {i}: {val} - WRONG TYPE: float (expected int)")
                types_correct = False
                break

        if types_correct:
            print(f"  Row {i}: {val} - OK")
        else:
            all_correct = False

    con.disconnect()

    if all_correct:
        print("\n✅ Ibis memtable: PASSED")
    else:
        print("\n❌ Ibis memtable: FAILED - Wrong types (float/NaN instead of int/None)")
        print("   This demonstrates the bug in Ibis when using Spark Connect.")

    return all_correct


def main():
    print("Ibis ArrayDistinct Type Conversion Bug - Reproduction Script")
    print("=" * 70)

    pyspark_ok = test_pyspark_direct()
    ibis_ok = test_ibis_memtable()

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"  PySpark Direct: {'✅ PASS' if pyspark_ok else '❌ FAIL'}")
    print(f"  Ibis memtable:  {'✅ PASS' if ibis_ok else '❌ FAIL'}")

    if pyspark_ok and not ibis_ok:
        print("\n📋 CONCLUSION: Bug is in Ibis, not in the Spark Connect backend.")
        print("   Ibis converts int→float and None→NaN when processing results.")

    return 0 if (pyspark_ok and ibis_ok) else 1


if __name__ == "__main__":
    sys.exit(main())
```

Output:

```
Using SPARK_REMOTE: sc://localhost:50051
======================================================================
Ibis ArrayDistinct Type Conversion Bug - Reproduction Script
======================================================================

======================================================================
TEST 1: PySpark Direct (Expected: PASS)
======================================================================

Input DataFrame:
+---------------+
|              a|
+---------------+
|      [1, 3, 3]|
|[1, 3, NULL, 3]|
|       [42, 42]|
|             []|
|         [NULL]|
|           NULL|
+---------------+

Result DataFrame:
+------------+
|      unique|
+------------+
|      [1, 3]|
|[NULL, 1, 3]|
|        [42]|
|          []|
|      [NULL]|
|        NULL|
+------------+


Type checking:
  Row 0: [1, 3] - OK (int/None types)
  Row 1: [None, 1, 3] - OK (int/None types)
  Row 2: [42] - OK (int/None types)
  Row 3: [] - OK (int/None types)
  Row 4: [None] - OK (int/None types)
  Row 5: None (correct)

✅ PySpark Direct: PASSED - All types correct (int, None)

======================================================================
TEST 2: Ibis memtable (Expected: FAIL - demonstrates bug)
======================================================================

Table schema: ibis.Schema {
  a  array<int64>
}

Result:
0         [1.0, 3.0]
1    [nan, 1.0, 3.0]
2             [42.0]
3                 []
4              [nan]
5               None
Name: ArrayDistinct(a), dtype: object

Type checking:
  Row 0: [1.0, 3.0] - WRONG TYPE: float (expected int)
  Row 1: [nan, 1.0, 3.0] - WRONG: contains NaN (expected None)
  Row 2: [42.0] - WRONG TYPE: float (expected int)
  Row 3: [] - OK
  Row 4: [nan] - WRONG: contains NaN (expected None)
  Row 5: None (correct)

❌ Ibis memtable: FAILED - Wrong types (float/NaN instead of int/None)
   This demonstrates the bug in Ibis when using Spark Connect.

======================================================================
SUMMARY
======================================================================
  PySpark Direct: ✅ PASS
  Ibis memtable:  ❌ FAIL

📋 CONCLUSION: Bug is in Ibis, not in the Spark Connect backend.
   Ibis converts int→float and None→NaN when processing results.
```
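
A plausible mechanism, offered as an assumption rather than something the output above proves: Spark Connect ships results to the client as Arrow record batches, and when an Arrow `list<int64>` column containing nulls is converted to pandas, the flattened int64 values cannot represent missing entries, so they are upcast to float64 with NaN. The sketch below reproduces that coercion in isolation, without Spark or Ibis:

```python
import pyarrow as pa

# Illustrative only: convert a list<int64> Arrow column with nulls to pandas.
col = pa.array([[1, 3, 3], [1, 3, None, 3], [42, 42]], type=pa.list_(pa.int64()))
print(col.to_pandas())
# The int64 values may come back as float64 with nan for nulls --
# the same float/NaN shape as the Ibis result above, including the
# null-free row [42, 42] becoming [42.0, 42.0].
```

If that is the cause, a possible interim workaround is to rebuild each element with a nullable integer dtype on the pandas side, e.g. `pd.array(values, dtype="Int64")`, until Ibis maps the result through its declared `array<int64>` schema.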

### Code of Conduct

- [x] I agree to follow this project's Code of Conduct
