From 2094c3fec17730d945dd5bb303ef814907bf3817 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 11:15:51 -0700 Subject: [PATCH 01/42] Update Series and DataFrame constructors to handle lazy Index objects, add tests for the same --- .../snowpark/modin/pandas/dataframe.py | 78 ++- src/snowflake/snowpark/modin/pandas/series.py | 33 +- .../compiler/snowflake_query_compiler.py | 115 ++++ .../test_df_series_creation_with_index.py | 525 ++++++++++++++++++ 4 files changed, 729 insertions(+), 22 deletions(-) create mode 100644 tests/integ/modin/index/test_df_series_creation_with_index.py diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index a6850941fa..b2787aa6f5 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -155,19 +155,30 @@ def __init__( # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + from snowflake.snowpark.modin.plugin.extensions.index import Index self._siblings = [] + if isinstance(index, DataFrame): + raise ValueError("Index data must be 1-dimensional") + # Engine.subscribe(_update_engine) + if isinstance(data, Index): + # If the data is an Index object, we need to convert it to a DataFrame to make sure + # that the values are in the correct format -- as a data column, not an index column. + # Additionally, if an index is provided, converting it to an Index object ensures that + # its values are an index column. + query_compiler = data.to_frame(index=False, name=data.name)._query_compiler + if index is not None: + index = index if isinstance(index, Index) else Index(index) + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) + if isinstance(data, (DataFrame, Series)): self._query_compiler = data._query_compiler.copy() - if index is not None and any(i not in data.index for i in index): - ErrorMessage.not_implemented( - "Passing non-existant columns or index values to constructor not" - + " yet implemented." - ) # pragma: no cover if isinstance(data, Series): - # We set the column name if it is not in the provided Series + # We set the column name if it is not in the provided Series `data`. if data.name is None: self.columns = [0] if columns is None else columns # If the columns provided are not in the named Series, pandas clears @@ -177,22 +188,61 @@ def __init__( self.__constructor__(columns=columns) )._query_compiler if index is not None: + # The `index` parameter is used to select the rows from `data` that will be in the resultant + # DataFrame. If a value in `index` is not present in `data`'s index, it will be filled with a + # NaN value. + # 1. The `index` is converted to an Index object so that the index values are in an index column. + index = index if isinstance(index, Index) else Index(index) + # 2. A right outer join is performed between `data` and `index` to create a Series object where + # any index values in `data`'s index that are not in `index` are filled with NaN. + data = Series( + query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( + index._query_compiler + ), + name=0 if data.name is None else data.name, + ) + # 3. Perform .loc[] on `data` to select the rows that are in the `index`. self._query_compiler = data.loc[index]._query_compiler + elif columns is None and index is None: data._add_sibling(self) + else: - if columns is not None and any(i not in data.columns for i in columns): - ErrorMessage.not_implemented( - "Passing non-existant columns or index values to constructor not" - + " yet implemented." - ) # pragma: no cover - if index is None: - index = slice(None) + # The `columns` parameter is used to select the columns from `data` that will be in the resultant + # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a + # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. + extra_columns = None if columns is None: + # In case `columns` is not provided, `columns` is set to slice(None) to select all columns. columns = slice(None) + else: + extra_columns = [col for col in columns if col not in data.columns] + + # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + if index is None: + # In case `index` is not provided, `index` is set to slice(None) to select all rows. + index = slice(None) + data = DataFrame( + query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( + extra_columns=extra_columns + ) + ) + else: + # The `index` is converted to an Index object so that the index values are in an index column. + index = index if isinstance(index, Index) else Index(index) + # A right outer join is performed between `data` and `index` to create a DataFrame object where any + # index values in `data`'s index that are not in `index` are filled with NaN. + data = DataFrame( + query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( + index._query_compiler, + extra_columns=extra_columns, + ) + ) + # 3. Perform .loc[] on `data` to select the rows and columns that are in `index` and `columns`. self._query_compiler = data.loc[index, columns]._query_compiler - # Check type of data and use appropriate constructor + # Check the type of data and use the appropriate constructor elif query_compiler is None: distributed_frame = from_non_pandas(data, index, columns, dtype) if distributed_frame is not None: diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index e99e9cc89f..4f5d7a8a23 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -129,19 +129,36 @@ def __init__( # modified: # Engine.subscribe(_update_engine) + from snowflake.snowpark.modin.plugin.extensions.index import Index # Convert lazy index to Series without pulling the data to client. - if isinstance(data, pd.Index): - query_compiler = data.to_series(index=index, name=name)._query_compiler - query_compiler = query_compiler.reset_index(drop=True) + if isinstance(data, Index): + # If the data is an Index object, we need to convert it to a DataFrame to make sure + # that the values are in the correct format -- as a data column, not an index column. + # Additionally, if an index is provided, converting it to an Index object ensures that + # its values are an index column. + query_compiler = data.to_frame(index=False, name=data.name)._query_compiler + if index is not None: + index = index if isinstance(index, Index) else Index(index) + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: - if any(i not in data.index for i in index): - ErrorMessage.not_implemented( - "Passing non-existent columns or index values to constructor " - + "not yet implemented." - ) # pragma: no cover + # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + # 1. The `index` is converted to an Index object so that the index values are in an index column. + index = index if isinstance(index, Index) else Index(index) + # 2. A right outer join is performed between `data` and `index` to create a Series object where any + # index values in `data`'s index that are not in `index` are filled with NaN. + data = Series( + query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( + index._query_compiler + ), + name=data.name, + ) + # 3. Perform .loc[] on `data` to select the rows that are in `index`. query_compiler = data.loc[index]._query_compiler if query_compiler is None: # Defaulting to pandas diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 108b594faf..577efe500e 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -17342,3 +17342,118 @@ def compare( """ return result + + def create_qc_with_index_data_and_qc_index( + self, index_qc: "SnowflakeQueryCompiler" + ) -> "SnowflakeQueryCompiler": + """ + This is a helper function for creating a DataFrame/Series where the data is an Index + and an index is provided. + Before this method is called, the provided index is converted to an Index object; + the query compilers of the data and index are then joined. + + Parameters + ---------- + index_qc : SnowflakeQueryCompiler + The query compiler of the index to be joined with the data. + + Returns + ------- + SnowflakeQueryCompiler + A new query compiler with the data and index joined. + """ + self_frame = self._modin_frame.ensure_row_position_column() + other_frame = index_qc._modin_frame.ensure_row_position_column() + + new_internal_frame, _ = join_utils.join( + self_frame, + other_frame, + how="left", + left_on=[self_frame.row_position_snowflake_quoted_identifier], + right_on=[other_frame.row_position_snowflake_quoted_identifier], + inherit_join_index=InheritJoinIndex.FROM_RIGHT, + ) + + return SnowflakeQueryCompiler(new_internal_frame) + + def create_qc_with_data_and_index_joined_on_index( + self, + index_qc: Optional["SnowflakeQueryCompiler"] = None, + extra_columns: Optional[List[Hashable]] = None, + ) -> "SnowflakeQueryCompiler": + """ + This is a helper function for creating a DataFrame/Series where the data is a DataFrame/Series object. + This is a special case since only the values where the index value matches in the `data` and `index` provided + take on an actual value from the given `data`. Otherwise, they take on a NaN value. + + For instance, + + >>> data = pd.Series(["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], name="index series name") + >>> index = pd.Index([1, 2, 3, 4], name="some name") + >>> df = pd.DataFrame(data=data, index=index) + >>> df # doctest: +SKIP + index series name + some name + 1 NaN + 2 NaN + 3 C + 4 D + + Notice how only the data for index values 3 and 4 have an actual value while 1 and 2 have a NaN value. + 3 and 4 are values present in the index of the `data` and `index` provided. 1 and 2 are not present. + + Parameters + ---------- + index_qc : SnowflakeQueryCompiler, default None + The query compiler of the index to be joined with the data. If no query compiler is provided, + skip this join operation. + extra_columns : list of hashable, default None + If the DataFrame being created has new columns that are not a part of the data, they can be passed here + and appended as NaN columns. + + Returns + ------- + SnowflakeQueryCompiler + A new query compiler with the data and index joined. + """ + self_frame = self._modin_frame + + if extra_columns: + # Append the new columns to the data's internal frame. + new_snowflake_quoted_identifiers = self._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( + pandas_labels=extra_columns, + excluded=self_frame.data_column_snowflake_quoted_identifiers, + ) + new_ordered_frame = append_columns( + self_frame.ordered_dataframe, + new_snowflake_quoted_identifiers, + [pandas_lit(np.nan)] * len(extra_columns), + ) + self_frame = InternalFrame.create( + ordered_dataframe=new_ordered_frame, + data_column_pandas_labels=self_frame.data_column_pandas_labels + + extra_columns, + data_column_snowflake_quoted_identifiers=self_frame.data_column_snowflake_quoted_identifiers + + new_snowflake_quoted_identifiers, + data_column_pandas_index_names=self_frame.data_column_pandas_index_names, + index_column_pandas_labels=self_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=self_frame.index_column_snowflake_quoted_identifiers, + data_column_types=None, + index_column_types=None, + ) + + if index_qc is None: + new_internal_frame = self._modin_frame + else: + # Join the index and data internal frames. + other_frame = index_qc._modin_frame + new_internal_frame, _ = join_utils.join( + other_frame, + self_frame, + how="outer", + left_on=other_frame.index_column_snowflake_quoted_identifiers, + right_on=self_frame.index_column_snowflake_quoted_identifiers, + inherit_join_index=InheritJoinIndex.FROM_LEFT, + ) + + return SnowflakeQueryCompiler(new_internal_frame) diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py new file mode 100644 index 0000000000..9c3bae1b22 --- /dev/null +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -0,0 +1,525 @@ +# +# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. +# +import modin.pandas as pd +import pandas as native_pd +import pytest + +import snowflake.snowpark.modin.plugin # noqa: F401 +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.utils import assert_frame_equal, assert_series_equal + + +@pytest.mark.parametrize( + "data", [[1, 2, 3, 4], list(range(250)), ["A", None, 2.3, 1], []] +) +@sql_count_checker(query_count=1) +def test_create_df_with_index_as_data(data): + """ + Creating a DataFrame where the data is an Index. + """ + # Create Snowpark pandas DataFrame and native pandas DataFrame from an Index object. + native_idx = native_pd.Index(data, name="some name") + snow_idx = pd.Index(native_idx) + assert_frame_equal(pd.DataFrame(snow_idx), native_pd.DataFrame(native_idx)) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3, 4], list(range(250)), ["A", None, 2.3, 1], []] +) +@sql_count_checker(query_count=1) +def test_create_series_with_index_as_data(data): + """ + Creating a Series where the data is an Index. + """ + # Create Snowpark pandas Series and native pandas Series from an Index object. + native_idx = native_pd.Index(data, name="some name") + snow_idx = pd.Index(native_idx) + assert_series_equal(pd.Series(snow_idx), native_pd.Series(native_idx)) + + +@pytest.mark.parametrize( + "data, index", + [ + ([1, 2, 3, 4], ["A", "B", "C", "D"]), + (list(range(100)), list(range(200, 300))), + (["A", None, 2.3, 1], [None, "B", 0, 3.14]), + ([], []), + ], +) +@sql_count_checker(query_count=2) +def test_create_df_with_index_as_index(data, index): + """ + Creating a DataFrame where the index is an Index. + """ + # Two queries are issued: one when creating the DataFrame (the index is converted + # to a native pandas object), one when materializing the DataFrame for comparison. + # Create Snowpark pandas DataFrame and native pandas DataFrame with an Index object as the index. + native_idx = native_pd.Index(index, name="some name") + snow_idx = pd.Index(native_idx) + assert_frame_equal( + pd.DataFrame(data, index=snow_idx), + native_pd.DataFrame(data, index=native_idx), + check_dtype=False, + check_index_type=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "data, index", + [ + ([1, 2, 3, 4], ["A", "B", "C", "D"]), + (list(range(100)), list(range(100, 200))), + (["A", None, 2.3, 1], [None, "B", 0, 3.14]), + ([], []), + ], +) +@sql_count_checker(query_count=2) +def test_create_series_with_index_as_index(data, index): + """ + Creating a Series where the index is an Index. + """ + # Two queries are issued: one when creating the Series (the index is converted + # to a native pandas object), one when materializing the Series for comparison. + # Create Snowpark pandas Series and native pandas Series with an Index object as the index. + native_idx = native_pd.Index(index, name="some name") + snow_idx = pd.Index(native_idx) + assert_series_equal( + pd.Series(data, index=snow_idx), + native_pd.Series(data, index=native_idx), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + "data, index", + [ + ([1, 2, 3, 4], ["A", "B", "C", "D"]), + (list(range(250)), list(range(250, 500))), + (["A", None, 2.3, 1], [None, "B", 0, 3.14]), + ([], []), + ], +) +@sql_count_checker(query_count=1, join_count=1) +def test_create_df_with_index_as_data_and_index(data, index): + """ + Creating a DataFrame where the data is an Index and the index is also an Index. + """ + # Create Snowpark pandas DataFrame and native pandas DataFrame from Index objects. + native_idx_data = native_pd.Index(data, name="data name") + snow_idx_data = pd.Index(native_idx_data) + native_idx_index = native_pd.Index(index, name="index name") + snow_idx_index = pd.Index(native_idx_index) + assert_frame_equal( + pd.DataFrame(snow_idx_data, index=snow_idx_index), + native_pd.DataFrame(native_idx_data, index=native_idx_index), + ) + + +@pytest.mark.parametrize( + "data, index", + [ + ([1, 2, 3, 4], ["A", "B", "C", "D"]), + (list(range(100)), list(range(100, 200))), + (["A", None, 2.3, 1], [None, "B", 0, 3.14]), + ([], []), + ], +) +@sql_count_checker(query_count=1, join_count=1) +def test_create_series_with_index_as_data_and_index(data, index): + """ + Creating a Series where the data is an Index and the index is also an Index. + """ + # Create Snowpark pandas Series and native pandas Series from Index objects. + # TODO: Index is not being set at all. + native_idx_data = native_pd.Index(data, name="data name") + snow_idx_data = pd.Index(native_idx_data) + native_idx_index = native_pd.Index(index, name="index name") + snow_idx_index = pd.Index(native_idx_index) + assert_series_equal( + pd.Series(snow_idx_data, index=snow_idx_index), + native_pd.Series(native_idx_data, index=native_idx_index), + ) + + +@pytest.mark.parametrize( + "data, native_series", + [ + ( + [1, 2, 3, 4], + native_pd.Series( + ["A", "B", "C", "D"], + index=[1.1, 2.2, 3.3, 4.4], + name="index series name", + ), + ), + (list(range(100)), native_pd.Series(list(range(100, 200)))), + ( + ["A", None, 2.3, 1], + native_pd.Series([None, "B", 0, 3.14], name="mixed series as index"), + ), + ([], native_pd.Series([], name="empty series")), + ], +) +@sql_count_checker(query_count=1, join_count=1) +def test_create_df_with_index_as_data_and_series_as_index(data, native_series): + """ + Creating a DataFrame where the data is an Index and the index is a Series. + """ + snow_series = pd.Series(native_series) + native_index = native_pd.Index(data, name="index data name") + snow_index = pd.Index(native_index) + assert_frame_equal( + pd.DataFrame(snow_index, index=snow_series), + native_pd.DataFrame(native_index, index=native_series), + ) + + +@pytest.mark.parametrize( + "data, native_series", + [ + ( + [1, 2, 3, 4], + native_pd.Series( + ["A", "B", "C", "D"], + index=[1.1, 2.2, 3.3, 4.4], + name="index series name", + ), + ), + (list(range(100)), native_pd.Series(list(range(100, 200)))), + ( + ["A", None, 2.3, 1], + native_pd.Series([None, "B", 0, 3.14], name="mixed series as index"), + ), + ([], native_pd.Series([], name="empty series")), + ], +) +@sql_count_checker(query_count=1, join_count=1) +def test_create_series_with_index_as_data_and_series_as_index(data, native_series): + """ + Creating a Series where the data is an Index and the index is a Series. + """ + snow_series = pd.Series(native_series) + native_index = native_pd.Index(data, name="index data name") + snow_index = pd.Index(native_index) + assert_series_equal( + pd.Series(snow_index, index=snow_series), + native_pd.Series(native_index, index=native_series), + ) + + +@pytest.mark.parametrize( + "native_series, native_index", + [ + ( + native_pd.Series( + ["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], name="index series name" + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ), # some index values are missing + ( + native_pd.Series(list(range(100))), + native_pd.Index(list(range(-50, 100, 4)), name="skip numbers"), + ), # some index values are missing + ( + native_pd.Series( + [10, 20, 30, 40], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + name="mixed series as index", + ), + native_pd.Index(["B", 0, None, 3.14]), + ), # rearranged index values + ( + native_pd.Series(["A", "B", "C", "D", "E"], name="series"), + native_pd.Index([3, 4], name="index"), + ), # subset of index values + ( + native_pd.Series( + list(range(20)), index=native_pd.Index(list(range(20)), name=20) + ), + native_pd.Index(list(range(20))), + ), # all index values match + ( + native_pd.Series(["A", "V", "D", "R"]), + native_pd.Index([10, 20, 30, 40], name="none"), + ), # no index values match + ( + native_pd.Series([], name="empty series", dtype="int64"), + native_pd.Index([], name="empty index", dtype="int64"), + ), # empty series and index + ], +) +@sql_count_checker(query_count=1, join_count=2) +def test_create_df_with_series_as_data_and_index_as_index(native_series, native_index): + """ + Creating a DataFrame where the data is a Series and the index is an Index. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + snow_series = pd.Series(native_series) + snow_index = pd.Index(native_index) + assert_frame_equal( + pd.DataFrame(snow_series, index=snow_index), + native_pd.DataFrame(native_series, index=native_index), + ) + + +@pytest.mark.parametrize( + "native_series, native_index", + [ + ( + native_pd.Series( + ["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], name="index series name" + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ), # some index values are missing + ( + native_pd.Series(list(range(100))), + native_pd.Index(list(range(-50, 100, 4)), name="skip numbers"), + ), # some index values are missing + ( + native_pd.Series( + [10, 20, 30, 40], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + name="mixed series as index", + ), + native_pd.Index(["B", 0, None, 3.14]), + ), # rearranged index values + ( + native_pd.Series(["A", "B", "C", "D", "E"], name="series"), + native_pd.Index([3, 4], name="index"), + ), # subset of index values + ( + native_pd.Series( + list(range(20)), index=native_pd.Index(list(range(20)), name=20) + ), + native_pd.Index(list(range(20))), + ), # all index values match + ( + native_pd.Series(["A", "V", "D", "R"]), + native_pd.Index([10, 20, 30, 40], name="none"), + ), # no index values match + ( + native_pd.Series([], name="empty series", dtype="int64"), + native_pd.Index([], name="empty index", dtype="int64"), + ), # empty series and index + ], +) +@sql_count_checker(query_count=1, join_count=2) +def test_create_series_with_series_as_data_and_index_as_index( + native_series, native_index +): + """ + Creating a Series where the data is a Series and the index is an Index. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + snow_series = pd.Series(native_series) + snow_index = pd.Index(native_index) + assert_series_equal( + pd.Series(snow_series, index=snow_index), + native_pd.Series(native_series, index=native_index), + ) + + +@pytest.mark.parametrize( + "native_df, native_index", + [ + # Single column DataFrames. + ( + native_pd.DataFrame( + ["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], columns=["df column!"] + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ), # some index values are missing + ( + native_pd.DataFrame(list(range(100))), + native_pd.Index(list(range(-50, 100, 4)), name="skip numbers"), + ), # some index values are missing + ( + native_pd.DataFrame( + [10, 20, 30, 40], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + columns=["C"], + ), + native_pd.Index(["B", 0, None, 3.14]), + ), # rearranged index values + ( + native_pd.DataFrame(["A", "B", "C", "D", "E"], columns=["B"]), + native_pd.Index([3, 4], name="index"), + ), # subset of index values + ( + native_pd.DataFrame(list(range(20))), + native_pd.Index(list(range(20))), + ), # all index values match + ( + native_pd.DataFrame(["A", "V", "D", "R"]), + native_pd.Index([10, 20, 30, 40], name="none"), + ), # no index values match + # Multi-column DataFrames. + ( + native_pd.DataFrame( + {"col1": ["A", "B", "C", "D"], "col2": ["B", "H", "T", "W"]}, + index=[1.1, 2.2, 3, 4], + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ), # some index values are missing + ( + native_pd.DataFrame( + [[10, 20, 30, 40], [2, 4, 6, 7], [-1, -2, -3, -4], [90, 50, 30, 10]], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + columns=["C", "L", "M", "W"], + ), + native_pd.Index(["B", 0, None, 3.14]), + ), # rearranged index values + ( + native_pd.DataFrame( + [["A", "B", "C", "D", "E"], ["R", "S", "T", "U", "V"]], + columns=[1, 2, 3, 4, 5], + ), + native_pd.Index([3, 4], name="index"), + ), # subset of index values + ( + native_pd.DataFrame([list(range(20)), list(range(20))]), + native_pd.Index(list(range(20))), + ), # all index values match + ( + native_pd.DataFrame( + { + "A": ["A", "V", "D", "R"], + "V": ["V", "D", "R", "A"], + "D": ["D", "R", "A", "V"], + "R": ["R", "A", "V", "D"], + } + ), + native_pd.Index([10, 20, 30, 40], name="none"), + ), # no index values match + ], +) +@sql_count_checker(query_count=1, join_count=2) +def test_create_df_with_df_as_data_and_index_as_index(native_df, native_index): + """ + Creating a DataFrame where the data is a DataFrame and the index is an Index. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + snow_df = pd.DataFrame(native_df) + snow_index = pd.Index(native_index) + assert_frame_equal( + pd.DataFrame(snow_df, index=snow_index), + native_pd.DataFrame(native_df, index=native_index), + ) + + +@pytest.mark.parametrize( + "native_df, native_index", + [ + # Single column DataFrames. + ( + native_pd.DataFrame([]), + native_pd.Index([], name="empty index", dtype="int64"), + ), # empty series and index + # Multi-column DataFrames. + ( + native_pd.DataFrame([]), + native_pd.Index(["A", "V"], name="non-empty index"), + ), # empty df and index + ], +) +@sql_count_checker(query_count=1, join_count=2) +def test_create_df_with_empty_df_as_data_and_index_as_index(native_df, native_index): + """ + Creating a DataFrame where the data is an empty DataFrame and the index is an Index. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + snow_df = pd.DataFrame(native_df) + snow_index = pd.Index(native_index) + assert_frame_equal( + pd.DataFrame(snow_df, index=snow_index), + native_pd.DataFrame(native_df, index=native_index), + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "native_df, native_index, columns", + [ + # Single column DataFrames. + ( + native_pd.DataFrame(list(range(20))), + native_pd.Index(list(range(20))), + [1], + ), # all index values match + ( + native_pd.DataFrame(["A", "V", "D", "R"]), + native_pd.Index([10, 20, 30, 40], name="none"), + ["A"], + ), # no index values match, column missing + # Multi-column DataFrames. + ( + native_pd.DataFrame( + {"col1": ["A", "B", "C", "D"], "col2": ["B", "H", "T", "W"]}, + index=[1.1, 2.2, 3, 4], + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ["col1"], + ), # some index values are missing, subset of columns + ( + native_pd.DataFrame( + [[10, 20, 30, 40], [2, 4, 6, 7], [-1, -2, -3, -4], [90, 50, 30, 10]], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + columns=["C", "L", "M", "W"], + ), + native_pd.Index(["B", 0, None, 3.14]), + [3, 1], + ), # rearranged index and column values + ( + native_pd.DataFrame( + [["A", "B", "C", "D", "E"], ["R", "S", "T", "U", "V"]], + columns=[1, 2, 3, 4, 5], + ), + native_pd.Index([3, 4], name="index"), + ["A", "V", "C"], + ), # subset of index values + ( + native_pd.DataFrame([list(range(20)), list(range(20))]), + native_pd.Index(list(range(20))), + [1], + ), # all index values match + ( + native_pd.DataFrame( + { + "A": ["A", "V", "D", "R"], + "V": ["V", "D", "R", "A"], + "D": ["D", "R", "A", "V"], + "R": ["R", "A", "V", "D"], + } + ), + native_pd.Index([10, 20, 30, 40], name="none"), + ["A", "X", "D", "R"], + ), # no index values match + ], +) +@pytest.mark.parametrize("column_type", ["list", "index"]) +def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( + native_df, native_index, columns, column_type +): + """ + Creating a DataFrame where the data is a DataFrame, the index is an Index, and non-existent columns. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + # One extra query is required to create the columns if it is an Index (column_type is "index"). + native_columns = columns if column_type == "list" else native_pd.Index(columns) + snow_columns = columns if column_type == "list" else pd.Index(columns) + snow_df = pd.DataFrame(native_df) + snow_index = pd.Index(native_index) + with SqlCounter(query_count=1 if column_type == "list" else 2, join_count=2): + assert_frame_equal( + pd.DataFrame(snow_df, index=snow_index, columns=native_columns), + native_pd.DataFrame(native_df, index=native_index, columns=snow_columns), + check_dtype=False, + ) From 19792570c54be9224d1d03d82a908691f8ebe370 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 11:24:12 -0700 Subject: [PATCH 02/42] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab546703fd..bf2fc4c32d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ - Added support for `DatetimeIndex.month_name` and `DatetimeIndex.day_name`. - Added support for `Series.dt.weekday`, `Series.dt.time`, and `DatetimeIndex.time`. - Added support for subtracting two timestamps to get a Timedelta. +- Added support for creating `Series` and `DataFrame` objects with the lazy `Index` object as `data`, `index`, and `columns` parameters. #### Bug Fixes From 5dbb76dacf8afa5a92afbe66908d8ccdf80d2441 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 11:49:06 -0700 Subject: [PATCH 03/42] add more tests --- .../snowpark/modin/pandas/dataframe.py | 2 +- .../test_df_series_creation_with_index.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 9e34db4b04..5dd6210685 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -156,7 +156,7 @@ def __init__( self._siblings = [] - if isinstance(index, DataFrame): + if isinstance(index, DataFrame): # pandas raises the same error raise ValueError("Index data must be 1-dimensional") # Engine.subscribe(_update_engine) diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 9c3bae1b22..f3aaa15965 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -1,6 +1,8 @@ # # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +import re + import modin.pandas as pd import pandas as native_pd import pytest @@ -501,6 +503,16 @@ def test_create_df_with_empty_df_as_data_and_index_as_index(native_df, native_in native_pd.Index([10, 20, 30, 40], name="none"), ["A", "X", "D", "R"], ), # no index values match + ( + native_pd.DataFrame([]), + native_pd.Index([], name="empty index", dtype="int64"), + [], + ), # empty data, index, and columns + ( + native_pd.DataFrame([]), + native_pd.Index(["A", "V"], name="non-empty index"), + ["A", "V"], + ), # empty data, non-empty index and columns ], ) @pytest.mark.parametrize("column_type", ["list", "index"]) @@ -523,3 +535,14 @@ def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( native_pd.DataFrame(native_df, index=native_index, columns=snow_columns), check_dtype=False, ) + + +@sql_count_checker(query_count=0) +def test_create_df_with_df_index_negative(): + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.DataFrame([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + with pytest.raises( + ValueError, + match=re.escape("Shape of passed values is (3, 1), indices imply (2, 1)"), + ): + pd.DataFrame([1, 2, 3], index=[[1, 2], [3, 4], [5, 6]]) From 7de467f704be57b727f63a0128c7f56fc2d1dc03 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 13:43:40 -0700 Subject: [PATCH 04/42] fix minor bug --- .../plugin/compiler/snowflake_query_compiler.py | 2 +- .../index/test_df_series_creation_with_index.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 454fe4eec9..fefe867bf9 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -17491,7 +17491,7 @@ def create_qc_with_data_and_index_joined_on_index( ) if index_qc is None: - new_internal_frame = self._modin_frame + new_internal_frame = self_frame else: # Join the index and data internal frames. other_frame = index_qc._modin_frame diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index f3aaa15965..2615be8dca 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -537,6 +537,20 @@ def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( ) +@sql_count_checker(query_count=1) +def test_create_df_with_new_columns(): + """ + Creating a DataFrame with columns that don't exist in `data`. + """ + native_df = native_pd.DataFrame(list(range(100))) + snow_df = pd.DataFrame(native_df) + assert_frame_equal( + pd.DataFrame(snow_df, columns=["new column"]), + native_pd.DataFrame(native_df, columns=["new column"]), + check_dtype=False, + ) + + @sql_count_checker(query_count=0) def test_create_df_with_df_index_negative(): with pytest.raises(ValueError, match="Index data must be 1-dimensional"): From 5dd06fddc14807fd3f38bfbb9cf42834c713c983 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 14:21:53 -0700 Subject: [PATCH 05/42] fix isocalendar docstring error --- src/snowflake/snowpark/modin/pandas/series.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 8cc2665ddc..213711a1ac 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -133,11 +133,16 @@ def __init__( # Convert lazy index to Series without pulling the data to client. if isinstance(data, Index): - # If the data is an Index object, we need to convert it to a DataFrame to make sure + # If the data is an Index object, we need to convert it to a Series to make sure # that the values are in the correct format -- as a data column, not an index column. # Additionally, if an index is provided, converting it to an Index object ensures that # its values are an index column. - query_compiler = data.to_frame(index=False, name=data.name)._query_compiler + query_compiler = ( + data.to_series(index=None, name=name) + .reset_index(drop=True) + ._query_compiler + ) + if index is not None: index = index if isinstance(index, Index) else Index(index) query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( From 8b944623a1195f7c65f63558bea242624ccdc4db Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 21 Aug 2024 17:32:29 -0700 Subject: [PATCH 06/42] truncate tests, update changelog wording, reduce 2 queries to one query one join --- CHANGELOG.md | 2 +- .../snowpark/modin/pandas/dataframe.py | 22 +- src/snowflake/snowpark/modin/pandas/series.py | 11 +- .../test_df_series_creation_with_index.py | 294 ++++++------------ 4 files changed, 124 insertions(+), 205 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf2fc4c32d..a7cc298a72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ - Added support for `DatetimeIndex.month_name` and `DatetimeIndex.day_name`. - Added support for `Series.dt.weekday`, `Series.dt.time`, and `DatetimeIndex.time`. - Added support for subtracting two timestamps to get a Timedelta. -- Added support for creating `Series` and `DataFrame` objects with the lazy `Index` object as `data`, `index`, and `columns` parameters. +- Added support for constructing `Series` and `DataFrame` objects with the lazy `Index` object as `data`, `index`, and `columns` arguments. #### Bug Fixes diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 5dd6210685..609f5bf55e 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -165,7 +165,12 @@ def __init__( # that the values are in the correct format -- as a data column, not an index column. # Additionally, if an index is provided, converting it to an Index object ensures that # its values are an index column. - query_compiler = data.to_frame(index=False, name=data.name)._query_compiler + # We set the column name if it is not in the provided Index `data`. + if data.name is None: + new_name = 0 if columns is None else columns[0] + else: + new_name = data.name + query_compiler = data.to_frame(index=False, name=new_name)._query_compiler if index is not None: index = index if isinstance(index, Index) else Index(index) query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( @@ -291,14 +296,25 @@ def __init__( k: v._to_pandas() if isinstance(v, Series) else v for k, v in data.items() } + + new_index = index + if isinstance(index, Index): + # Skip turning this into a native pandas object here since this issues an extra query. + # Instead, first get the query compiler from native pandas and then add the index column. + new_index = None pandas_df = pandas.DataFrame( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), + index=try_convert_index_to_native(new_index), columns=try_convert_index_to_native(columns), dtype=dtype, copy=copy, ) - self._query_compiler = from_pandas(pandas_df)._query_compiler + query_compiler = from_pandas(pandas_df)._query_compiler + if isinstance(index, Index): + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) + self._query_compiler = query_compiler else: self._query_compiler = query_compiler diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 213711a1ac..f59c1a7939 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -175,11 +175,16 @@ def __init__( ): name = data.name + new_index = index + if isinstance(index, Index): + # Skip turning this into a native pandas object here since this issues an extra query. + # Instead, first get the query compiler from native pandas and then add the index column. + new_index = None query_compiler = from_pandas( pandas.DataFrame( pandas.Series( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), + index=try_convert_index_to_native(new_index), dtype=dtype, name=name, copy=copy, @@ -187,6 +192,10 @@ def __init__( ) ) )._query_compiler + if isinstance(index, Index): + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 2615be8dca..5b2571ccca 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -12,259 +12,151 @@ from tests.integ.modin.utils import assert_frame_equal, assert_series_equal -@pytest.mark.parametrize( - "data", [[1, 2, 3, 4], list(range(250)), ["A", None, 2.3, 1], []] -) -@sql_count_checker(query_count=1) -def test_create_df_with_index_as_data(data): - """ - Creating a DataFrame where the data is an Index. +def obj_type_helper(obj_type: str) -> tuple: """ - # Create Snowpark pandas DataFrame and native pandas DataFrame from an Index object. - native_idx = native_pd.Index(data, name="some name") - snow_idx = pd.Index(native_idx) - assert_frame_equal(pd.DataFrame(snow_idx), native_pd.DataFrame(native_idx)) + Helper function to return the appropriate objects and kwargs based on the object type. + Parameters + ---------- + obj_type : str + The type of object to be created. Can be either "df" or "series". -@pytest.mark.parametrize( - "data", [[1, 2, 3, 4], list(range(250)), ["A", None, 2.3, 1], []] -) -@sql_count_checker(query_count=1) -def test_create_series_with_index_as_data(data): - """ - Creating a Series where the data is an Index. + Returns + ------- + tuple + A tuple containing the assert_equal_func, Snowpark pandas object dtype, native pandas object dtype, and kwargs. """ - # Create Snowpark pandas Series and native pandas Series from an Index object. - native_idx = native_pd.Index(data, name="some name") - snow_idx = pd.Index(native_idx) - assert_series_equal(pd.Series(snow_idx), native_pd.Series(native_idx)) + if obj_type == "df": + assert_equal_func = assert_frame_equal + snow_obj, native_obj = pd.DataFrame, native_pd.DataFrame + kwargs = {"check_column_type": False} + else: + assert_equal_func = assert_series_equal + snow_obj, native_obj = pd.Series, native_pd.Series + kwargs = {} + return assert_equal_func, snow_obj, native_obj, kwargs @pytest.mark.parametrize( - "data, index", + "native_idx", [ - ([1, 2, 3, 4], ["A", "B", "C", "D"]), - (list(range(100)), list(range(200, 300))), - (["A", None, 2.3, 1], [None, "B", 0, 3.14]), - ([], []), + native_pd.Index([1, 2, 3, 4], name="some name"), + native_pd.Index(list(range(250))), + native_pd.Index(["A", None, 2.3, 1], name="AAAAA"), + native_pd.Index([]), ], ) -@sql_count_checker(query_count=2) -def test_create_df_with_index_as_index(data, index): +@pytest.mark.parametrize("obj_type", ["series", "df"]) +@sql_count_checker(query_count=1) +def test_create_with_index_as_data(native_idx, obj_type): """ - Creating a DataFrame where the index is an Index. + Creating a Series where the data is an Index. """ - # Two queries are issued: one when creating the DataFrame (the index is converted - # to a native pandas object), one when materializing the DataFrame for comparison. - # Create Snowpark pandas DataFrame and native pandas DataFrame with an Index object as the index. - native_idx = native_pd.Index(index, name="some name") snow_idx = pd.Index(native_idx) - assert_frame_equal( - pd.DataFrame(data, index=snow_idx), - native_pd.DataFrame(data, index=native_idx), - check_dtype=False, - check_index_type=False, - check_column_type=False, - ) + assert_equal_func, snow_obj, native_obj, _ = obj_type_helper(obj_type) + assert_equal_func(snow_obj(snow_idx), native_obj(native_idx)) @pytest.mark.parametrize( - "data, index", + "data, native_idx", [ - ([1, 2, 3, 4], ["A", "B", "C", "D"]), - (list(range(100)), list(range(100, 200))), - (["A", None, 2.3, 1], [None, "B", 0, 3.14]), - ([], []), + ([1, 2, 3, 4], native_pd.Index(["A", "B", "C", "D"], name="some name")), + (list(range(100)), native_pd.Index(list(range(200, 300)))), + (["A", None, 2.3, 1], native_pd.Index([None, "B", 0, 3.14])), + ([], native_pd.Index([], name="empty index")), ], ) -@sql_count_checker(query_count=2) -def test_create_series_with_index_as_index(data, index): +@pytest.mark.parametrize("obj_type", ["series", "df"]) +@sql_count_checker(query_count=1, join_count=1) +def test_create_with_index_as_index(data, native_idx, obj_type): """ - Creating a Series where the index is an Index. + Creating a Series/DataFrame where the index is an Index. """ - # Two queries are issued: one when creating the Series (the index is converted - # to a native pandas object), one when materializing the Series for comparison. - # Create Snowpark pandas Series and native pandas Series with an Index object as the index. - native_idx = native_pd.Index(index, name="some name") + # A join is performed to set the index columns of the generated Series/DataFrame. snow_idx = pd.Index(native_idx) - assert_series_equal( - pd.Series(data, index=snow_idx), - native_pd.Series(data, index=native_idx), + assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(data, index=snow_idx), + native_obj(data, index=native_idx), check_dtype=False, check_index_type=False, + **kwargs, ) @pytest.mark.parametrize( - "data, index", - [ - ([1, 2, 3, 4], ["A", "B", "C", "D"]), - (list(range(250)), list(range(250, 500))), - (["A", None, 2.3, 1], [None, "B", 0, 3.14]), - ([], []), - ], -) -@sql_count_checker(query_count=1, join_count=1) -def test_create_df_with_index_as_data_and_index(data, index): - """ - Creating a DataFrame where the data is an Index and the index is also an Index. - """ - # Create Snowpark pandas DataFrame and native pandas DataFrame from Index objects. - native_idx_data = native_pd.Index(data, name="data name") - snow_idx_data = pd.Index(native_idx_data) - native_idx_index = native_pd.Index(index, name="index name") - snow_idx_index = pd.Index(native_idx_index) - assert_frame_equal( - pd.DataFrame(snow_idx_data, index=snow_idx_index), - native_pd.DataFrame(native_idx_data, index=native_idx_index), - ) - - -@pytest.mark.parametrize( - "data, index", - [ - ([1, 2, 3, 4], ["A", "B", "C", "D"]), - (list(range(100)), list(range(100, 200))), - (["A", None, 2.3, 1], [None, "B", 0, 3.14]), - ([], []), - ], -) -@sql_count_checker(query_count=1, join_count=1) -def test_create_series_with_index_as_data_and_index(data, index): - """ - Creating a Series where the data is an Index and the index is also an Index. - """ - # Create Snowpark pandas Series and native pandas Series from Index objects. - # TODO: Index is not being set at all. - native_idx_data = native_pd.Index(data, name="data name") - snow_idx_data = pd.Index(native_idx_data) - native_idx_index = native_pd.Index(index, name="index name") - snow_idx_index = pd.Index(native_idx_index) - assert_series_equal( - pd.Series(snow_idx_data, index=snow_idx_index), - native_pd.Series(native_idx_data, index=native_idx_index), - ) - - -@pytest.mark.parametrize( - "data, native_series", + "native_idx_data, native_idx_index", [ ( - [1, 2, 3, 4], - native_pd.Series( - ["A", "B", "C", "D"], - index=[1.1, 2.2, 3.3, 4.4], - name="index series name", - ), + native_pd.Index([1, 2, 3, 4], name="data name"), + native_pd.Index(["A", "B", "C", "D"]), ), - (list(range(100)), native_pd.Series(list(range(100, 200)))), ( - ["A", None, 2.3, 1], - native_pd.Series([None, "B", 0, 3.14], name="mixed series as index"), + native_pd.Index(list(range(250))), + native_pd.Index(list(range(250, 500)), name="index name"), ), - ([], native_pd.Series([], name="empty series")), + ( + native_pd.Index(["A", None, 2.3, 1], name="data name"), + native_pd.Index([None, "B", 0, 3.14], name="index name"), + ), + (native_pd.Index([]), native_pd.Index([])), ], ) +@pytest.mark.parametrize("obj_type", ["series", "df"]) @sql_count_checker(query_count=1, join_count=1) -def test_create_df_with_index_as_data_and_series_as_index(data, native_series): +def test_create_with_index_as_data_and_index( + native_idx_data, native_idx_index, obj_type +): """ - Creating a DataFrame where the data is an Index and the index is a Series. + Creating a Series/DataFrame where the data is an Index and the index is also an Index. """ - snow_series = pd.Series(native_series) - native_index = native_pd.Index(data, name="index data name") - snow_index = pd.Index(native_index) - assert_frame_equal( - pd.DataFrame(snow_index, index=snow_series), - native_pd.DataFrame(native_index, index=native_series), + # A join is required to combine the query compilers of the data and index objects. + snow_idx_data = pd.Index(native_idx_data) + snow_idx_index = pd.Index(native_idx_index) + assert_equal_func, snow_obj, native_obj, _ = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(data=snow_idx_data, index=snow_idx_index), + native_obj(data=native_idx_data, index=native_idx_index), ) @pytest.mark.parametrize( - "data, native_series", + "native_index, native_series", [ ( - [1, 2, 3, 4], + native_pd.Index([1, 2, 3, 4], name="index name"), native_pd.Series( ["A", "B", "C", "D"], index=[1.1, 2.2, 3.3, 4.4], name="index series name", ), ), - (list(range(100)), native_pd.Series(list(range(100, 200)))), ( - ["A", None, 2.3, 1], - native_pd.Series([None, "B", 0, 3.14], name="mixed series as index"), + native_pd.Index(list(range(100)), name="AAAAA"), + native_pd.Series(list(range(100, 200))), + ), + ( + native_pd.Index(["A", None, 2.3, 1]), + native_pd.Series([None, "B", 0, 3.14]), ), - ([], native_pd.Series([], name="empty series")), + (native_pd.Index([]), native_pd.Series([], name="empty series")), ], ) +@pytest.mark.parametrize("obj_type", ["series", "df"]) @sql_count_checker(query_count=1, join_count=1) -def test_create_series_with_index_as_data_and_series_as_index(data, native_series): +def test_create_with_index_as_data_and_series_as_index( + native_index, native_series, obj_type +): """ - Creating a Series where the data is an Index and the index is a Series. + Creating a Series/DataFrame where the data is an Index and the index is a Series. """ - snow_series = pd.Series(native_series) - native_index = native_pd.Index(data, name="index data name") + # A join is required to combine the query compilers of the data and index objects. snow_index = pd.Index(native_index) - assert_series_equal( - pd.Series(snow_index, index=snow_series), - native_pd.Series(native_index, index=native_series), - ) - - -@pytest.mark.parametrize( - "native_series, native_index", - [ - ( - native_pd.Series( - ["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], name="index series name" - ), - native_pd.Index([1, 2, 3, 4], name="some name"), - ), # some index values are missing - ( - native_pd.Series(list(range(100))), - native_pd.Index(list(range(-50, 100, 4)), name="skip numbers"), - ), # some index values are missing - ( - native_pd.Series( - [10, 20, 30, 40], - index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), - name="mixed series as index", - ), - native_pd.Index(["B", 0, None, 3.14]), - ), # rearranged index values - ( - native_pd.Series(["A", "B", "C", "D", "E"], name="series"), - native_pd.Index([3, 4], name="index"), - ), # subset of index values - ( - native_pd.Series( - list(range(20)), index=native_pd.Index(list(range(20)), name=20) - ), - native_pd.Index(list(range(20))), - ), # all index values match - ( - native_pd.Series(["A", "V", "D", "R"]), - native_pd.Index([10, 20, 30, 40], name="none"), - ), # no index values match - ( - native_pd.Series([], name="empty series", dtype="int64"), - native_pd.Index([], name="empty index", dtype="int64"), - ), # empty series and index - ], -) -@sql_count_checker(query_count=1, join_count=2) -def test_create_df_with_series_as_data_and_index_as_index(native_series, native_index): - """ - Creating a DataFrame where the data is a Series and the index is an Index. - """ - # Two joins are performed: one from joining the data and index parameters to have a query compiler whose - # index columns match the provided index, and one from performing .loc[] to filter the generated qc. snow_series = pd.Series(native_series) - snow_index = pd.Index(native_index) - assert_frame_equal( - pd.DataFrame(snow_series, index=snow_index), - native_pd.DataFrame(native_series, index=native_index), + assert_equal_func, snow_obj, native_obj, _ = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(data=snow_index, index=snow_series), + native_obj(data=native_index, index=native_series), ) @@ -309,20 +201,22 @@ def test_create_df_with_series_as_data_and_index_as_index(native_series, native_ ), # empty series and index ], ) +@pytest.mark.parametrize("obj_type", ["series", "df"]) @sql_count_checker(query_count=1, join_count=2) -def test_create_series_with_series_as_data_and_index_as_index( - native_series, native_index +def test_create_with_series_as_data_and_index_as_index( + native_series, native_index, obj_type ): """ - Creating a Series where the data is a Series and the index is an Index. + Creating a Series/DataFrame where the data is a Series and the index is an Index. """ # Two joins are performed: one from joining the data and index parameters to have a query compiler whose # index columns match the provided index, and one from performing .loc[] to filter the generated qc. snow_series = pd.Series(native_series) snow_index = pd.Index(native_index) - assert_series_equal( - pd.Series(snow_series, index=snow_index), - native_pd.Series(native_series, index=native_index), + assert_equal_func, snow_obj, native_obj, _ = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(data=snow_series, index=snow_index), + native_obj(data=native_series, index=native_index), ) From a9376c1200c403fc8aafe0ed7dce4e884e073bb2 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Thu, 22 Aug 2024 09:32:39 -0700 Subject: [PATCH 07/42] Get rid of the join performed when only index is an Index object and data is not a Snowpark pandas object --- src/snowflake/snowpark/modin/pandas/dataframe.py | 15 ++------------- src/snowflake/snowpark/modin/pandas/series.py | 11 +---------- .../index/test_df_series_creation_with_index.py | 6 ++++-- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 609f5bf55e..a6ecaa58ff 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -296,25 +296,14 @@ def __init__( k: v._to_pandas() if isinstance(v, Series) else v for k, v in data.items() } - - new_index = index - if isinstance(index, Index): - # Skip turning this into a native pandas object here since this issues an extra query. - # Instead, first get the query compiler from native pandas and then add the index column. - new_index = None pandas_df = pandas.DataFrame( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(new_index), + index=try_convert_index_to_native(index), columns=try_convert_index_to_native(columns), dtype=dtype, copy=copy, ) - query_compiler = from_pandas(pandas_df)._query_compiler - if isinstance(index, Index): - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler - ) - self._query_compiler = query_compiler + self._query_compiler = from_pandas(pandas_df)._query_compiler else: self._query_compiler = query_compiler diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index f59c1a7939..213711a1ac 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -175,16 +175,11 @@ def __init__( ): name = data.name - new_index = index - if isinstance(index, Index): - # Skip turning this into a native pandas object here since this issues an extra query. - # Instead, first get the query compiler from native pandas and then add the index column. - new_index = None query_compiler = from_pandas( pandas.DataFrame( pandas.Series( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(new_index), + index=try_convert_index_to_native(index), dtype=dtype, name=name, copy=copy, @@ -192,10 +187,6 @@ def __init__( ) ) )._query_compiler - if isinstance(index, Index): - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler - ) self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 5b2571ccca..3396336999 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -67,12 +67,14 @@ def test_create_with_index_as_data(native_idx, obj_type): ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=2) def test_create_with_index_as_index(data, native_idx, obj_type): """ Creating a Series/DataFrame where the index is an Index. """ - # A join is performed to set the index columns of the generated Series/DataFrame. + # Two queries are issued: one when creating the Series/DataFrame (the index is + # converted to a native pandas object), one when materializing the Series/DataFrame + # for comparison. snow_idx = pd.Index(native_idx) assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) assert_equal_func( From 420a5ac9897fc92d38a34211d294bef36bd66093 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Thu, 22 Aug 2024 15:28:35 -0700 Subject: [PATCH 08/42] Add back the index join query to DataFrame/Series constructor, update the constructor tests, rewrite concat tests --- .../snowpark/modin/pandas/dataframe.py | 15 +- src/snowflake/snowpark/modin/pandas/series.py | 11 +- .../test_df_series_creation_with_index.py | 6 +- tests/integ/modin/test_concat.py | 437 ++++++++++-------- 4 files changed, 282 insertions(+), 187 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index a6ecaa58ff..609f5bf55e 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -296,14 +296,25 @@ def __init__( k: v._to_pandas() if isinstance(v, Series) else v for k, v in data.items() } + + new_index = index + if isinstance(index, Index): + # Skip turning this into a native pandas object here since this issues an extra query. + # Instead, first get the query compiler from native pandas and then add the index column. + new_index = None pandas_df = pandas.DataFrame( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), + index=try_convert_index_to_native(new_index), columns=try_convert_index_to_native(columns), dtype=dtype, copy=copy, ) - self._query_compiler = from_pandas(pandas_df)._query_compiler + query_compiler = from_pandas(pandas_df)._query_compiler + if isinstance(index, Index): + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) + self._query_compiler = query_compiler else: self._query_compiler = query_compiler diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 213711a1ac..f59c1a7939 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -175,11 +175,16 @@ def __init__( ): name = data.name + new_index = index + if isinstance(index, Index): + # Skip turning this into a native pandas object here since this issues an extra query. + # Instead, first get the query compiler from native pandas and then add the index column. + new_index = None query_compiler = from_pandas( pandas.DataFrame( pandas.Series( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), + index=try_convert_index_to_native(new_index), dtype=dtype, name=name, copy=copy, @@ -187,6 +192,10 @@ def __init__( ) ) )._query_compiler + if isinstance(index, Index): + query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( + index._query_compiler + ) self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 3396336999..5b2571ccca 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -67,14 +67,12 @@ def test_create_with_index_as_data(native_idx, obj_type): ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_create_with_index_as_index(data, native_idx, obj_type): """ Creating a Series/DataFrame where the index is an Index. """ - # Two queries are issued: one when creating the Series/DataFrame (the index is - # converted to a native pandas object), one when materializing the Series/DataFrame - # for comparison. + # A join is performed to set the index columns of the generated Series/DataFrame. snow_idx = pd.Index(native_idx) assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) assert_equal_func( diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index f3e149a37f..981a2932a2 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -25,56 +25,56 @@ @pytest.fixture(scope="function") def df1(): - return pd.DataFrame( + return native_pd.DataFrame( { "C": [1, 2, 3], "A": ["a", "b", "c"], "D": [3, 2, 1], }, - index=pd.Index([3, 1, 2], name="left_i"), + index=native_pd.Index([3, 1, 2], name="left_i"), ) @pytest.fixture(scope="function") def df2(): - return pd.DataFrame( + return native_pd.DataFrame( { "P": [3, 2, 1, 3], "A": ["a", "b", "c", "a"], "C": [1, 2, 3, 2], }, - index=pd.Index([2, 0, 3, 4], name="right_i"), + index=native_pd.Index([2, 0, 3, 4], name="right_i"), ) @pytest.fixture(scope="function") def df_single_col(): - return pd.DataFrame([1], columns=["A"]) + return native_pd.DataFrame([1], columns=["A"]) @pytest.fixture(scope="function") def zero_rows_df(): - return pd.DataFrame(columns=["A", "B"]) + return native_pd.DataFrame(columns=["A", "B"]) @pytest.fixture(scope="function") def zero_columns_df(): - return pd.DataFrame(index=pd.Index([1, 2])) + return native_pd.DataFrame(index=pd.Index([1, 2])) @pytest.fixture(scope="function") def empty_df(): - return pd.DataFrame() + return native_pd.DataFrame() @pytest.fixture(scope="function") def series1(): - return pd.Series([1, 2]) + return native_pd.Series([1, 2]) @pytest.fixture(scope="function") def series2(): - return pd.Series([2, 1]) + return native_pd.Series([2, 1]) @pytest.fixture(params=["inner", "outer"]) @@ -109,11 +109,10 @@ def axis(request): return request.param -def _concat_operation(objs, native_objs=None, **kwargs): - if native_objs is None: - native_objs = [obj.to_pandas() for obj in objs] +# TODO: redefine df1, df2, and _concat_operation +def _concat_operation(snow_objs, native_objs, **kwargs): return ( - lambda x: pd.concat(objs, **kwargs) + lambda x: pd.concat(snow_objs, **kwargs) if x == "pd" else native_pd.concat(native_objs, **kwargs) ) @@ -121,12 +120,19 @@ def _concat_operation(objs, native_objs=None, **kwargs): def test_concat_basic(df1, df2, join, sort, axis, ignore_index): expected_join_count = 1 if axis == 1 else 0 - with SqlCounter(query_count=3, join_count=expected_join_count): + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", _concat_operation( - [df1, df2], axis=axis, join=join, sort=sort, ignore_index=ignore_index + snow_objs, + native_objs, + axis=axis, + join=join, + sort=sort, + ignore_index=ignore_index, ), ) @@ -136,7 +142,7 @@ def test_concat_no_items_negative(): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([]), + _concat_operation(snow_objs=[], native_objs=[]), expect_exception=True, expect_exception_type=ValueError, expect_exception_match="No objects to concatenate", @@ -147,6 +153,7 @@ def test_concat_exclude_none(df1, df2, axis): expected_join_count = 2 if axis == 1 else 0 with SqlCounter(query_count=2, join_count=expected_join_count): # Verify that none objects are simply ignored. + df1, df2 = pd.DataFrame(df1), pd.DataFrame(df2) pieces = [df1, None, df2, None] result = pd.concat(pieces, axis=axis) expected = pd.concat([df1, df2], axis=axis) @@ -158,7 +165,7 @@ def test_concat_all_none_negative(): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([None, None], [None, None]), + _concat_operation(snow_objs=[None, None], native_objs=[None, None]), expect_exception=True, expect_exception_type=ValueError, expect_exception_match="All objects passed were None", @@ -170,43 +177,53 @@ def test_concat_mixed_objs(df1, df2, series1, series2, axis, join): expected_join_count_with_duplicates = 2 if axis == 1 else 0 # Series and Dataframes - with SqlCounter(query_count=3, join_count=expected_join_count): + native_objs = [df1, series1] + snow_objs = [pd.DataFrame(df1), pd.Series(series1)] + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, series1], axis=axis, join=join), + _concat_operation(snow_objs, native_objs, axis=axis, join=join), ) # All dataframes - with SqlCounter(query_count=3, join_count=expected_join_count): + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], axis=axis, join=join), + _concat_operation(snow_objs, native_objs, axis=axis, join=join), ) # All dataframes with duplicates - with SqlCounter(query_count=4, join_count=expected_join_count_with_duplicates): + native_objs = [df1, df2, df1] + snow_objs = [pd.DataFrame(df) for df in native_objs] + with SqlCounter(query_count=1, join_count=expected_join_count_with_duplicates): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2, df1], axis=axis, join=join), + _concat_operation(snow_objs, native_objs, axis=axis, join=join), ) # All series - with SqlCounter(query_count=3, join_count=expected_join_count): + native_objs = [series1, series2] + snow_objs = [pd.Series(series) for series in native_objs] + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([series1, series2], axis=axis, join=join), + _concat_operation(snow_objs, native_objs, axis=axis, join=join), ) # All series with duplicates - with SqlCounter(query_count=4, join_count=expected_join_count_with_duplicates): + native_objs = [series1, series2, series1] + snow_objs = [pd.Series(series) for series in native_objs] + with SqlCounter(query_count=1, join_count=expected_join_count_with_duplicates): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([series1, series2, series1], axis=axis, join=join), + _concat_operation(snow_objs, native_objs, axis=axis, join=join), ) @@ -220,15 +237,15 @@ def test_concat_mixed_objs(df1, df2, series1, series2, axis, join): ("foo", "foo", ["foo", "foo"]), ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_series_names_axis1(series1, series2, name1, name2, expected_columns): - series1 = series1.rename(name1) - series2 = series2.rename(name2) - native_s1 = series1.to_pandas() - native_s2 = series2.to_pandas() + native_series1 = series1.rename(name1) + native_series2 = series2.rename(name2) + snow_series1 = pd.Series(series1).rename(name1) + snow_series2 = pd.Series(series2).rename(name2) # snow result - snow_res = pd.concat([series1, series2], axis=1) - native_res = native_pd.concat([native_s1, native_s2], axis=1) + snow_res = pd.concat([snow_series1, snow_series2], axis=1) + native_res = native_pd.concat([native_series1, native_series2], axis=1) assert_frame_equal(snow_res, native_res) # Explicit check for column names assert snow_res.columns.tolist() == expected_columns @@ -244,26 +261,28 @@ def test_concat_series_names_axis1(series1, series2, name1, name2, expected_colu ("foo", "foo", "foo"), ], ) -@sql_count_checker(query_count=3, union_count=1) +@sql_count_checker(query_count=1, union_count=1) def test_concat_series_names_axis0(series1, series2, name1, name2, expected_name): - series1 = series1.rename(name1) - series2 = series2.rename(name2) - native_s1 = series1.to_pandas() - native_s2 = series2.to_pandas() + native_series1 = series1.rename(name1) + native_series2 = series2.rename(name2) + snow_series1 = pd.Series(series1).rename(name1) + snow_series2 = pd.Series(series2).rename(name2) # snow result - snow_res = pd.concat([series1, series2]) - native_res = native_pd.concat([native_s1, native_s2]) + snow_res = pd.concat([snow_series1, snow_series2]) + native_res = native_pd.concat([native_series1, native_series2]) assert_series_equal(snow_res, native_res) # Explicit check for column names assert snow_res.name == expected_name -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_concat_invalid_join_negative(df1, df2): + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], join="left"), + _concat_operation(snow_objs, native_objs, join="left"), expect_exception=True, expect_exception_type=ValueError, expect_exception_match=r"Only can inner \(intersect\) or outer \(union\) join the other axis", @@ -272,7 +291,8 @@ def test_concat_invalid_join_negative(df1, df2): def test_concat_iterables(df1, df2, axis): # verify that concat works with tuples, list, deque, generators and custom iterables - expected = native_pd.concat([df1.to_pandas(), df2.to_pandas()], axis=axis) + expected = native_pd.concat([df1, df2], axis=axis) + df1, df2 = pd.DataFrame(df1), pd.DataFrame(df2) expected_join_count = 1 if axis == 1 else 0 @@ -330,7 +350,7 @@ def test_concat_non_iterables_negative(): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation("abc", "abc"), + _concat_operation(snow_objs="abc", native_objs="abc"), expect_exception=True, expect_exception_type=TypeError, expect_exception_match=msg, @@ -355,12 +375,14 @@ def test_concat_native_object_negative(obj): pd.concat({"a": obj}) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_concat_invalid_type_negative(df1): + native_objs = [df1, "abc"] + snow_objs = [pd.DataFrame(df1), "abc"] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, "abc"], [df1.to_pandas(), "abc"]), + _concat_operation(snow_objs, native_objs), expect_exception=True, expect_exception_type=TypeError, expect_exception_match="cannot concatenate object of type ''; only Series and DataFrame objs are valid", @@ -398,9 +420,8 @@ def _multiindex(labels: list[tuple[Hashable, ...]]) -> MultiIndex: def test_concat_multiindex_columns_axis1( columns1, columns2, df_single_col, expected_cols ): - df1 = df_single_col.copy() + df1, df2 = pd.DataFrame(df_single_col), pd.DataFrame(df_single_col) df1.columns = columns1 - df2 = df_single_col.copy() df2.columns = columns2 result_columns = pd.concat([df1, df2], axis=1).columns @@ -438,9 +459,8 @@ def test_concat_multiindex_columns_axis1( def test_concat_multiindex_row_labels_axis0( index1, index2, df_single_col, expected_index, expected_join_count ): - df1 = df_single_col.copy() + df1, df2 = pd.DataFrame(df_single_col), pd.DataFrame(df_single_col) df1.index = index1 - df2 = df_single_col.copy() df2.index = index2 with SqlCounter(query_count=1, join_count=expected_join_count): @@ -481,9 +501,8 @@ def test_concat_multiindex_row_labels_axis0( def test_concat_multiindex_row_labels_axis1( index1, index2, df_single_col, expected_index, expected_join_count ): - df1 = df_single_col.copy() + df1, df2 = pd.DataFrame(df_single_col), pd.DataFrame(df_single_col) df1.index = index1 - df2 = df_single_col.copy() df2.index = index2 with SqlCounter(query_count=1, join_count=expected_join_count): @@ -518,9 +537,8 @@ def test_concat_multiindex_row_labels_axis1( ], ) def test_concat_multiindex_row_labels_axis1_negative(index1, index2, df_single_col): - df1 = df_single_col.copy() + df1, df2 = pd.DataFrame(df_single_col), pd.DataFrame(df_single_col) df1.index = index1 - df2 = df_single_col.copy() df2.index = index2 # This behavior is different with Native pandas, where native pandas cast the index @@ -553,9 +571,8 @@ def test_concat_multiindex_row_labels_axis1_negative(index1, index2, df_single_c def test_concat_multiindex_columns_axis0( columns1, columns2, df_single_col, expected_cols ): - df1 = df_single_col.copy() + df1, df2 = pd.DataFrame(df_single_col), pd.DataFrame(df_single_col) df1.columns = columns1 - df2 = df_single_col.copy() df2.columns = columns2 result_columns = pd.concat([df1, df2], axis=0).columns @@ -563,10 +580,15 @@ def test_concat_multiindex_columns_axis0( def test_concat_index_with_nulls(df1, df2): + native_objs = [df1, df2] + df1, df2 = pd.DataFrame(df1), pd.DataFrame(df2) df1.set_index([[None, "a", None]]) df2.set_index([[4, 5, None, 1]]) - with SqlCounter(query_count=3): - eval_snowpark_pandas_result("pd", "native_pd", _concat_operation([df1, df2])) + snow_objs = [df1, df2] + with SqlCounter(query_count=1): + eval_snowpark_pandas_result( + "pd", "native_pd", _concat_operation(snow_objs, native_objs) + ) @pytest.mark.parametrize( @@ -581,12 +603,14 @@ def test_concat_index_with_nulls(df1, df2): ], ) def test_concat_with_keys(df1, df2, series1, keys, axis): + native_objs = [df1, df2, series1] + snow_objs = [pd.DataFrame(df1), pd.DataFrame(df2), pd.Series(series1)] expected_join_count = 2 if axis == 1 and len(keys) > 1 else 0 - with SqlCounter(query_count=4, join_count=expected_join_count): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2, series1], keys=keys, axis=axis), + _concat_operation(snow_objs, native_objs, keys=keys, axis=axis), ) @@ -601,42 +625,52 @@ def test_concat_with_keys(df1, df2, series1, keys, axis): ], ) def test_concat_same_frame_with_keys(df1, keys, axis): + native_objs = [df1, df1] + snow_objs = [pd.DataFrame(df) for df in native_objs] expected_join_count = 1 if axis == 1 and len(keys) > 1 else 0 - with SqlCounter(query_count=3, join_count=expected_join_count): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation([df1, df1], keys=keys, axis=axis) + "pd", + "native_pd", + _concat_operation(snow_objs, native_objs, keys=keys, axis=axis), ) @pytest.mark.parametrize("nlevels", [2, 3]) @pytest.mark.parametrize("keys", [["x", "y"], [("x", 1), ("y", 2)]]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_multiindex_columns_with_keys_axis1(df1, df2, nlevels, keys): - df1 = df1.copy() + df1, df2 = df1.copy(), df2.copy() df1.columns = MultiIndex.from_arrays([df1.columns.tolist()] * nlevels) - df2 = df2.copy() df2.columns = MultiIndex.from_arrays([df2.columns.tolist()] * nlevels) - + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation([df1, df2], axis=1, keys=keys) + "pd", "native_pd", _concat_operation(snow_objs, native_objs, axis=1, keys=keys) ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_concat_single_with_key(df1, axis): eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation([df1], keys=["foo"], axis=axis) + "pd", + "native_pd", + _concat_operation( + snow_objs=[pd.DataFrame(df1)], native_objs=[df1], keys=["foo"], axis=axis + ), ) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=1) def test_concat_keys_with_none(df1, df2, axis): + native_objs = [df1, None, df2] + snow_objs = [pd.DataFrame(df1), None, pd.DataFrame(df2)] eval_snowpark_pandas_result( "pd", "native_pd", _concat_operation( - [df1, None, df2], - [df1.to_pandas(), None, df2.to_pandas()], + snow_objs, + native_objs, keys=["x", "y"], axis=axis, ), @@ -656,27 +690,31 @@ def test_concat_keys_with_none(df1, df2, axis): "name1, name2", [("one", "two"), ("one", None), (None, "two"), (None, None)] ) def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis): - with SqlCounter(query_count=0 if name1 is None or axis == 1 else 2): - df1 = df1.rename_axis(name1, axis=axis) - with SqlCounter(query_count=0 if name2 is None or axis == 1 else 2): - df2 = df2.rename_axis(name2, axis=axis) + df1 = df1.rename_axis(name1, axis=axis) + df2 = df2.rename_axis(name2, axis=axis) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] # One extra query to convert index to native pandas when creating df - with SqlCounter(query_count=3): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], keys=["x", "y"], names=names, axis=axis), + _concat_operation( + snow_objs, native_objs, keys=["x", "y"], names=names, axis=axis + ), ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_concat_with_keys_and_extra_names_negative(df1, df2, axis): + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", _concat_operation( - [df1, df2], keys=["x", "y"], names=["a", "b", "c"], axis=axis + snow_objs, native_objs, keys=["x", "y"], names=["a", "b", "c"], axis=axis ), expect_exception=True, expect_exception_type=ValueError, @@ -684,12 +722,14 @@ def test_concat_with_keys_and_extra_names_negative(df1, df2, axis): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_concat_empty_keys_negative(df1, df2, axis): + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], keys=[], axis=axis), + _concat_operation(snow_objs, native_objs, keys=[], axis=axis), expect_exception=True, expect_exception_type=ValueError, ) @@ -697,64 +737,64 @@ def test_concat_empty_keys_negative(df1, df2, axis): @pytest.mark.parametrize("dict_keys", [["x", "y"], ["y", "x"]]) def test_concat_dict(df1, df2, dict_keys, axis): + native_objs = {dict_keys[0]: df1, dict_keys[1]: df2} + snow_objs = {dict_keys[0]: pd.DataFrame(df1), dict_keys[1]: pd.DataFrame(df2)} expected_join_count = 1 if axis == 1 else 0 - with SqlCounter(query_count=3, join_count=expected_join_count): - objs = {dict_keys[0]: df1, dict_keys[1]: df2} - native_objs = {dict_keys[0]: df1.to_pandas(), dict_keys[1]: df2.to_pandas()} + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation(objs, native_objs, axis=axis) + "pd", "native_pd", _concat_operation(snow_objs, native_objs, axis=axis) ) @pytest.mark.parametrize("dict_keys", [["x", "y"], ["y", "x"]]) @pytest.mark.parametrize("keys", [["x", "y"], ["y", "x"], ["x"], ["y"]]) def test_concat_dict_with_keys(df1, df2, dict_keys, keys, axis): + native_objs = {dict_keys[0]: df1, dict_keys[1]: df2} + snow_objs = {dict_keys[0]: pd.DataFrame(df1), dict_keys[1]: pd.DataFrame(df2)} expected_join_count = 1 if axis == 1 and len(keys) > 1 else 0 - with SqlCounter(query_count=3, join_count=expected_join_count): - objs = {dict_keys[0]: df1, dict_keys[1]: df2} - native_objs = {dict_keys[0]: df1.to_pandas(), dict_keys[1]: df2.to_pandas()} + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation(objs, native_objs, axis=axis, keys=keys), + _concat_operation(snow_objs, native_objs, axis=axis, keys=keys), ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_concat_dict_with_invalid_keys_negative(df1, df2, axis): - objs = {"x": df1, "y": df2} - native_objs = {"x": df1.to_pandas(), "y": df2.to_pandas()} + native_objs = {"x": df1, "y": df2} + snow_objs = {"x": pd.DataFrame(df1), "y": pd.DataFrame(df2)} eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation(objs, native_objs, keys=["x", "z"], axis=axis), + _concat_operation(snow_objs, native_objs, keys=["x", "z"], axis=axis), expect_exception=True, expect_exception_type=KeyError, expect_exception_match="z", ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_with_mixed_tuples_as_column_labels(sort): # columns have mixed tuples - df1 = pd.DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) - df2 = pd.DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) + df1 = native_pd.DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) + df2 = native_pd.DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation( - [df1, df2], [df1.to_pandas(), df2.to_pandas()], axis=1, sort=sort - ), + _concat_operation(snow_objs, native_objs, axis=1, sort=sort), ) def test_concat_empty_df(df1, empty_df, zero_rows_df, zero_columns_df, axis): - objs = [df1, empty_df, zero_columns_df, zero_rows_df] - snow_res = pd.concat(objs) - - native_objs = [df.to_pandas() for df in objs] + native_objs = [df1, empty_df, zero_columns_df, zero_rows_df] native_res = native_pd.concat(native_objs) + snow_objs = [pd.DataFrame(obj) for obj in native_objs] + snow_res = pd.concat(snow_objs) + with SqlCounter(query_count=1): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_res, native_res) @@ -776,54 +816,61 @@ def test_concat_empty_df(df1, empty_df, zero_rows_df, zero_columns_df, axis): ), # same levels, one overlapping name ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_multiindex(index1, index2): - df1 = pd.DataFrame({"A": [0, 1]}, index=index1) - df2 = pd.DataFrame({"B": [2, 3]}, index=index2) + df1 = native_pd.DataFrame({"A": [0, 1]}, index=index1) + df2 = native_pd.DataFrame({"B": [2, 3]}, index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(df) for df in native_objs] eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation([df1, df2], axis=1) + "pd", "native_pd", _concat_operation(snow_objs, native_objs, axis=1) ) @pytest.mark.parametrize( "type1, type2", - [(pd.DataFrame, pd.DataFrame), (pd.Series, pd.Series), (pd.DataFrame, pd.Series)], + [("df", "df"), ("series", "series"), ("df", "series")], ) @pytest.mark.parametrize("col1, col2", [("A", None), ("A", "a"), (1, "1")]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_verify_integrity_axis1(type1, type2, col1, col2): - obj1 = ( - pd.DataFrame([1, 2], columns=[col1]) - if type1 == pd.DataFrame - else pd.Series([1, 2], name=col1) - ) - obj2 = ( - pd.DataFrame([1, 2], columns=[col2]) - if type2 == pd.DataFrame - else pd.Series([1, 2], name=col2) - ) + if type1 == "df": + native_obj1 = native_pd.DataFrame([1, 2], columns=[col1]) + snow_obj1 = pd.DataFrame(native_obj1) + else: + native_obj1 = native_pd.Series([1, 2], name=col1) + snow_obj1 = pd.Series(native_obj1) + + if type2 == "df": + native_obj2 = native_pd.DataFrame([1, 2], columns=[col2]) + snow_obj2 = pd.DataFrame(native_obj2) + else: + native_obj2 = native_pd.Series([1, 2], name=col2) + snow_obj2 = pd.Series(native_obj2) + eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([obj1, obj2], axis=1, verify_integrity=True), + _concat_operation( + snow_objs=[snow_obj1, snow_obj2], + native_objs=[native_obj1, native_obj2], + axis=1, + verify_integrity=True, + ), ) -@pytest.mark.parametrize( - "type1, type2", [(pd.DataFrame, pd.DataFrame), (pd.DataFrame, pd.Series)] -) +@pytest.mark.parametrize("obj2_type", ["df", "series"]) @sql_count_checker(query_count=0) -def test_concat_verify_integrity_axis1_negative(type1, type2): - obj1 = ( - pd.DataFrame([1, 2], columns=["A"]) - if type1 == pd.DataFrame - else pd.Series([1, 2], name="A") - ) - obj2 = ( - pd.DataFrame([3, 4], columns=["A"]) - if type2 == pd.DataFrame - else pd.Series([3, 4], name="A") - ) +def test_concat_verify_integrity_axis1_negative(obj2_type): + # obj1 is always a DataFrame. + obj1 = pd.DataFrame([1, 2], columns=["A"]) + + if obj2_type == "df": + obj2 = pd.DataFrame([3, 4], columns=["A"]) + else: + obj2 = pd.Series([3, 4], name="A") + msg = "Columns have overlapping values" with pytest.raises(ValueError, match=msg): pd.concat([obj1, obj2], axis=1, verify_integrity=True) @@ -840,16 +887,20 @@ def test_concat_all_series_verify_integrity_axis1_negative(): pd.concat([obj1, obj2], axis=1, verify_integrity=True) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_concat_verify_integrity_axis1_with_keys(): # Even though original frames have duplicate columns, after adding keys to column # labels duplicates are resolved, hence no error. - obj1 = pd.DataFrame([1, 2], columns=["A"]) - obj2 = pd.DataFrame([3, 4], columns=["A"]) + obj1 = native_pd.DataFrame([1, 2], columns=["A"]) + obj2 = native_pd.DataFrame([3, 4], columns=["A"]) + native_objs = [obj1, obj2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([obj1, obj2], axis=1, verify_integrity=True, keys=["x", "y"]), + _concat_operation( + snow_objs, native_objs, axis=1, verify_integrity=True, keys=["x", "y"] + ), ) @@ -860,12 +911,16 @@ def test_concat_verify_integrity_axis1_with_keys(): (_multiindex([(1, 1), (1, 2)]), _multiindex([(2, 1), (2, 2)])), ], ) -@sql_count_checker(query_count=4, union_count=2) +@sql_count_checker(query_count=2, union_count=2) def test_concat_verify_integrity_axis0(index1, index2): - df1 = pd.DataFrame([1, 2], columns=["a"], index=index1) - df2 = pd.DataFrame([1, 2], columns=["a"], index=index2) + df1 = native_pd.DataFrame([1, 2], columns=["a"], index=index1) + df2 = native_pd.DataFrame([1, 2], columns=["a"], index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( - "pd", "native_pd", _concat_operation([df1, df2], verify_integrity=True) + "pd", + "native_pd", + _concat_operation(snow_objs, native_objs, verify_integrity=True), ) @@ -873,16 +928,20 @@ def test_concat_verify_integrity_axis0(index1, index2): "index1, index2", [([0, 1], [0, 1]), (_multiindex([(1, 1), (1, 2)]), _multiindex([(2, 1), (1, 2)]))], ) -@sql_count_checker(query_count=4, union_count=2) +@sql_count_checker(query_count=2, union_count=2) def test_concat_verify_integrity_axis0_with_keys(index1, index2): # Even though original frames have duplicate columns, after adding keys to column # labels duplicates are resolved, hence no error. - df1 = pd.DataFrame([1, 2], columns=["a"], index=index1) - df2 = pd.DataFrame([1, 2], columns=["a"], index=index2) + df1 = native_pd.DataFrame([1, 2], columns=["a"], index=index1) + df2 = native_pd.DataFrame([1, 2], columns=["a"], index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], verify_integrity=True, keys=["red", "green"]), + _concat_operation( + snow_objs, native_objs, verify_integrity=True, keys=["red", "green"] + ), ) @@ -890,16 +949,20 @@ def test_concat_verify_integrity_axis0_with_keys(index1, index2): "index1, index2", [([0, 1], [0, 1]), (_multiindex([(1, 1), (1, 2)]), _multiindex([(2, 1), (1, 2)]))], ) -@sql_count_checker(query_count=3, union_count=1) +@sql_count_checker(query_count=1, union_count=1) def test_concat_verify_integrity_axis0_with_ignore_index(index1, index2): # Even though original frames have duplicate columns, ignore_index=True will # replace original index values with values 0 to n-1, hence no error. - df1 = pd.DataFrame([1, 2], columns=["a"], index=index1) - df2 = pd.DataFrame([1, 2], columns=["a"], index=index2) + df1 = native_pd.DataFrame([1, 2], columns=["a"], index=index1) + df2 = native_pd.DataFrame([1, 2], columns=["a"], index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], verify_integrity=True, ignore_index=True), + _concat_operation( + snow_objs, native_objs, verify_integrity=True, ignore_index=True + ), ) @@ -911,14 +974,16 @@ def test_concat_verify_integrity_axis0_with_ignore_index(index1, index2): ([1, 1], [2, 3]), ], ) -@sql_count_checker(query_count=4, union_count=2) +@sql_count_checker(query_count=2, union_count=2) def test_concat_verify_integrity_axis0_negative(index1, index2): - df1 = pd.DataFrame([1, 2], columns=["a"], index=index1) - df2 = pd.DataFrame([1, 2], columns=["a"], index=index2) + df1 = native_pd.DataFrame([1, 2], columns=["a"], index=index1) + df2 = native_pd.DataFrame([1, 2], columns=["a"], index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2], verify_integrity=True), + _concat_operation(snow_objs, native_objs, verify_integrity=True), expect_exception=True, expect_exception_type=ValueError, expect_exception_match="Indexes have overlapping values: ", @@ -935,6 +1000,7 @@ def test_concat_verify_integrity_axis0_large_overlap_negative(): @sql_count_checker(query_count=0) def test_concat_levels_negative(df1, df2): + df1, df2 = pd.DataFrame(df1), pd.DataFrame(df2) with pytest.raises( NotImplementedError, match="Snowpark pandas doesn't support 'levels' argument in concat API", @@ -943,19 +1009,25 @@ def test_concat_levels_negative(df1, df2): def test_concat_sorted_frames(): - df1 = pd.DataFrame({"A": [5, 2, 7]}) - df2 = pd.DataFrame({"B": [3, 5, 6]}) - df3 = pd.DataFrame({"A": [2, 1, 7], "B": [3, 5, 4]}) - objs = [df1, df2, df3] - with SqlCounter(query_count=4): - eval_snowpark_pandas_result("pd", "native_pd", _concat_operation(objs)) - objs = [ + df1 = native_pd.DataFrame({"A": [5, 2, 7]}) + df2 = native_pd.DataFrame({"B": [3, 5, 6]}) + df3 = native_pd.DataFrame({"A": [2, 1, 7], "B": [3, 5, 4]}) + native_objs = [df1, df2, df3] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] + with SqlCounter(query_count=1): + eval_snowpark_pandas_result( + "pd", "native_pd", _concat_operation(snow_objs, native_objs) + ) + native_objs = [ df1.sort_values(by="A"), df2.sort_values(by="B"), df3.sort_values(by=["B", "A"]), ] - with SqlCounter(query_count=4): - eval_snowpark_pandas_result("pd", "native_pd", _concat_operation(objs)) + snow_objs = [pd.DataFrame(obj) for obj in native_objs] + with SqlCounter(query_count=1): + eval_snowpark_pandas_result( + "pd", "native_pd", _concat_operation(snow_objs, native_objs) + ) @pytest.mark.parametrize( @@ -988,9 +1060,7 @@ def test_concat_sorted_frames(): ], ) @sql_count_checker(query_count=2, union_count=1) -def test_concat_duplicate_columns( - df1, df2, columns1, columns2, expected_rows, expected_cols -): +def test_concat_duplicate_columns(columns1, columns2, expected_rows, expected_cols): df1 = pd.DataFrame([[1, 2, 3]], columns=columns1) df2 = pd.DataFrame([[4, 5, 6]], columns=columns2) expected_df = pd.DataFrame(expected_rows, columns=expected_cols, index=[0, 0]) @@ -999,14 +1069,16 @@ def test_concat_duplicate_columns( @pytest.mark.parametrize("value1", [4, 1.5, True, "c", (1, 2), {"a": 1}]) @pytest.mark.parametrize("value2", [4, 1.5, True, "c", (1, 2), {"a": 1}]) -@sql_count_checker(query_count=3, union_count=1) +@sql_count_checker(query_count=1, union_count=1) def test_concat_type_mismatch(value1, value2): - df1 = pd.DataFrame({"A": [value1]}) - df2 = pd.DataFrame({"A": [value2]}) + df1 = native_pd.DataFrame({"A": [value1]}) + df2 = native_pd.DataFrame({"A": [value2]}) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2]), + _concat_operation(snow_objs, native_objs), ) @@ -1023,27 +1095,32 @@ def test_concat_type_mismatch(value1, value2): ), ], ) -@sql_count_checker(query_count=5, union_count=1) +@sql_count_checker(query_count=1, union_count=1) def test_concat_none_index_name(index1, index2): - df1 = pd.DataFrame([11], columns=["A"], index=index1) - df2 = pd.DataFrame([22], columns=["B"], index=index2) - _concat_operation([df1, df2]), + df1 = native_pd.DataFrame([11], columns=["A"], index=index1) + df2 = native_pd.DataFrame([22], columns=["B"], index=index2) + native_objs = [df1, df2] + snow_objs = [pd.DataFrame(obj) for obj in native_objs] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2]), + _concat_operation(snow_objs, native_objs), ) -@sql_count_checker(query_count=5, union_count=1) +@sql_count_checker(query_count=3, union_count=1) def test_concat_from_file(resources_path): test_files = TestFiles(resources_path) + df1 = native_pd.read_csv(test_files.test_concat_file1_csv) + df2 = native_pd.read_csv(test_files.test_concat_file1_csv) + native_objs = [df1, df2] df1 = pd.read_csv(test_files.test_concat_file1_csv) df2 = pd.read_csv(test_files.test_concat_file1_csv) + snow_objs = [df1, df2] eval_snowpark_pandas_result( "pd", "native_pd", - _concat_operation([df1, df2]), + _concat_operation(snow_objs, native_objs), ) From 66d634c7bd261e4e09766494637ef8daefc5efe7 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Thu, 22 Aug 2024 18:14:14 -0700 Subject: [PATCH 09/42] Update tests --- src/snowflake/snowpark/modin/pandas/series.py | 6 +- tests/integ/modin/frame/test_assign.py | 30 +++--- tests/integ/modin/frame/test_iloc.py | 2 + tests/integ/modin/frame/test_insert.py | 5 +- tests/integ/modin/frame/test_join.py | 42 ++++---- tests/integ/modin/frame/test_loc.py | 4 +- tests/integ/modin/frame/test_merge.py | 98 +++++++++---------- tests/integ/modin/frame/test_reindex.py | 6 +- tests/integ/modin/frame/test_to_snowflake.py | 6 +- tests/integ/modin/frame/test_where.py | 24 ++--- .../modin/pivot/test_pivot_table_single.py | 2 +- tests/integ/modin/resample/test_resample.py | 39 +++----- .../modin/resample/test_resample_asfreq.py | 4 +- .../modin/resample/test_resample_fillna.py | 17 ++-- tests/integ/modin/series/test_empty.py | 2 +- tests/integ/modin/series/test_iloc.py | 1 + tests/integ/modin/series/test_loc.py | 4 +- tests/integ/modin/series/test_reindex.py | 14 +-- tests/integ/modin/series/test_rename.py | 4 +- tests/integ/modin/series/test_sort_values.py | 2 +- tests/integ/modin/series/test_to_snowflake.py | 8 +- tests/integ/modin/series/test_where.py | 10 +- 22 files changed, 164 insertions(+), 166 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index f59c1a7939..5c36afe5ab 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -193,8 +193,10 @@ def __init__( ) )._query_compiler if isinstance(index, Index): - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler + query_compiler = ( + query_compiler.create_qc_with_data_and_index_joined_on_index( + index._query_compiler + ) ) self._query_compiler = query_compiler.columnarize() if name is not None: diff --git a/tests/integ/modin/frame/test_assign.py b/tests/integ/modin/frame/test_assign.py index b0da2a110b..5fb54b9d44 100644 --- a/tests/integ/modin/frame/test_assign.py +++ b/tests/integ/modin/frame/test_assign.py @@ -17,7 +17,7 @@ ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_assign_basic_series(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -36,7 +36,7 @@ def assign_func(df): eval_snowpark_pandas_result(snow_df, native_df, assign_func) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize( "index", [[2, 1, 0], [4, 5, 6]], ids=["reversed_index", "different_index"] ) @@ -60,8 +60,8 @@ def assign_func(df): @pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"]) def test_assign_basic_non_pandas_object(new_col_value): - join_count = 2 if isinstance(new_col_value, list) else 0 - with SqlCounter(query_count=2, join_count=join_count): + join_count = 4 if isinstance(new_col_value, list) else 1 + with SqlCounter(query_count=1, join_count=join_count): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=native_pd.Index(list("abc"), name="columns"), @@ -74,11 +74,11 @@ def test_assign_basic_non_pandas_object(new_col_value): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_assign_invalid_long_column_length_negative(): # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3. # Snowpark pandas on the other hand, just truncates the last element of the new column so that it is the correct length. If we wanted - # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame in order to confirm lengths are correct + # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame to confirm lengths are correct # and error otherwise. snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -98,11 +98,11 @@ def test_assign_invalid_long_column_length_negative(): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_assign_invalid_short_column_length_negative(): # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3. # Snowpark pandas on the other hand, just broadcasts the last element of the new column so that it is filled. If we wanted - # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame in order to confirm lengths are correct + # to error and match pandas behavior, we'd need to eagerly materialize the DataFrame to confirm lengths are correct # and error otherwise. snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -122,7 +122,7 @@ def test_assign_invalid_short_column_length_negative(): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_assign_short_series(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -136,7 +136,7 @@ def test_assign_short_series(): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize( "index", [[1, 0], [4, 5]], ids=["reversed_index", "different_index"] ) @@ -153,7 +153,7 @@ def test_assign_short_series_mismatched_index(index): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize( "callable_fn", [lambda x: x["a"], lambda x: x["a"] + x["b"]], @@ -172,7 +172,7 @@ def test_assign_basic_callable(callable_fn): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_assign_chained_callable(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -190,7 +190,7 @@ def test_assign_chained_callable(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_assign_chained_callable_wrong_order(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -212,7 +212,7 @@ def test_assign_chained_callable_wrong_order(): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_assign_self_columns(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -226,7 +226,7 @@ def test_assign_self_columns(): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_overwrite_columns_via_assign(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index dd09068383..c79d5eb8ba 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -2710,6 +2710,7 @@ def test_df_iloc_set_with_multi_index( native_items.columns = pd.MultiIndex.from_tuples(item_columns) if row_key_index: + expected_join_count += 1 snow_row_key = pd.Series(row_key, index=pd.Index(row_key_index)) native_row_key = native_pd.Series(row_key, index=pd.Index(row_key_index)) else: @@ -2717,6 +2718,7 @@ def test_df_iloc_set_with_multi_index( native_row_key = row_key if col_key_index: + expected_join_count += 1 snow_col_key = pd.Series(col_key, index=pd.Index(col_key_index)) native_col_key = native_pd.Series(col_key, index=pd.Index(col_key_index)) else: diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index 258d4d2e64..dd8a26fc54 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -723,11 +723,12 @@ def test_insert_multiindex_column_negative(snow_df, columns, insert_label): [["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True], ], ) -# Two extra queries to convert index to native pandas when creating snowpark pandas dataframes -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_insert_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch ): + # Two of the three joins come from creating the DataFrame with non-Snowpark pandas data + # and a Snowpark pandas Index. The third join is from the insert operation. data = list(range(5)) data1 = {"foo": data} data2 = {"bar": [val * 10 for val in data]} diff --git a/tests/integ/modin/frame/test_join.py b/tests/integ/modin/frame/test_join.py index 91500189d1..253d8d7049 100644 --- a/tests/integ/modin/frame/test_join.py +++ b/tests/integ/modin/frame/test_join.py @@ -13,14 +13,16 @@ @pytest.fixture def left(): - return pd.DataFrame( + return native_pd.DataFrame( {"a": [1, 1, 0, 4]}, index=native_pd.Index([2, 1, 0, 3], name="li") ) @pytest.fixture def right(): - return pd.DataFrame({"b": [300, 100, 200]}, index=pd.Index([3, 1, 2], name="ri")) + return native_pd.DataFrame( + {"b": [300, 100, 200]}, index=pd.Index([3, 1, 2], name="ri") + ) @pytest.fixture(params=["left", "inner", "right", "outer"]) @@ -41,6 +43,7 @@ def sort(request): @sql_count_checker(query_count=2, join_count=2) def test_join_index_to_index(left, right, how, sort): + left, right = pd.DataFrame(left), pd.DataFrame(right) result = left.join(right, how=how, sort=sort) expected = left.merge(right, left_index=True, right_index=True, how=how, sort=sort) assert_frame_equal(result, expected) @@ -48,18 +51,19 @@ def test_join_index_to_index(left, right, how, sort): @sql_count_checker(query_count=2, join_count=2) def test_join_column_to_index(left, right, how, sort): + left, right = pd.DataFrame(left), pd.DataFrame(right) result = left.join(right, on="a", how=how, sort=sort) expected = left.merge(right, left_on="a", right_index=True, how=how, sort=sort) assert_frame_equal(result, expected) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_join_list_with_on_negative(left, right): eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), lambda df: df.join( - [right if isinstance(df, pd.DataFrame) else right.to_pandas()], on="a" + [pd.DataFrame(right) if isinstance(df, pd.DataFrame) else right], on="a" ), expect_exception=True, expect_exception_type=ValueError, @@ -97,6 +101,7 @@ def test_join_suffix_on_list_negative(): ) @sql_count_checker(query_count=2, join_count=2) def test_join_overlapping_columns(left, lsuffix, rsuffix): + left = pd.DataFrame(left) result = left.join(left, how="left", lsuffix=lsuffix, rsuffix=rsuffix) expected = left.merge( left, how="left", left_index=True, right_index=True, suffixes=(lsuffix, rsuffix) @@ -104,11 +109,11 @@ def test_join_overlapping_columns(left, lsuffix, rsuffix): assert_frame_equal(result, expected) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_join_overlapping_columns_negative(left): eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), lambda df: df.join(df), expect_exception=True, expect_exception_type=ValueError, @@ -116,11 +121,11 @@ def test_join_overlapping_columns_negative(left): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_join_invalid_how_negative(left): eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), lambda df: df.join(df, how="full_outer_join"), expect_exception=True, expect_exception_type=ValueError, @@ -130,21 +135,20 @@ def test_join_invalid_how_negative(left): @sql_count_checker(query_count=2, join_count=2) def test_join_with_series(left): + left = pd.DataFrame(left) right = pd.Series([1, 0, 2], name="s") result = left.join(right) expected = left.merge(right, left_index=True, right_index=True, how="left") assert_frame_equal(result, expected) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_join_unnamed_series_negative(left): - right = pd.Series([1, 0, 2]) + right = native_pd.Series([1, 0, 2]) eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), - lambda df: df.join( - right if isinstance(df, pd.DataFrame) else right.to_pandas() - ), + lambda df: df.join(pd.Series(right) if isinstance(df, pd.DataFrame) else right), expect_exception=True, expect_exception_type=ValueError, expect_exception_match="Other Series must have a name", @@ -155,12 +159,13 @@ def test_join_unnamed_series_negative(left): def test_join_unnamed_series_in_list_negative(left): right = pd.Series([1, 0, 2]) with pytest.raises(ValueError, match="Other Series must have a name"): - left.join([right]) + pd.DataFrame(left).join([right]) @sql_count_checker(query_count=2, join_count=4) def test_join_list_mixed(left, right): # Join a DataFrame with a list containing both a Series and a DataFrame + left, right = pd.DataFrame(left), pd.DataFrame(right) series = pd.Series([1, 2, 3], name="s") other = [right, series] result = left.join(other) @@ -170,6 +175,7 @@ def test_join_list_mixed(left, right): @sql_count_checker(query_count=4, join_count=4) def test_join_empty_rows(left, right, how): + left, right = pd.DataFrame(left), pd.DataFrame(right) empty_df = pd.DataFrame(columns=["x", "y"]) # empty on left result = left.join(empty_df, how=how) @@ -183,6 +189,7 @@ def test_join_empty_rows(left, right, how): @sql_count_checker(query_count=4, join_count=4) def test_join_empty_columns(left, right, how): + left, right = pd.DataFrame(left), pd.DataFrame(right) empty_df = pd.DataFrame(native_pd.Index([1, 2, 3])) # empty on left result = left.join(empty_df, how=how) @@ -203,11 +210,12 @@ def test_join_different_levels_negative(left): with pytest.raises( ValueError, match="Can not merge objects with different column levels" ): - left.join(right) + pd.DataFrame(left).join(right) @sql_count_checker(query_count=2, join_count=2) def test_cross_join(left, right): + left, right = pd.DataFrame(left), pd.DataFrame(right) result = left.join(right, how="cross") expected = left.merge(right, how="cross") assert_frame_equal(result, expected) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 1012a0d395..68991b3cf1 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -1727,7 +1727,7 @@ def test_df_loc_get_key_bool_series_with_unaligned_and_distinct_indices( # One extra query for series init converting index to native pandas when creating series_key -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_key_bool_series_with_unaligned_and_duplicate_indices(): key = [True] * 5 # index can have null values and duplicates @@ -2697,7 +2697,7 @@ def test_empty_df_loc_set_series_and_list(native_item): else native_item ) - expected_join_count = 1 if isinstance(native_item, native_pd.Series) else 2 + expected_join_count = 2 if isinstance(native_item, native_pd.Series) else 4 def setitem_op(df): item = native_item if isinstance(df, native_pd.DataFrame) else snow_item diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 7ac88042e7..80df6bc516 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -213,7 +213,7 @@ def _verify_merge( left_index: If True, use index from left DataFrame as join keys. right_index: If True, use index from right DataFrame as join keys. force_output_column_order: If provided, reorder native result using this list. - indicator: If true include indicator column. + indicator: If True, include indicator column. Returns: None @@ -276,13 +276,13 @@ def _verify_merge( @pytest.mark.parametrize("on", ["A", "B", ["A", "B"], ("A", "B")]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on(left_df, right_df, on, how, sort): _verify_merge(left_df, right_df, how, on=on, sort=sort) @pytest.mark.parametrize("on", ["left_i", "right_i"]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_index_columns(left_df, right_df, how, on, sort): # Change left_df to: columns=["right_i", "B", "left_c", "left_d"] index=["left_i"] left_df = left_df.rename(columns={"A": "right_i"}) @@ -361,7 +361,7 @@ def test_join_type_mismatch_diff_with_native_pandas(index1, index2, expected_res @pytest.mark.parametrize("on", ["A", "B", "C"]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_index_columns_with_multiindex(left_df, right_df, how, on, sort): # Change left_df to: columns = ["C", "left_d"] index = ["A", "B"] left_df = left_df.rename(columns={"left_c": "C"}).set_index(["A", "B"]) @@ -370,7 +370,7 @@ def test_merge_on_index_columns_with_multiindex(left_df, right_df, how, on, sort _verify_merge(left_df, right_df, how, on=on, sort=sort) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_multiindex_with_non_multiindex(left_df, right_df, how, sort): # Change left_df to: columns = ["A", "B"] index = ["left_c", "left_d"] left_df = left_df.set_index(["left_c", "left_d"]) @@ -392,29 +392,29 @@ def test_merge_on_multiindex_with_non_multiindex(left_df, right_df, how, sort): (["A", "left_i"], ["B", "right_i"]), # Mix of index and data join keys ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_left_on_right_on(left_df, right_df, how, left_on, right_on, sort): _verify_merge(left_df, right_df, how, left_on=left_on, right_on=right_on, sort=sort) @pytest.mark.parametrize("left_on", ["left_i", "A", "B"]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_left_on_right_index(left_df, right_df, how, left_on, sort): _verify_merge(left_df, right_df, how, left_on=left_on, right_index=True, sort=sort) @pytest.mark.parametrize("right_on", ["right_i", "A", "B"]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_left_index_right_on(left_df, right_df, how, right_on, sort): _verify_merge(left_df, right_df, how, left_index=True, right_on=right_on, sort=sort) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_index_single_index(left_df, right_df, how, sort): _verify_merge(left_df, right_df, how, left_index=True, right_index=True, sort=sort) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_index_multiindex_common_labels(left_df, right_df, how, sort): left_df = left_df.set_index("A", append=True) # index columns ['left_i', 'A'] right_df = right_df.set_index("A", append=True) # index columns ['right_i', 'A'] @@ -444,7 +444,7 @@ def test_merge_on_index_multiindex_common_labels_with_none( ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_on_index_multiindex_equal_labels(left_df, right_df, how, sort): # index columns ['A', 'B] left_df = left_df.set_index(["A", "B"]) @@ -463,7 +463,7 @@ def test_merge_left_index_right_index_single_to_multi(left_df, right_df, how, so if how == "inner" and sort is False: pytest.skip("pandas bug: https://github.com/pandas-dev/pandas/issues/55774") else: - with SqlCounter(query_count=3, join_count=1): + with SqlCounter(query_count=3, join_count=5): _verify_merge( left_df, right_df, @@ -489,7 +489,7 @@ def test_merge_left_index_right_index_single_to_multi(left_df, right_df, how, so .merge(right_df.to_pandas(), how=how, on="left_i", sort=sort) .reset_index(drop=True) ) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=3): assert_snowpark_pandas_equal_to_pandas( snow_res.reset_index(drop=True), native_res ) @@ -500,7 +500,7 @@ def test_merge_left_index_right_index_multi_to_single(left_df, right_df, how, so "right_i", append=True ) # index columns ['left_i', 'right_i'] if how in ("left", "inner"): - with SqlCounter(query_count=3, join_count=1): + with SqlCounter(query_count=3, join_count=5): _verify_merge( left_df, right_df, how=how, left_index=True, right_index=True, sort=sort ) @@ -519,13 +519,13 @@ def test_merge_left_index_right_index_multi_to_single(left_df, right_df, how, so .merge(right_df.to_pandas(), how=how, on="right_i", sort=sort) .reset_index(drop=True) ) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=3): assert_snowpark_pandas_equal_to_pandas( snow_res.reset_index(drop=True), native_res ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_left_index_right_index_no_common_names_negative(left_df, right_df): left_df = left_df.set_index("B", append=True) # index columns ['left_i', 'B'] right_df = right_df.set_index("A", append=True) # index columns ['right_i', 'A'] @@ -543,7 +543,7 @@ def test_merge_left_index_right_index_no_common_names_negative(left_df, right_df ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_left_index_right_index_none_as_common_label_negative(left_df, right_df): # index columns [None, 'B'] left_df = left_df.reset_index(drop=True).set_index("B", append=True) @@ -563,7 +563,7 @@ def test_merge_left_index_right_index_none_as_common_label_negative(left_df, rig ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_cross(left_df, right_df, sort): eval_snowpark_pandas_result( left_df, @@ -587,7 +587,7 @@ def test_merge_cross(left_df, right_df, sort): {"left_index": True, "right_on": "A"}, ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_non_empty_with_empty(left_df, empty_df, how, kwargs, sort): _verify_merge(left_df, empty_df, how, sort=sort, **kwargs) @@ -601,7 +601,7 @@ def test_merge_non_empty_with_empty(left_df, empty_df, how, kwargs, sort): {"left_index": True, "right_on": "A"}, ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_empty_with_non_empty(empty_df, right_df, how, kwargs, sort): # Native pandas returns incorrect column order when left frame is empty. # https://github.com/pandas-dev/pandas/issues/51929 @@ -637,7 +637,7 @@ def test_merge_empty_with_non_empty(empty_df, right_df, how, kwargs, sort): (None, None, ["A", "B"], True, False), # left.num_index_levels != len(right_on) ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_mis_specified_negative( left_df, right_df, on, left_on, right_on, left_index, right_index ): @@ -666,7 +666,7 @@ def test_merge_mis_specified_negative( (None, None, None, False, True), # right_index is set to True ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_cross_mis_specified_negative( left_df, right_df, on, left_on, right_on, left_index, right_index ): @@ -704,7 +704,7 @@ def test_merge_cross_mis_specified_negative( (0.0, 0.0, {"suffixes": ("_x", None)}), ], ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_suffix(left_df, right_df, left_col, right_col, kwargs): left_df = left_df.rename(columns={"A": left_col}) right_df = right_df.rename(columns={"A": right_col}) @@ -720,7 +720,7 @@ def test_merge_suffix(left_df, right_df, left_col, right_col, kwargs): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_duplicate_suffix(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -734,7 +734,7 @@ def test_merge_duplicate_suffix(left_df, right_df): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_label_conflict_with_suffix(left_df, right_df): # Test the behavior when adding suffix crates a conflict with another label. # Note: This raises a warning in pandas 2.0 and will raise an error in future @@ -758,7 +758,7 @@ def test_merge_label_conflict_with_suffix(left_df, right_df): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_non_str_suffix(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -776,7 +776,7 @@ def test_merge_non_str_suffix(left_df, right_df): "suffixes", [(None, None), ("", None), (None, ""), ("", "")], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_empty_suffix_negative(left_df, right_df, suffixes): eval_snowpark_pandas_result( left_df, @@ -794,7 +794,7 @@ def test_merge_empty_suffix_negative(left_df, right_df, suffixes): "suffixes", [("a", "b", "c"), tuple("a")], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_suffix_length_error_negative(left_df, right_df, suffixes): eval_snowpark_pandas_result( left_df, @@ -808,7 +808,7 @@ def test_merge_suffix_length_error_negative(left_df, right_df, suffixes): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_duplicate_labels(left_df, right_df): # Change left_df columns to ["A", "B", "left_c", "left_c"] # 'left_c' is a duplicate label. @@ -824,7 +824,7 @@ def test_merge_duplicate_labels(left_df, right_df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_duplicate_join_keys_negative(left_df, right_df): # Change left_df columns to ["A", "B", "left_c", "left_c"] # 'left_c' is a duplicate label. This can not be used as join key. @@ -860,14 +860,14 @@ def test_merge_with_self(): @pytest.mark.parametrize("on", ["A", "B"]) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=4, join_count=4) def test_merge_with_series(left_df, right_df, how, on, sort): native_series = right_df.to_pandas()[on] snow_series = pd.Series(native_series) _verify_merge(left_df, snow_series, how=how, on=on, sort=sort) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_merge_with_unnamed_series_negative(left_df): native_series = native_pd.Series([1, 2, 3]) snow_series = pd.Series(native_series) @@ -923,7 +923,7 @@ def test_merge_outer_with_nan(dtype): # Two extra queries to convert to native index for dataframe constructor when creating left and right -@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_different_index_names(): left = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="c")) right = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="d")) @@ -938,13 +938,13 @@ def test_merge_different_index_names(): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_no_join_keys(left_df, right_df, how, sort): _verify_merge(left_df, right_df, how, sort=sort) @pytest.mark.parametrize("left_name, right_name", [("left_a", "right_a"), (1, "1")]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_no_join_keys_negative(left_name, right_name, left_df, right_df): left_df = left_df.rename(columns={"A": left_name, "B": "left_b"}) right_df = right_df.rename(columns={"A": right_name, "B": "right_b"}) @@ -979,7 +979,7 @@ def test_merge_no_join_keys_common_index_negative(left_df, right_df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_no_join_keys_common_index_with_data_negative(left_df, right_df): left_df = left_df.rename(columns={"A": "left_a", "B": "left_b"}) right_df = right_df.rename(columns={"A": "right_a", "B": "left_i"}) @@ -1003,16 +1003,16 @@ def test_merge_no_join_keys_common_index_with_data_negative(left_df, right_df): @pytest.mark.parametrize( "left_on, right_on, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "x", "y"]), "right_d", 5, 2), - ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 5, 2), - ("left_d", np.array(["a", "b", "c", "x", "y"]), 5, 2), - (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 5, 2), - (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 5, 2), # tuple + (np.array(["a", "b", "c", "x", "y"]), "right_d", 5, 7), + ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 5, 7), + ("left_d", np.array(["a", "b", "c", "x", "y"]), 5, 7), + (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 5, 7), + (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 5, 7), # tuple ( np.array(["a", "b", "c", "x", "y"]), np.array(["x", "y", "c", "a", "b"]), 7, - 3, + 9, ), ], ) @@ -1023,7 +1023,7 @@ def test_merge_on_array_like_keys( _verify_merge(left_df, right_df, how=how, left_on=left_on, right_on=right_on) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_on_array_like_keys_conflict_negative(left_df, right_df): left_on = np.array(["a", "b", "c", "x", "y"]) right_on = np.array(["x", "y", "c", "a", "b"]) @@ -1050,7 +1050,7 @@ def test_merge_on_array_like_keys_conflict_negative(left_df, right_df): np.array(["a", "b", "c", "a", "b", "c"]), # too long ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_merge_on_array_like_keys_length_mismatch_negative(left_df, right_df, left_on): # Native pandas raises # ValueError: The truth value of an array with more than one element is ambiguous @@ -1062,22 +1062,22 @@ def test_merge_on_array_like_keys_length_mismatch_negative(left_df, right_df, le left_df.merge(right_df, left_on=left_on, right_on="right_d") -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_with_indicator(left_df, right_df, how): _verify_merge(left_df, right_df, how, on="A", indicator=True) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_with_indicator_cross_join(left_df, right_df): _verify_merge(left_df, right_df, how="cross", indicator=True) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_merge_with_indicator_explicit_name(left_df, right_df): _verify_merge(left_df, right_df, "outer", on="A", indicator="indicator_col") -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_with_invalid_indicator_type_negative(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -1093,7 +1093,7 @@ def test_merge_with_invalid_indicator_type_negative(left_df, right_df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_with_indicator_explicit_name_negative(left_df, right_df): left_df = left_df.rename(columns={"left_c": "_merge"}) eval_snowpark_pandas_result( diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index 692fd66471..423e526fef 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -209,7 +209,7 @@ def perform_reindex(df): perform_reindex, ) - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(self, limit, method): @@ -248,7 +248,7 @@ def test_reindex_index_non_overlapping_index(self): snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) ) - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=1, join_count=2) def test_reindex_index_non_overlapping_datetime_index(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( @@ -273,7 +273,7 @@ def perform_reindex(df): snow_df, native_df, perform_reindex, check_freq=False ) - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=0) def test_reindex_index_non_overlapping_different_types_index_negative(self): date_index = pd.date_range("1/1/2010", periods=6, freq="D") snow_df = pd.DataFrame( diff --git a/tests/integ/modin/frame/test_to_snowflake.py b/tests/integ/modin/frame/test_to_snowflake.py index 3fbdc7e754..f688b9f1af 100644 --- a/tests/integ/modin/frame/test_to_snowflake.py +++ b/tests/integ/modin/frame/test_to_snowflake.py @@ -15,7 +15,7 @@ @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("index_labels", [None, ["my_index"]]) # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_to_snowflake_index(test_table_name, index, index_labels): df = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index") @@ -179,7 +179,7 @@ def test_to_snowflake_column_with_quotes(session, test_table_name): # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_to_snowflake_index_label_none_raises(test_table_name): df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -197,7 +197,7 @@ def test_to_snowflake_index_label_none_raises(test_table_name): # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_to_snowflake_data_label_none_raises(test_table_name): df = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index") diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 48cfc7bba5..006b7e76fb 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -443,7 +443,7 @@ def test_dataframe_where_not_implemented(test_data, test_cond, test_others): snow_dfs[0].where(snow_dfs[1], snow_dfs[2], axis=1) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_dataframe_where_cond_is_array(caplog): data = [[1, 2], [3, 4]] cond = np.array([[True, False], [False, True]]) @@ -506,7 +506,7 @@ def __call__(self, df): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_dataframe_where_other_is_array(): data = [[1, 3], [2, 4]] other = np.array([[99, -99], [101, -101]]) @@ -549,7 +549,7 @@ def test_dataframe_where_sizes_do_not_match_negative_test(test_data, test_cond): snow_df.where(snow_cond_df) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=3) def test_dataframe_where_with_np_array_cond(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -576,8 +576,7 @@ def test_dataframe_where_with_np_array_cond(): ) -# one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_where_with_np_array_cond_mismatched_labels(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -602,7 +601,7 @@ def test_dataframe_where_with_np_array_cond_mismatched_labels(): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_where_with_dataframe_cond_single_index_different_names(): data = [1, 2, 3] cond = [False, True, False] @@ -630,7 +629,7 @@ def test_dataframe_where_with_dataframe_cond_single_index_different_names(): # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_where_with_dataframe_cond_single_index_different_names_2(): data = [1, 2, 3] cond = [False, True, False] @@ -692,7 +691,7 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): native_other = other snow_other = other - expected_join_count = 1 if isinstance(other, int) else 2 + expected_join_count = 2 if isinstance(other, int) else 3 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( snow_df, @@ -703,8 +702,9 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): ) -# 3 extra queries to convert index to native pandas when creating the 3 snowpark pandas dataframe -@sql_count_checker(query_count=4, join_count=2) +# 3 extra join queries to create the 3 snowpark pandas dataframe with non-Snowpark pandas data +# and a Snowpark pandas Index. +@sql_count_checker(query_count=1, join_count=5) def test_dataframe_where_with_duplicated_index_unaligned(): data = [3, 4, 5, 2] df_index = pd.Index([2, 1, 2, 3], name="index") @@ -902,7 +902,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -947,7 +947,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=3, join_count=2, union_count=1) +@sql_count_checker(query_count=2, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/pivot/test_pivot_table_single.py b/tests/integ/modin/pivot/test_pivot_table_single.py index 9feab0c09f..e53b553090 100644 --- a/tests/integ/modin/pivot/test_pivot_table_single.py +++ b/tests/integ/modin/pivot/test_pivot_table_single.py @@ -226,7 +226,7 @@ def test_pivot_table_with_sum_and_count_null_and_empty_values_matching_behavior_ # One extra query to convert to native pandas in dataframe constructor when creating snow_df -@sql_count_checker(query_count=6, join_count=1) +@sql_count_checker(query_count=5, join_count=2) def test_pivot_on_inline_data_using_temp_table(): # Create a large dataframe of inlined data that will spill to a temporary table. snow_df = pd.DataFrame( diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index 63c72452c1..af99185294 100644 --- a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -32,8 +32,7 @@ def randomword(length): @freq @interval @agg_func -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -46,8 +45,7 @@ def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_date_before_snowflake_alignment_date(): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 date_data = native_pd.to_datetime( @@ -68,8 +66,7 @@ def test_resample_date_before_snowflake_alignment_date(): @interval -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_date_wraparound_snowflake_alignment_date(interval): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 date_data = native_pd.to_datetime( @@ -92,8 +89,7 @@ def test_resample_date_wraparound_snowflake_alignment_date(interval): @agg_func @freq -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_missing_data_upsample(agg_func, freq): # this tests to make sure that missing resample bins will be filled in. date_data = native_pd.date_range("2020-01-01", periods=13, freq=f"1{freq}").delete( @@ -107,8 +103,7 @@ def test_resample_missing_data_upsample(agg_func, freq): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_duplicated_timestamps_downsample(): date_data = native_pd.to_datetime( [ @@ -127,8 +122,7 @@ def test_resample_duplicated_timestamps_downsample(): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_duplicated_timestamps(): date_data = native_pd.to_datetime( [ @@ -164,11 +158,10 @@ def test_resample_series(freq, interval, agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @pytest.mark.parametrize( "agg_func", ["max", "min", "mean", "median", "sum", "std", "var"] ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_numeric_only(agg_func): eval_snowpark_pandas_result( *create_test_dfs( @@ -180,9 +173,8 @@ def test_resample_numeric_only(agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_df_with_nan(agg_func): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( @@ -210,8 +202,7 @@ def test_resample_ser_with_nan(agg_func): @agg_func -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_single_resample_bin(agg_func): eval_snowpark_pandas_result( *create_test_dfs( @@ -224,8 +215,7 @@ def test_resample_single_resample_bin(agg_func): @agg_func -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_index_with_nan(agg_func): datecol = native_pd.to_datetime( ["2020-01-01", "2020-01-03", "2020-01-05", np.nan, "2020-01-09", np.nan] @@ -240,8 +230,7 @@ def test_resample_index_with_nan(agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_df_getitem(): eval_snowpark_pandas_result( *create_test_dfs( @@ -264,8 +253,7 @@ def test_resample_ser_getitem(): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_date_trunc_day(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( @@ -278,8 +266,7 @@ def test_resample_date_trunc_day(): ) -# One extra query to convert index to native pandas for dataframe constructor -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_date_trunc_hour(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( diff --git a/tests/integ/modin/resample/test_resample_asfreq.py b/tests/integ/modin/resample/test_resample_asfreq.py index 50e9646a4c..fc60f62621 100644 --- a/tests/integ/modin/resample/test_resample_asfreq.py +++ b/tests/integ/modin/resample/test_resample_asfreq.py @@ -19,7 +19,7 @@ @freq @interval -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_asfreq_no_method(freq, interval): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -32,7 +32,7 @@ def test_asfreq_no_method(freq, interval): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_asfreq_ffill(): eval_snowpark_pandas_result( *create_test_dfs( diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index 6b47368eb5..6be0388f27 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -15,10 +15,9 @@ agg_func = pytest.mark.parametrize("agg_func", ["ffill", "bfill"]) -# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_fill(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -67,10 +66,9 @@ def test_resample_fill_ser(interval, agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_one_gap(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -112,10 +110,9 @@ def resample_ffill_ser_one_gap(agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -162,10 +159,9 @@ def test_resample_ffill_ser_missing_in_middle(interval, agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_ffilled_with_none(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -187,10 +183,9 @@ def test_resample_ffill_ffilled_with_none(interval, agg_func): ) -# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_large_gaps(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -212,7 +207,7 @@ def test_resample_ffill_large_gaps(interval, agg_func): @interval @pytest.mark.parametrize("method", ["ffill", "pad", "backfill", "bfill"]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_fillna(interval, method): datecol = native_pd.to_datetime( [ diff --git a/tests/integ/modin/series/test_empty.py b/tests/integ/modin/series/test_empty.py index 7040fa43fd..a30a69116c 100644 --- a/tests/integ/modin/series/test_empty.py +++ b/tests/integ/modin/series/test_empty.py @@ -44,7 +44,7 @@ def test_series_empty(args, kwargs): ) -@sql_count_checker(query_count=7) +@sql_count_checker(query_count=5, join_count=2) def test_empty_series_type(): def check_dtype(series): assert series.to_pandas().dtype == series.dtype diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index ac02f368dd..2f9444b9ae 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -823,6 +823,7 @@ def test_df_iloc_set_with_multi_index( native_items.index = pd.MultiIndex.from_tuples(item_index) if row_key_index: + expected_join_count += 1 snow_row_key = pd.Series(row_key, index=pd.Index(row_key_index)) native_row_key = native_pd.Series(row_key, index=pd.Index(row_key_index)) else: diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index aa16a841f2..d60b9eb26a 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -251,7 +251,7 @@ def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_in [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_loc_get_key_bool_series_with_unaligned_and_distinct_indices( key, use_default_index ): @@ -343,7 +343,7 @@ def test_df_loc_get_callable_key(row): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_loc_get_key_bool_series_with_unaligned_and_duplicate_indices(): # index can have null values and duplicates key = [True] * 5 diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 7c2bbba906..9450112ae5 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -259,17 +259,17 @@ def perform_reindex(series): ) -@sql_count_checker(query_count=2, join_count=1) -@pytest.mark.parametrize("limit", [None, 1, 2, 100]) -@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) +# @sql_count_checker(query_count=2, join_count=1) +@pytest.mark.parametrize("limit", [None]) # , 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill"]) # , "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(limit, method): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_series = native_pd.Series( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + {"1/1/2020": [100, 101, np.nan, 100, 89, 88]}, index=date_index ) date_index = pd.date_range("1/1/2010", periods=6, freq="D") snow_series = pd.Series( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + {"1/1/2020": [100, 101, np.nan, 100, 89, 88]}, index=date_index ) def perform_reindex(series): @@ -300,7 +300,7 @@ def test_reindex_index_non_overlapping_index(): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_reindex_index_non_overlapping_datetime_index(): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_series = native_pd.Series( @@ -326,7 +326,7 @@ def perform_reindex(series): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_reindex_index_non_overlapping_different_types_index_negative(): date_index = pd.date_range("1/1/2010", periods=6, freq="D") snow_series = pd.Series( diff --git a/tests/integ/modin/series/test_rename.py b/tests/integ/modin/series/test_rename.py index 8dda080d95..4ccf29706f 100644 --- a/tests/integ/modin/series/test_rename.py +++ b/tests/integ/modin/series/test_rename.py @@ -52,7 +52,7 @@ def test_rename_partial_dict(self): renamed = ser.rename({"b": "foo", "d": "bar"}) assert_index_equal(renamed.index, native_pd.Index(["a", "foo", "c", "bar"])) - @sql_count_checker(query_count=1, join_count=0) + @sql_count_checker(query_count=0) def test_rename_retain_index_name(self): # index with name renamer = Series( @@ -130,7 +130,7 @@ class MyIndexer: ser.rename(ix, inplace=True) assert ser.name is ix - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=0) def test_rename_callable(self): # GH 17407 ser = Series(range(1, 6), index=Index(range(2, 7), name="IntIndex")) diff --git a/tests/integ/modin/series/test_sort_values.py b/tests/integ/modin/series/test_sort_values.py index e966409dfc..b147377f75 100644 --- a/tests/integ/modin/series/test_sort_values.py +++ b/tests/integ/modin/series/test_sort_values.py @@ -33,7 +33,7 @@ def snow_series(snow_df): @pytest.mark.parametrize("by", ["A", "B", "a", "b"]) @pytest.mark.parametrize("ascending", [True, False]) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=3, join_count=3) def test_sort_values(snow_df, by, ascending): snow_series = snow_df[by] native_series = snow_series.to_pandas() diff --git a/tests/integ/modin/series/test_to_snowflake.py b/tests/integ/modin/series/test_to_snowflake.py index 968a96e33d..92b428f70e 100644 --- a/tests/integ/modin/series/test_to_snowflake.py +++ b/tests/integ/modin/series/test_to_snowflake.py @@ -29,7 +29,7 @@ def _verify_num_rows(session, table_name: str, expected: int) -> None: @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("index_labels", [None, ["my_index"]]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_to_snowflake_index(test_table_name, snow_series, index, index_labels): snow_series.to_snowflake( test_table_name, if_exists="replace", index=index, index_label=index_labels @@ -108,7 +108,7 @@ def test_to_snowflake_if_exists(session, test_table_name, snow_series): _verify_num_rows(session, test_table_name, 6) -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=4, join_count=1) def test_to_snowflake_if_exists_negative(session, test_table_name, snow_series): # Create a table. snow_series.to_snowflake(test_table_name, if_exists="fail", index=False) @@ -127,7 +127,7 @@ def test_to_snowflake_if_exists_negative(session, test_table_name, snow_series): @pytest.mark.parametrize("index_label", VALID_PANDAS_LABELS) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_to_snowflake_index_column_labels(index_label, test_table_name, snow_series): snow_series.to_snowflake( test_table_name, if_exists="replace", index=True, index_label=index_label @@ -136,7 +136,7 @@ def test_to_snowflake_index_column_labels(index_label, test_table_name, snow_ser @pytest.mark.parametrize("col_label", VALID_PANDAS_LABELS) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_to_snowflake_data_column_labels(col_label, test_table_name, snow_series): snow_series = snow_series.rename(col_label) snow_series.to_snowflake(test_table_name, if_exists="replace", index=False) diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py index 18fec0aadf..9f0c6d0f80 100644 --- a/tests/integ/modin/series/test_where.py +++ b/tests/integ/modin/series/test_where.py @@ -103,7 +103,7 @@ def test_series_where_index_no_names(): ) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=2) def test_series_where_with_np_array_cond(): data = [1, 2] cond = np.array([True, False]) @@ -114,7 +114,7 @@ def test_series_where_with_np_array_cond(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda df: df.where(cond)) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_where_with_series_cond_single_index_different_names(): data = [1, 2, 3] cond = [False, True, False] @@ -139,7 +139,7 @@ def test_series_where_with_series_cond_single_index_different_names(): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_where_with_duplicated_index_aligned(): data = [1, 2, 3] cond = [False, True, False] @@ -196,9 +196,11 @@ def test_series_where_with_lambda_cond_returns_singleton_should_fail(): @pytest.mark.parametrize( "other, sql_count, join_count", - [(lambda x: -x.iloc[0], 5, 3), (lambda x: x**2, 4, 2)], + [(lambda x: -x.iloc[0], 4, 10), (lambda x: x**2, 3, 8)], ) def test_series_where_with_lambda_other(other, sql_count, join_count): + # High join count due to creatinga Series with non-Snowpark pandas data + # and a Snowpark pandas Index. data = [1, 6, 7, 4] index = pd.Index(["a", "b", "c", "d"]) From 6a2cb79dd8472a6fdbdb9a1ad819ef812771e114 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 23 Aug 2024 14:09:06 -0700 Subject: [PATCH 10/42] added edge case logic, fix test query count --- .../snowpark/modin/pandas/dataframe.py | 16 ++++++++ src/snowflake/snowpark/modin/pandas/series.py | 29 +++++++++++--- tests/integ/modin/frame/test_mask.py | 13 ++++--- tests/integ/modin/frame/test_setitem.py | 2 +- .../integ/modin/groupby/test_groupby_apply.py | 22 +++-------- .../modin/groupby/test_groupby_negative.py | 6 +-- .../modin/groupby/test_groupby_series.py | 2 +- .../test_df_series_creation_with_index.py | 39 ++++++++++++++++++- tests/integ/modin/index/test_name.py | 8 ++-- tests/integ/modin/series/test_iloc.py | 5 +-- tests/integ/modin/series/test_loc.py | 2 +- tests/integ/modin/series/test_mask.py | 14 ++++--- tests/integ/modin/series/test_reindex.py | 6 +-- tests/integ/modin/test_concat.py | 1 - 14 files changed, 113 insertions(+), 52 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 609f5bf55e..f35e40373e 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -297,6 +297,22 @@ def __init__( for k, v in data.items() } + if all(len(v) == 1 for v in data.values()) and index is not None: + # Special case when creating: + # >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") + # A V + # none + # A 1 2 + # B 1 2 <--- the first row is copied into the rest of the rows. + # C 1 2 + # Recreate a 2-d array with the first row copied into the rest of the rows. + self._query_compiler = DataFrame( + data=[[v[0] for v in data.values()]] * len(index), + index=index, + columns=list(data.keys()), + )._query_compiler + return + new_index = index if isinstance(index, Index): # Skip turning this into a native pandas object here since this issues an extra query. diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 5c36afe5ab..7ac3172328 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -165,6 +165,26 @@ def __init__( ) # 3. Perform .loc[] on `data` to select the rows that are in `index`. query_compiler = data.loc[index]._query_compiler + + elif is_dict_like(data) and not isinstance(data, (pandas.Series, Series)): + if name is None: + name = MODIN_UNNAMED_SERIES_LABEL + # If the data is a dictionary, we need to convert it to a query compiler and set the index. + query_compiler = from_pandas( + pandas.DataFrame( + pandas.Series( + data=data, dtype=dtype, name=name, copy=copy, fastpath=fastpath + ) + ) + )._query_compiler + if index is not None: + index = index if isinstance(index, Index) else Index(index) + query_compiler = ( + query_compiler.create_qc_with_data_and_index_joined_on_index( + index._query_compiler + ) + ) + if query_compiler is None: # Defaulting to pandas if name is None: @@ -184,7 +204,7 @@ def __init__( pandas.DataFrame( pandas.Series( data=try_convert_index_to_native(data), - index=try_convert_index_to_native(new_index), + index=new_index, dtype=dtype, name=name, copy=copy, @@ -193,10 +213,9 @@ def __init__( ) )._query_compiler if isinstance(index, Index): - query_compiler = ( - query_compiler.create_qc_with_data_and_index_joined_on_index( - index._query_compiler - ) + # Performing set index to directly set the index column (joining on row-position instead of index). + query_compiler = query_compiler.set_index_from_series( + index.to_series()._query_compiler ) self._query_compiler = query_compiler.columnarize() if name is not None: diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 684d8ba434..2422edb736 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -437,7 +437,7 @@ def test_dataframe_mask_not_implemented(test_data, test_cond, test_others): snow_dfs[0].mask(snow_dfs[1], snow_dfs[2], axis=1) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_dataframe_mask_cond_is_array(caplog): data = [[1, 2], [3, 4]] cond = np.array([[True, False], [False, True]]) @@ -686,7 +686,7 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other): native_other = other snow_other = other - expected_join_count = 1 if isinstance(other, int) else 2 + expected_join_count = 2 if isinstance(other, int) else 3 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( snow_df, @@ -697,8 +697,9 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other): ) -# Three extra queries to convert to native index for dataframe constructor when creating the 3 snowpark pandas dataframes -@sql_count_checker(query_count=4, join_count=2) +# Three extra joins when creating the 3 snowpark pandas dataframes with non-Snowpark pandas +# data and Snowpark pandas Index. +@sql_count_checker(query_count=1, join_count=5) def test_dataframe_mask_with_duplicated_index_unaligned(): data = [3, 4, 5, 2] df_index = pd.Index([2, 1, 2, 3], name="index") @@ -866,7 +867,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -911,7 +912,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=3, join_count=2, union_count=1) +@sql_count_checker(query_count=2, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index cc5698b684..6152089f39 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -445,7 +445,7 @@ def setitem_helper(df): ], ) # 2 extra queries to convert to native pandas when creating the two snowpark pandas dataframes -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_df_setitem_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch ): diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index 4c85c1fd06..adfcd7f46b 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -537,7 +537,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: if group_keys else QUERY_COUNT_WITH_TRANSFORM_CHECK ), - join_count=JOIN_COUNT, + join_count=2, udtf_count=UDTF_COUNT, ): snow_result = operation(mdf) @@ -719,7 +719,7 @@ def groupby_apply_without_sort(df): with SqlCounter( query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=2, ): assert_snowpark_pandas_equal_to_pandas( groupby_apply_without_sort(snow_df).sort_values(), @@ -967,9 +967,9 @@ def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): @pytest.mark.parametrize("dropna", [True, False]) @sql_count_checker( # One extra query to convert index to native pandas in dataframe constructor to create test dataframes - query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK + 1, + query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=2, ) @pytest.mark.parametrize("index", [[2.0, np.nan, 2.0, 1.0], [np.nan] * 4]) def test_dropna(self, dropna, index): @@ -1082,19 +1082,9 @@ def test_dataframe_groupby_getitem(self, by, func, dropna, group_keys, sort): # (pd.NA, k1) that we cannot serialize. pytest.xfail(reason="SNOW-1229760") with SqlCounter( - # one additional query for converting index to native pandas in dataframe constructor - query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK + 1 - if not group_keys - and func - in ( - get_dataframe_from_numeric_series, - get_series_from_numeric_series, - series_transform_returns_frame, - series_transform_returns_series, - ) - else QUERY_COUNT_WITHOUT_TRANSFORM_CHECK + 1, + query_count=6 if group_keys is False else 5, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=2, ): eval_snowpark_pandas_result( *create_test_dfs( diff --git a/tests/integ/modin/groupby/test_groupby_negative.py b/tests/integ/modin/groupby/test_groupby_negative.py index eeddd0e6c2..c616a1d019 100644 --- a/tests/integ/modin/groupby/test_groupby_negative.py +++ b/tests/integ/modin/groupby/test_groupby_negative.py @@ -164,7 +164,7 @@ def test_groupby_min_max_invalid_non_numeric_column( agg_func(df).to_pandas() -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_groupby_series_numeric_only_true(series_str): message = "SeriesGroupBy does not implement numeric_only" eval_snowpark_pandas_result( @@ -177,7 +177,7 @@ def test_groupby_series_numeric_only_true(series_str): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_groupby_as_index_raises(series_str): eval_snowpark_pandas_result( series_str, @@ -254,7 +254,7 @@ def test_groupby_as_index_false_axis_1_raises(df_multi): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_groupby_series_agg_dict_like_input_raise(series_str): eval_snowpark_pandas_result( series_str, diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py index 7756f8b620..ae8ae0926d 100644 --- a/tests/integ/modin/groupby/test_groupby_series.py +++ b/tests/integ/modin/groupby/test_groupby_series.py @@ -153,7 +153,7 @@ def test_groupby_agg_series_named_agg(aggs, sort): @pytest.mark.parametrize("numeric_only", [False, None]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_groupby_series_numeric_only(series_str, numeric_only): native_series = series_str.to_pandas() eval_snowpark_pandas_result( diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 5b2571ccca..a1512d8ced 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -322,6 +322,7 @@ def test_create_df_with_df_as_data_and_index_as_index(native_df, native_index): native_pd.DataFrame([]), native_pd.Index(["A", "V"], name="non-empty index"), ), # empty df and index + ({}, native_pd.Index([10, 0, 1], name="non-empty index")), ], ) @sql_count_checker(query_count=1, join_count=2) @@ -407,6 +408,14 @@ def test_create_df_with_empty_df_as_data_and_index_as_index(native_df, native_in native_pd.Index(["A", "V"], name="non-empty index"), ["A", "V"], ), # empty data, non-empty index and columns + ( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, # dict data should behave similar to DataFrame data + native_pd.Index([10, 0, 1], name="non-empty index"), + ["A", "C"], + ), ], ) @pytest.mark.parametrize("column_type", ["list", "index"]) @@ -421,9 +430,16 @@ def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( # One extra query is required to create the columns if it is an Index (column_type is "index"). native_columns = columns if column_type == "list" else native_pd.Index(columns) snow_columns = columns if column_type == "list" else pd.Index(columns) - snow_df = pd.DataFrame(native_df) + snow_df = ( + pd.DataFrame(native_df) + if isinstance(native_df, native_pd.DataFrame) + else native_df + ) snow_index = pd.Index(native_index) - with SqlCounter(query_count=1 if column_type == "list" else 2, join_count=2): + qc = 1 if column_type == "list" else 2 + qc += 1 if (isinstance(native_df, dict) and column_type == "index") else 0 + jc = 2 if isinstance(native_df, native_pd.DataFrame) else 1 + with SqlCounter(query_count=qc, join_count=jc): assert_frame_equal( pd.DataFrame(snow_df, index=snow_index, columns=native_columns), native_pd.DataFrame(native_df, index=native_index, columns=snow_columns), @@ -454,3 +470,22 @@ def test_create_df_with_df_index_negative(): match=re.escape("Shape of passed values is (3, 1), indices imply (2, 1)"), ): pd.DataFrame([1, 2, 3], index=[[1, 2], [3, 4], [5, 6]]) + + +@sql_count_checker(query_count=2, join_count=1) +def test_create_df_with_dict_as_data_and_index_as_index(): + """ + Special case when creating: + >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") # doctest: +SKIP + A V + none + A 1 2 + B 1 2 <--- the first row is copied into the rest of the rows. + C 1 2 + """ + data = {"A": [1], "V": [2]} + native_index = native_pd.Index(["A", "B", "C"]) + snow_index = pd.Index(native_index) + native_df = native_pd.DataFrame(data, index=native_index) + snow_df = pd.DataFrame(data, index=snow_index) + assert_frame_equal(snow_df, native_df) diff --git a/tests/integ/modin/index/test_name.py b/tests/integ/modin/index/test_name.py index 0397ed1546..387a76c358 100644 --- a/tests/integ/modin/index/test_name.py +++ b/tests/integ/modin/index/test_name.py @@ -95,7 +95,7 @@ def test_index_rename_copy(new_name): @pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_rename_inplace(new_name): # 1 query to create the DataFrame. # Create the DataFrame and the new index. @@ -117,7 +117,7 @@ def test_df_index_rename_inplace(new_name): @pytest.mark.parametrize("new_name", [None, "grade", ("grade",), ("A", "B")]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_rename_copy(new_name): # 1 query to create the DataFrame. # Create the DataFrame and the new index. @@ -183,7 +183,7 @@ def test_index_set_names_copy(new_name): @pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_set_names_inplace(new_name): # 1 query to create the DataFrame. # Create the DataFrame and the new index. @@ -213,7 +213,7 @@ def test_df_index_set_names_inplace(new_name): @pytest.mark.parametrize("new_name", [None, "grade", ["grade"], ("grade",)]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_df_index_set_names_copy(new_name): # 1 query to create the DataFrame. # Create the DataFrame and the new index. diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index 2f9444b9ae..b35681e4ee 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -823,9 +823,8 @@ def test_df_iloc_set_with_multi_index( native_items.index = pd.MultiIndex.from_tuples(item_index) if row_key_index: - expected_join_count += 1 - snow_row_key = pd.Series(row_key, index=pd.Index(row_key_index)) - native_row_key = native_pd.Series(row_key, index=pd.Index(row_key_index)) + snow_row_key = pd.Series(row_key, index=native_pd.Index(row_key_index)) + native_row_key = native_pd.Series(row_key, index=native_pd.Index(row_key_index)) else: snow_row_key = row_key native_row_key = row_key diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index d60b9eb26a..b745431df9 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -388,7 +388,7 @@ def test_series_loc_get_key_bool_series_with_unaligned_and_duplicate_indices(): ], # larger length ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_loc_get_key_bool_series_with_mismatch_index_len(key, use_default_index): if use_default_index: index = None diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py index 6c554a0358..2ef2465b58 100644 --- a/tests/integ/modin/series/test_mask.py +++ b/tests/integ/modin/series/test_mask.py @@ -103,7 +103,7 @@ def test_series_mask_index_no_names(): ) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=2) def test_series_mask_with_np_array_cond(): data = [1, 2] cond = np.array([True, False]) @@ -114,7 +114,7 @@ def test_series_mask_with_np_array_cond(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda df: df.mask(cond)) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_mask_with_series_cond_single_index_different_names(): data = [1, 2, 3] cond = [False, True, False] @@ -138,7 +138,7 @@ def test_series_mask_with_series_cond_single_index_different_names(): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_mask_with_duplicated_index_aligned(): data = [1, 2, 3] cond = [False, True, False] @@ -160,7 +160,7 @@ def test_series_mask_with_duplicated_index_aligned(): ) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_series_mask_with_lambda_cond(): data = [1, 6, 7, 4] index = pd.Index(["a", "b", "c", "d"]) @@ -175,7 +175,7 @@ def test_series_mask_with_lambda_cond(): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1) def test_series_mask_with_lambda_returns_singleton_should_fail(): data = [1, 6, 7, 4] index = pd.Index(["a", "b", "c", "d"]) @@ -196,9 +196,11 @@ def test_series_mask_with_lambda_returns_singleton_should_fail(): @pytest.mark.parametrize( "other, sql_count, join_count", - [(lambda x: -x.iloc[0], 5, 3), (lambda x: x**2, 4, 2)], + [(lambda x: -x.iloc[0], 4, 10), (lambda x: x**2, 3, 8)], ) def test_series_mask_with_lambda_other(other, sql_count, join_count): + # Multiple joins since multiple Series are created with non-Snowpark pandas data + # and a Snowpark pandas Index. data = [1, 6, 7, 4] index = pd.Index(["a", "b", "c", "d"]) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 9450112ae5..97e2931dfb 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -259,9 +259,9 @@ def perform_reindex(series): ) -# @sql_count_checker(query_count=2, join_count=1) -@pytest.mark.parametrize("limit", [None]) # , 1, 2, 100]) -@pytest.mark.parametrize("method", ["bfill"]) # , "backfill", "pad", "ffill"]) +@sql_count_checker(query_count=1, join_count=2) +@pytest.mark.parametrize("limit", [None, 1, 2, 100]) +@pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(limit, method): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_series = native_pd.Series( diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 981a2932a2..011baf1ec5 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -109,7 +109,6 @@ def axis(request): return request.param -# TODO: redefine df1, df2, and _concat_operation def _concat_operation(snow_objs, native_objs, **kwargs): return ( lambda x: pd.concat(snow_objs, **kwargs) From f971b0d5fe1d394aa2e4cea334510adffe22fe15 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 23 Aug 2024 15:47:38 -0700 Subject: [PATCH 11/42] more test fixes --- src/snowflake/snowpark/modin/pandas/dataframe.py | 5 ++++- src/snowflake/snowpark/modin/pandas/series.py | 6 +++++- tests/integ/modin/frame/test_iloc.py | 10 ++++------ tests/integ/modin/frame/test_mask.py | 13 +++++-------- tests/integ/modin/frame/test_merge.py | 3 +-- tests/integ/modin/frame/test_rename.py | 3 +-- tests/integ/modin/groupby/test_groupby_apply.py | 7 ++++++- tests/integ/modin/resample/test_resample_fillna.py | 10 ++++++---- tests/integ/modin/series/test_rank.py | 1 - 9 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index f35e40373e..71b07c9684 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -297,7 +297,10 @@ def __init__( for k, v in data.items() } - if all(len(v) == 1 for v in data.values()) and index is not None: + if ( + all(not is_scalar(v) and len(v) == 1 for v in data.values()) + and index is not None + ): # Special case when creating: # >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") # A V diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 7ac3172328..2802000451 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -166,7 +166,11 @@ def __init__( # 3. Perform .loc[] on `data` to select the rows that are in `index`. query_compiler = data.loc[index]._query_compiler - elif is_dict_like(data) and not isinstance(data, (pandas.Series, Series)): + elif ( + is_dict_like(data) + and not is_list_like(data) + and not isinstance(data, (pandas.Series, Series)) + ): if name is None: name = MODIN_UNNAMED_SERIES_LABEL # If the data is a dictionary, we need to convert it to a query compiler and set the index. diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index c79d5eb8ba..cb69f78172 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -2710,17 +2710,15 @@ def test_df_iloc_set_with_multi_index( native_items.columns = pd.MultiIndex.from_tuples(item_columns) if row_key_index: - expected_join_count += 1 - snow_row_key = pd.Series(row_key, index=pd.Index(row_key_index)) - native_row_key = native_pd.Series(row_key, index=pd.Index(row_key_index)) + snow_row_key = pd.Series(row_key, index=native_pd.Index(row_key_index)) + native_row_key = native_pd.Series(row_key, index=native_pd.Index(row_key_index)) else: snow_row_key = row_key native_row_key = row_key if col_key_index: - expected_join_count += 1 - snow_col_key = pd.Series(col_key, index=pd.Index(col_key_index)) - native_col_key = native_pd.Series(col_key, index=pd.Index(col_key_index)) + snow_col_key = pd.Series(col_key, index=native_pd.Index(col_key_index)) + native_col_key = native_pd.Series(col_key, index=native_pd.Index(col_key_index)) else: snow_col_key = col_key native_col_key = col_key diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 2422edb736..80dfb0f410 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -500,7 +500,7 @@ def __call__(self, df): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_dataframe_mask_other_is_array(): data = [[1, 3], [2, 4]] other = np.array([[99, -99], [101, -101]]) @@ -543,7 +543,7 @@ def test_dataframe_mask_sizes_do_not_match_negative_test(test_data, test_cond): snow_df.mask(snow_cond_df) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=2, join_count=3) def test_dataframe_mask_with_np_array_cond(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -570,8 +570,7 @@ def test_dataframe_mask_with_np_array_cond(): ) -# One extra query to convert to native index for dataframe constructor when creating snow_other_df -@sql_count_checker(query_count=4, join_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_dataframe_mask_with_np_array_cond_mismatched_labels(): data = [1, 2, 3] cond = np.array([[False, True, False]]).T @@ -596,8 +595,7 @@ def test_dataframe_mask_with_np_array_cond_mismatched_labels(): ) -# One extra query to convert to native index for dataframe constructor when creating snow_other_df -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_mask_with_dataframe_cond_single_index_different_names(): data = [1, 2, 3] cond = [False, True, False] @@ -624,8 +622,7 @@ def test_dataframe_mask_with_dataframe_cond_single_index_different_names(): ) -# One extra query to convert to native index for dataframe constructor when creating snow_other_df -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_dataframe_mask_with_dataframe_cond_single_index_different_names_2(): data = [1, 2, 3] cond = [False, True, False] diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 80df6bc516..1bbbb80f93 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -922,8 +922,7 @@ def test_merge_outer_with_nan(dtype): _verify_merge(right, left, "outer", on="key") -# Two extra queries to convert to native index for dataframe constructor when creating left and right -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=5, join_count=5) def test_merge_different_index_names(): left = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="c")) right = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="d")) diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index 289fb6e159..a5595ec716 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -104,8 +104,7 @@ def test_rename(self, snow_float_frame): assert_index_equal(renamed.index, native_pd.Index(["A", "B", "foo", "bar"])) # index with name - # Two extra queries, one for converting to native pandas in renamer Dataframe constructor, one to get the name - with SqlCounter(query_count=2, join_count=1): + with SqlCounter(query_count=1, join_count=2): index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index adfcd7f46b..e83fcbe00b 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -1073,6 +1073,11 @@ class TestSeriesGroupBy: @pytest.mark.parametrize("by", ["string_col_1", ["index", "string_col_1"], "index"]) def test_dataframe_groupby_getitem(self, by, func, dropna, group_keys, sort): """Test apply() on a SeriesGroupBy that we get by DataFrameGroupBy.__getitem__""" + qc = ( + 6 + if group_keys is False and not func == get_scalar_from_numeric_series + else 5 + ) if ( func in (get_dataframe_from_numeric_series, get_series_from_numeric_series) and not dropna @@ -1082,7 +1087,7 @@ def test_dataframe_groupby_getitem(self, by, func, dropna, group_keys, sort): # (pd.NA, k1) that we cannot serialize. pytest.xfail(reason="SNOW-1229760") with SqlCounter( - query_count=6 if group_keys is False else 5, + query_count=qc, udtf_count=UDTF_COUNT, join_count=2, ): diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index 6be0388f27..96ad514a2b 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -44,7 +44,7 @@ def test_resample_fill(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_fill_ser(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -59,8 +59,9 @@ def test_resample_fill_ser(interval, agg_func): ], format="mixed", ) + # TODO: SNOW-1638397 See if it's possible to use data={"a": range(len(datecol))} instead. eval_snowpark_pandas_result( - *create_test_series({"a": range(len(datecol))}, index=datecol), + *create_test_series({"2024-01-02": list(range(len(datecol)))}, index=datecol), lambda df: getattr(df.resample(rule=f"{interval}D"), agg_func)(), check_freq=False, ) @@ -138,7 +139,7 @@ def test_resample_ffill_missing_in_middle(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_ser_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -152,8 +153,9 @@ def test_resample_ffill_ser_missing_in_middle(interval, agg_func): ], format="mixed", ) + # TODO: SNOW-1638397 See if it's possible to use data={"a": range(len(datecol))} instead. eval_snowpark_pandas_result( - *create_test_series({"a": range(len(datecol))}, index=datecol), + *create_test_series({"2024-01-01": list(range(len(datecol)))}, index=datecol), lambda df: getattr(df.resample(rule=f"{interval}D"), agg_func)(), check_freq=False, ) diff --git a/tests/integ/modin/series/test_rank.py b/tests/integ/modin/series/test_rank.py index 3a855f1142..47d46dc0f3 100644 --- a/tests/integ/modin/series/test_rank.py +++ b/tests/integ/modin/series/test_rank.py @@ -28,7 +28,6 @@ ] -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", From 8c78f8d88944a2a257ad55af9affc03862ebf206 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 23 Aug 2024 16:44:54 -0700 Subject: [PATCH 12/42] fix dict case --- src/snowflake/snowpark/modin/pandas/series.py | 6 +----- tests/integ/modin/series/test_rank.py | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 2802000451..7ac3172328 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -166,11 +166,7 @@ def __init__( # 3. Perform .loc[] on `data` to select the rows that are in `index`. query_compiler = data.loc[index]._query_compiler - elif ( - is_dict_like(data) - and not is_list_like(data) - and not isinstance(data, (pandas.Series, Series)) - ): + elif is_dict_like(data) and not isinstance(data, (pandas.Series, Series)): if name is None: name = MODIN_UNNAMED_SERIES_LABEL # If the data is a dictionary, we need to convert it to a query compiler and set the index. diff --git a/tests/integ/modin/series/test_rank.py b/tests/integ/modin/series/test_rank.py index 47d46dc0f3..3a855f1142 100644 --- a/tests/integ/modin/series/test_rank.py +++ b/tests/integ/modin/series/test_rank.py @@ -28,6 +28,7 @@ ] +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", From 7970101f6cb47ae6bc3381815361ce6a8b4fe1e9 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 23 Aug 2024 17:23:07 -0700 Subject: [PATCH 13/42] more test case fixes --- tests/integ/modin/series/test_rank.py | 3 ++- tests/integ/modin/series/test_reindex.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integ/modin/series/test_rank.py b/tests/integ/modin/series/test_rank.py index 3a855f1142..2544f12e43 100644 --- a/tests/integ/modin/series/test_rank.py +++ b/tests/integ/modin/series/test_rank.py @@ -29,7 +29,8 @@ @sql_count_checker(query_count=1) -@pytest.mark.parametrize("data, index", TEST_RANK_DATA) +# Skipping last test case since it uses MultiIndex. +@pytest.mark.parametrize("data, index", TEST_RANK_DATA[:-1]) @pytest.mark.parametrize( "method", ["min", "dense", "first", "max", "average"], diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 97e2931dfb..14ab6fa6cf 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -302,13 +302,14 @@ def test_reindex_index_non_overlapping_index(): @sql_count_checker(query_count=1, join_count=2) def test_reindex_index_non_overlapping_datetime_index(): + # TODO: SNOW-1638397 See if it's possible to use data={"prices": [100, 101, np.nan, 100, 89, 88]} instead. date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_series = native_pd.Series( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + {"1/1/2023": [100, 101, np.nan, 100, 89, 88]}, index=date_index ) date_index = pd.date_range("1/1/2010", periods=6, freq="D") snow_series = pd.Series( - {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + {"1/1/2023": [100, 101, np.nan, 100, 89, 88]}, index=date_index ) def perform_reindex(series): From f3de1c36c85cdfac4af52aee5dd460de43508e6a Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 26 Aug 2024 11:11:40 -0700 Subject: [PATCH 14/42] correct the logic for series created with dict and index --- src/snowflake/snowpark/modin/pandas/series.py | 8 +++++--- src/snowflake/snowpark/modin/plugin/docstrings/series.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 7ac3172328..0a868f2687 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -179,11 +179,13 @@ def __init__( )._query_compiler if index is not None: index = index if isinstance(index, Index) else Index(index) - query_compiler = ( - query_compiler.create_qc_with_data_and_index_joined_on_index( + data = Series( + query_compiler=query_compiler.create_qc_with_data_and_index_joined_on_index( index._query_compiler ) ) + # Perform .loc[] on `data` to select the rows that are in `index`. + query_compiler = data.loc[index]._query_compiler if query_compiler is None: # Defaulting to pandas @@ -194,7 +196,6 @@ def __init__( and data.name is not None ): name = data.name - new_index = index if isinstance(index, Index): # Skip turning this into a native pandas object here since this issues an extra query. @@ -217,6 +218,7 @@ def __init__( query_compiler = query_compiler.set_index_from_series( index.to_series()._query_compiler ) + self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index 6e48a7e57f..16ed09c19a 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -76,7 +76,7 @@ class Series: c 3 dtype: int64 - The keys of the dictionary match with the Index values, hence the Index + The keys of the dictionary match with the Index values, hence the dictionary values have no effect. >>> d = {'a': 1, 'b': 2, 'c': 3} From 82728bfc1aae9bae62d8dba2e837ef58ca4dbcbc Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 26 Aug 2024 12:08:02 -0700 Subject: [PATCH 15/42] fix query counts --- tests/integ/modin/series/test_reindex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index 14ab6fa6cf..8fc0b77f1b 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -259,7 +259,7 @@ def perform_reindex(series): ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=3) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(limit, method): @@ -300,7 +300,7 @@ def test_reindex_index_non_overlapping_index(): ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=3) def test_reindex_index_non_overlapping_datetime_index(): # TODO: SNOW-1638397 See if it's possible to use data={"prices": [100, 101, np.nan, 100, 89, 88]} instead. date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") From 1577ddc66fcee436ef780f9c5036acfda20116cc Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 26 Aug 2024 12:49:41 -0700 Subject: [PATCH 16/42] fix join count --- tests/integ/modin/resample/test_resample_fillna.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index 96ad514a2b..d4e959123a 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -44,7 +44,7 @@ def test_resample_fill(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=5) def test_resample_fill_ser(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -139,7 +139,7 @@ def test_resample_ffill_missing_in_middle(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=5) def test_resample_ffill_ser_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ From 8903f60a3585dc3d49848d9f7dbe7d4fd5c674c7 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 4 Sep 2024 13:31:46 -0700 Subject: [PATCH 17/42] refactor series and df --- .../snowpark/modin/pandas/dataframe.py | 194 ++++++++---------- src/snowflake/snowpark/modin/pandas/series.py | 119 ++++------- .../compiler/snowflake_query_compiler.py | 127 +++--------- .../test_df_series_creation_with_index.py | 12 +- tests/integ/test_dataframe.py | 3 + 5 files changed, 172 insertions(+), 283 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 71b07c9684..0005fc787a 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -84,7 +84,6 @@ ) from snowflake.snowpark.modin.pandas.utils import ( create_empty_native_pandas_frame, - from_non_pandas, from_pandas, is_scalar, raise_if_native_pandas_objects, @@ -159,101 +158,67 @@ def __init__( if isinstance(index, DataFrame): # pandas raises the same error raise ValueError("Index data must be 1-dimensional") - # Engine.subscribe(_update_engine) + if query_compiler is not None: + # CASE 1: query_compiler + # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. + self._query_compiler = query_compiler + return + + # The logic followed here is: + # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. + # 2. If an index is provided, set the index through reindex. + # 3. If the data is a DataFrame, perform loc to select the required index and columns from the DataFrame. + # 4. The resultant query_compiler is then set as the query_compiler for the DataFrame. + if isinstance(data, Index): - # If the data is an Index object, we need to convert it to a DataFrame to make sure - # that the values are in the correct format -- as a data column, not an index column. - # Additionally, if an index is provided, converting it to an Index object ensures that - # its values are an index column. - # We set the column name if it is not in the provided Index `data`. + # CASE 2: data is a Snowpark pandas Index + # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the + # correct format: the values are a data column, not an index column. if data.name is None: new_name = 0 if columns is None else columns[0] else: new_name = data.name query_compiler = data.to_frame(index=False, name=new_name)._query_compiler - if index is not None: - index = index if isinstance(index, Index) else Index(index) - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler - ) - if isinstance(data, (DataFrame, Series)): - self._query_compiler = data._query_compiler.copy() - if isinstance(data, Series): - # We set the column name if it is not in the provided Series `data`. - if data.name is None: - self.columns = [0] if columns is None else columns + elif isinstance(data, Series): + # CASE 3: data is a Snowpark pandas Series + query_compiler = data._query_compiler.copy() + # We set the column name if it is not in the provided Series `data`. + if data.name is None: + self.columns = [0] if columns is None else columns + elif columns is not None and data.name not in columns: # If the columns provided are not in the named Series, pandas clears # the DataFrame and sets columns to the columns provided. - elif columns is not None and data.name not in columns: - self._query_compiler = from_pandas( - self.__constructor__(columns=columns) - )._query_compiler - if index is not None: - # The `index` parameter is used to select the rows from `data` that will be in the resultant - # DataFrame. If a value in `index` is not present in `data`'s index, it will be filled with a - # NaN value. - # 1. The `index` is converted to an Index object so that the index values are in an index column. - index = index if isinstance(index, Index) else Index(index) - # 2. A right outer join is performed between `data` and `index` to create a Series object where - # any index values in `data`'s index that are not in `index` are filled with NaN. - data = Series( - query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( - index._query_compiler - ), - name=0 if data.name is None else data.name, - ) - # 3. Perform .loc[] on `data` to select the rows that are in the `index`. - self._query_compiler = data.loc[index]._query_compiler - - elif columns is None and index is None: + query_compiler = from_pandas( + self.__constructor__(columns=columns) + )._query_compiler + + elif isinstance(data, DataFrame): + # CASE 5: data is a Snowpark pandas DataFrame + query_compiler = data._query_compiler.copy() + + if columns is None and index is None: + # If the new DataFrame has the same columns and index as the original DataFrame, + # the query compiler is shared and kept track of as a sibling. + self._query_compiler = query_compiler data._add_sibling(self) - - else: - # The `columns` parameter is used to select the columns from `data` that will be in the resultant - # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a - # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. - extra_columns = None - if columns is None: - # In case `columns` is not provided, `columns` is set to slice(None) to select all columns. - columns = slice(None) - else: - extra_columns = [col for col in columns if col not in data.columns] - - # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. - # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. - if index is None: - # In case `index` is not provided, `index` is set to slice(None) to select all rows. - index = slice(None) - data = DataFrame( - query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( - extra_columns=extra_columns - ) - ) - else: - # The `index` is converted to an Index object so that the index values are in an index column. - index = index if isinstance(index, Index) else Index(index) - # A right outer join is performed between `data` and `index` to create a DataFrame object where any - # index values in `data`'s index that are not in `index` are filled with NaN. - data = DataFrame( - query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( - index._query_compiler, - extra_columns=extra_columns, - ) - ) - # 3. Perform .loc[] on `data` to select the rows and columns that are in `index` and `columns`. - self._query_compiler = data.loc[index, columns]._query_compiler - - # Check the type of data and use the appropriate constructor - elif query_compiler is None: - distributed_frame = from_non_pandas(data, index, columns, dtype) - if distributed_frame is not None: - self._query_compiler = distributed_frame._query_compiler return + # The `columns` parameter is used to select the columns from `data` that will be in the resultant + # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a + # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. + extra_columns = [col for col in columns if col not in data.columns] + query_compiler = data._query_compiler.create_qc_with_extra_columns( + extra_columns + ) + else: + # CASE 5: Non-Snowpark pandas data if isinstance(data, pandas.Index): + # CASE 5.B: data is a pandas Index pass + elif is_list_like(data) and not is_dict_like(data): + # CASE 5.C: data is list-like old_dtype = getattr(data, "dtype", None) values = [ obj._to_pandas() if isinstance(obj, Series) else obj for obj in data @@ -265,30 +230,33 @@ def __init__( data = type(data)(values, dtype=old_dtype) except TypeError: data = values + elif is_dict_like(data) and not isinstance( - data, (pandas.Series, Series, pandas.DataFrame, DataFrame) + data, (pandas.Series, pandas.DataFrame) ): + # CASE 5.D: data is dict-like if columns is not None: data = {key: value for key, value in data.items() if key in columns} if len(data) and all(isinstance(v, Series) for v in data.values()): + # Special case: data is a dictionary where all the values are Snowpark pandas Series from .general import concat new_qc = concat( data.values(), axis=1, keys=data.keys() )._query_compiler - if dtype is not None: new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) if index is not None: - new_qc = new_qc.reindex( - axis=0, labels=try_convert_index_to_native(index) - ) + if isinstance(index, Index): + index = index.to_series()._query_compiler + elif isinstance(index, Series): + index = index._query_compiler + new_qc = new_qc.reindex(axis=0, labels=index) if columns is not None: new_qc = new_qc.reindex( axis=1, labels=try_convert_index_to_native(columns) ) - self._query_compiler = new_qc return @@ -301,10 +269,10 @@ def __init__( all(not is_scalar(v) and len(v) == 1 for v in data.values()) and index is not None ): - # Special case when creating: - # >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") + # Special case: the values in the dictionary are all non-scalar objects of length 1 + # >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="cake") # A V - # none + # cake # A 1 2 # B 1 2 <--- the first row is copied into the rest of the rows. # C 1 2 @@ -316,26 +284,36 @@ def __init__( )._query_compiler return - new_index = index + query_compiler = from_pandas( + pandas.DataFrame( + data=data, + columns=try_convert_index_to_native(columns), + dtype=dtype, + copy=copy, + ) + )._query_compiler + + if index is not None: + # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. if isinstance(index, Index): - # Skip turning this into a native pandas object here since this issues an extra query. - # Instead, first get the query compiler from native pandas and then add the index column. - new_index = None - pandas_df = pandas.DataFrame( - data=try_convert_index_to_native(data), - index=try_convert_index_to_native(new_index), - columns=try_convert_index_to_native(columns), - dtype=dtype, - copy=copy, + index = index.to_series()._query_compiler + elif isinstance(index, Series): + index = index._query_compiler + query_compiler = query_compiler.reindex(axis=0, labels=index) + + if isinstance(data, DataFrame): + # To select the required index and columns for the resultant DataFrame, + # perform .loc[] on the created query compiler. + index = slice(None) if index is None else index + columns = slice(None) if columns is None else columns + query_compiler = ( + DataFrame(query_compiler=query_compiler) + .loc[index, columns] + ._query_compiler ) - query_compiler = from_pandas(pandas_df)._query_compiler - if isinstance(index, Index): - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler - ) - self._query_compiler = query_compiler - else: - self._query_compiler = query_compiler + + self._query_compiler = query_compiler def __repr__(self): """ diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 0a868f2687..83b98f930c 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -52,11 +52,7 @@ from pandas.util._validators import validate_bool_kwarg from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset -from snowflake.snowpark.modin.pandas.utils import ( - from_pandas, - is_scalar, - try_convert_index_to_native, -) +from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -131,45 +127,43 @@ def __init__( # Engine.subscribe(_update_engine) from snowflake.snowpark.modin.plugin.extensions.index import Index - # Convert lazy index to Series without pulling the data to client. + if query_compiler: + # CASE 1: query_compiler + # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. + self._query_compiler = query_compiler.columnarize() + if name is not None: + self.name = name + return + + # The logic followed here is: + # 1. Create a query_compiler from the provided data. + # 2. If an index is provided, set the index. This is either through set_index or reindex. + # 3. The resultant query_compiler is columnarized and set as the query_compiler for the Series. + # 4. If a name is provided, set the name. + if isinstance(data, Index): - # If the data is an Index object, we need to convert it to a Series to make sure - # that the values are in the correct format -- as a data column, not an index column. - # Additionally, if an index is provided, converting it to an Index object ensures that - # its values are an index column. + # CASE 2: Index + # If the data is an Index object, convert it to a Series, and get the query_compiler. query_compiler = ( data.to_series(index=None, name=name) .reset_index(drop=True) ._query_compiler ) - if index is not None: - index = index if isinstance(index, Index) else Index(index) - query_compiler = query_compiler.create_qc_with_index_data_and_qc_index( - index._query_compiler - ) elif isinstance(data, type(self)): + # CASE 3: Series + # If the data is a Series object, copy the query_compiler. query_compiler = data._query_compiler.copy() - if index is not None: - # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. - # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. - # 1. The `index` is converted to an Index object so that the index values are in an index column. - index = index if isinstance(index, Index) else Index(index) - # 2. A right outer join is performed between `data` and `index` to create a Series object where any - # index values in `data`'s index that are not in `index` are filled with NaN. - data = Series( - query_compiler=data._query_compiler.create_qc_with_data_and_index_joined_on_index( - index._query_compiler - ), - name=data.name, - ) - # 3. Perform .loc[] on `data` to select the rows that are in `index`. - query_compiler = data.loc[index]._query_compiler - elif is_dict_like(data) and not isinstance(data, (pandas.Series, Series)): - if name is None: - name = MODIN_UNNAMED_SERIES_LABEL - # If the data is a dictionary, we need to convert it to a query compiler and set the index. + else: + # CASE 4: Non-Snowpark pandas data + # If the data is not a Snowpark pandas object, convert it to a query compiler. + name = MODIN_UNNAMED_SERIES_LABEL if name is None else name + if ( + isinstance(data, (pandas.Series, pandas.Index)) + and data.name is not None + ): + name = data.name query_compiler = from_pandas( pandas.DataFrame( pandas.Series( @@ -177,48 +171,25 @@ def __init__( ) ) )._query_compiler - if index is not None: - index = index if isinstance(index, Index) else Index(index) - data = Series( - query_compiler=query_compiler.create_qc_with_data_and_index_joined_on_index( - index._query_compiler - ) - ) - # Perform .loc[] on `data` to select the rows that are in `index`. - query_compiler = data.loc[index]._query_compiler - - if query_compiler is None: - # Defaulting to pandas - if name is None: - name = MODIN_UNNAMED_SERIES_LABEL - if ( - isinstance(data, (pandas.Series, pandas.Index, pd.Index)) - and data.name is not None - ): - name = data.name - new_index = index - if isinstance(index, Index): - # Skip turning this into a native pandas object here since this issues an extra query. - # Instead, first get the query compiler from native pandas and then add the index column. - new_index = None - query_compiler = from_pandas( - pandas.DataFrame( - pandas.Series( - data=try_convert_index_to_native(data), - index=new_index, - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, - ) - ) - )._query_compiler - if isinstance(index, Index): + + if index is not None: + if is_dict_like(data) or isinstance(data, (type(self))): + # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + if isinstance(index, Index): + index = index.to_series()._query_compiler + elif isinstance(index, Series): + index = index._query_compiler + query_compiler = query_compiler.reindex(axis=0, labels=index) + + else: # Performing set index to directly set the index column (joining on row-position instead of index). - query_compiler = query_compiler.set_index_from_series( - index.to_series()._query_compiler - ) + index_qc = ( + index if isinstance(index, Series) else Series(index) + )._query_compiler + query_compiler = query_compiler.set_index_from_series(index_qc) + # Set the query compiler and name fields. self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 21d1d9c4b7..8057581128 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -17508,117 +17508,52 @@ def tz_convert(self, *args: Any, **kwargs: Any) -> None: def tz_localize(self, *args: Any, **kwargs: Any) -> None: ErrorMessage.method_not_implemented_error("tz_convert", "BasePandasDataset") - def create_qc_with_index_data_and_qc_index( - self, index_qc: "SnowflakeQueryCompiler" + def create_qc_with_extra_columns( + self, extra_columns: List[Hashable] ) -> "SnowflakeQueryCompiler": """ - This is a helper function for creating a DataFrame/Series where the data is an Index - and an index is provided. - Before this method is called, the provided index is converted to an Index object; - the query compilers of the data and index are then joined. + This is a helper function for creating a DataFrame where the data is a DataFrame object. Sometimes, columns + not present in the `data` DataFrame can be passed as arguments - these are added to the resultant DataFrame + as NaN columns. Parameters ---------- - index_qc : SnowflakeQueryCompiler - The query compiler of the index to be joined with the data. - - Returns - ------- - SnowflakeQueryCompiler - A new query compiler with the data and index joined. - """ - self_frame = self._modin_frame.ensure_row_position_column() - other_frame = index_qc._modin_frame.ensure_row_position_column() - - new_internal_frame, _ = join_utils.join( - self_frame, - other_frame, - how="left", - left_on=[self_frame.row_position_snowflake_quoted_identifier], - right_on=[other_frame.row_position_snowflake_quoted_identifier], - inherit_join_index=InheritJoinIndex.FROM_RIGHT, - ) - - return SnowflakeQueryCompiler(new_internal_frame) - - def create_qc_with_data_and_index_joined_on_index( - self, - index_qc: Optional["SnowflakeQueryCompiler"] = None, - extra_columns: Optional[List[Hashable]] = None, - ) -> "SnowflakeQueryCompiler": - """ - This is a helper function for creating a DataFrame/Series where the data is a DataFrame/Series object. - This is a special case since only the values where the index value matches in the `data` and `index` provided - take on an actual value from the given `data`. Otherwise, they take on a NaN value. - - For instance, - - >>> data = pd.Series(["A", "B", "C", "D"], index=[1.1, 2.2, 3, 4], name="index series name") - >>> index = pd.Index([1, 2, 3, 4], name="some name") - >>> df = pd.DataFrame(data=data, index=index) - >>> df # doctest: +SKIP - index series name - some name - 1 NaN - 2 NaN - 3 C - 4 D - - Notice how only the data for index values 3 and 4 have an actual value while 1 and 2 have a NaN value. - 3 and 4 are values present in the index of the `data` and `index` provided. 1 and 2 are not present. - - Parameters - ---------- - index_qc : SnowflakeQueryCompiler, default None - The query compiler of the index to be joined with the data. If no query compiler is provided, - skip this join operation. extra_columns : list of hashable, default None - If the DataFrame being created has new columns that are not a part of the data, they can be passed here - and appended as NaN columns. + New columns that are not a part of the original query compiler Returns ------- SnowflakeQueryCompiler - A new query compiler with the data and index joined. + A new query compiler with the new columns. """ self_frame = self._modin_frame - if extra_columns: - # Append the new columns to the data's internal frame. - new_snowflake_quoted_identifiers = self._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( + if not extra_columns or len(extra_columns) == 0: + return self.copy() + + # Append the new columns to the data's internal frame. + new_snowflake_quoted_identifiers = ( + self._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( pandas_labels=extra_columns, excluded=self_frame.data_column_snowflake_quoted_identifiers, ) - new_ordered_frame = append_columns( - self_frame.ordered_dataframe, - new_snowflake_quoted_identifiers, - [pandas_lit(np.nan)] * len(extra_columns), - ) - self_frame = InternalFrame.create( - ordered_dataframe=new_ordered_frame, - data_column_pandas_labels=self_frame.data_column_pandas_labels - + extra_columns, - data_column_snowflake_quoted_identifiers=self_frame.data_column_snowflake_quoted_identifiers - + new_snowflake_quoted_identifiers, - data_column_pandas_index_names=self_frame.data_column_pandas_index_names, - index_column_pandas_labels=self_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=self_frame.index_column_snowflake_quoted_identifiers, - data_column_types=None, - index_column_types=None, - ) - - if index_qc is None: - new_internal_frame = self_frame - else: - # Join the index and data internal frames. - other_frame = index_qc._modin_frame - new_internal_frame, _ = join_utils.join( - other_frame, - self_frame, - how="outer", - left_on=other_frame.index_column_snowflake_quoted_identifiers, - right_on=self_frame.index_column_snowflake_quoted_identifiers, - inherit_join_index=InheritJoinIndex.FROM_LEFT, - ) + ) + new_ordered_frame = append_columns( + self_frame.ordered_dataframe, + new_snowflake_quoted_identifiers, + [pandas_lit(np.nan)] * len(extra_columns), + ) + new_internal_frame = InternalFrame.create( + ordered_dataframe=new_ordered_frame, + data_column_pandas_labels=self_frame.data_column_pandas_labels + + extra_columns, + data_column_snowflake_quoted_identifiers=self_frame.data_column_snowflake_quoted_identifiers + + new_snowflake_quoted_identifiers, + data_column_pandas_index_names=self_frame.data_column_pandas_index_names, + index_column_pandas_labels=self_frame.index_column_pandas_labels, + index_column_snowflake_quoted_identifiers=self_frame.index_column_snowflake_quoted_identifiers, + data_column_types=None, + index_column_types=None, + ) return SnowflakeQueryCompiler(new_internal_frame) diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index a1512d8ced..391c3c71d9 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -47,14 +47,16 @@ def obj_type_helper(obj_type: str) -> tuple: ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_create_with_index_as_data(native_idx, obj_type): """ Creating a Series where the data is an Index. """ snow_idx = pd.Index(native_idx) - assert_equal_func, snow_obj, native_obj, _ = obj_type_helper(obj_type) - assert_equal_func(snow_obj(snow_idx), native_obj(native_idx)) + assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(snow_idx), native_obj(native_idx), check_dtype=False, **kwargs + ) @pytest.mark.parametrize( @@ -202,7 +204,7 @@ def test_create_with_index_as_data_and_series_as_index( ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_create_with_series_as_data_and_index_as_index( native_series, native_index, obj_type ): @@ -476,7 +478,7 @@ def test_create_df_with_df_index_negative(): def test_create_df_with_dict_as_data_and_index_as_index(): """ Special case when creating: - >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") # doctest: +SKIP + DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="none") A V none A 1 2 diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py index dd46bb4835..4cf335678e 100644 --- a/tests/integ/test_dataframe.py +++ b/tests/integ/test_dataframe.py @@ -2843,6 +2843,7 @@ def test_write_table_with_clustering_keys_and_comment( reason="Clustering is a SQL feature", run=False, ) +@pytest.mark.skipif(IS_IN_STORED_PROC, reason="show parameters is not supported in SP") def test_write_table_with_all_options(session): try: table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE) @@ -2996,6 +2997,8 @@ def test_create_dynamic_table(session, table_name_1, is_transient): if is_transient: assert "create or replace transient" in ddl_result, ddl_result else: + if IS_IN_STORED_PROC: + pytest.skip("show parameters is not supported in SP") # data retention and max data extension time cannot be queried from get_ddl # we run a show parameters query to get the values for these parameters show_params_sql = ( From 67a07c11125a38206df8d8162009fd65af22ee77 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 4 Sep 2024 15:43:48 -0700 Subject: [PATCH 18/42] refactor dataframe and series constructors --- .../snowpark/modin/pandas/dataframe.py | 36 +++++++++++++------ src/snowflake/snowpark/modin/pandas/series.py | 15 ++++---- .../compiler/snowflake_query_compiler.py | 21 ++++++----- .../snowpark/modin/plugin/extensions/index.py | 9 +++-- .../test_df_series_creation_with_index.py | 4 +-- 5 files changed, 55 insertions(+), 30 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index da2afd6ccf..72e6d04019 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -187,8 +187,10 @@ def __init__( query_compiler = data._query_compiler.copy() # We set the column name if it is not in the provided Series `data`. if data.name is None: - self.columns = [0] if columns is None else columns - elif columns is not None and data.name not in columns: + query_compiler = query_compiler.set_columns( + [0] if columns is None else columns + ) + if columns is not None and data.name not in columns: # If the columns provided are not in the named Series, pandas clears # the DataFrame and sets columns to the columns provided. query_compiler = from_pandas( @@ -208,7 +210,10 @@ def __init__( # The `columns` parameter is used to select the columns from `data` that will be in the resultant # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. - extra_columns = [col for col in columns if col not in data.columns] + if data.columns is not None and columns is not None: + extra_columns = [col for col in columns if col not in data.columns] + else: + extra_columns = [] query_compiler = data._query_compiler.create_qc_with_extra_columns( extra_columns ) @@ -296,13 +301,24 @@ def __init__( )._query_compiler if index is not None: - # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. - # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. - if isinstance(index, Index): - index = index.to_series()._query_compiler - elif isinstance(index, Series): - index = index._query_compiler - query_compiler = query_compiler.reindex(axis=0, labels=index) + if isinstance(data, (type(self), Series)): + # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + labels = index + if isinstance(labels, Index): + labels = labels.to_series()._query_compiler + elif isinstance(labels, Series): + labels = labels._query_compiler + else: + labels = Index(labels).to_series()._query_compiler + query_compiler = query_compiler.reindex(axis=0, labels=labels) + + else: + # Performing set index to directly set the index column (joining on row-position instead of index). + index_qc = ( + index if isinstance(index, Series) else Series(index) + )._query_compiler + query_compiler = query_compiler.set_index_from_series(index_qc) if isinstance(data, DataFrame): # To select the required index and columns for the resultant DataFrame, diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 042ee86805..01c84699e4 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -31,6 +31,7 @@ import numpy.typing as npt import pandas from modin.pandas.accessor import CachedAccessor, SparseAccessor +from modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset from modin.pandas.iterator import PartitionIterator from pandas._libs.lib import NoDefault, is_integer, no_default from pandas._typing import ( @@ -51,7 +52,6 @@ from pandas.core.series import _coerce_method from pandas.util._validators import validate_bool_kwarg -from snowflake.snowpark.modin.pandas.base import _ATTRS_NO_LOOKUP, BasePandasDataset from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike @@ -177,11 +177,14 @@ def __init__( if is_dict_like(data) or isinstance(data, (type(self))): # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. - if isinstance(index, Index): - index = index.to_series()._query_compiler - elif isinstance(index, Series): - index = index._query_compiler - query_compiler = query_compiler.reindex(axis=0, labels=index) + labels = index + if isinstance(labels, Index): + labels = labels.to_series()._query_compiler + elif isinstance(labels, Series): + labels = labels._query_compiler + else: + labels = Index(labels).to_series()._query_compiler + query_compiler = query_compiler.reindex(axis=0, labels=labels) else: # Performing set index to directly set the index column (joining on row-position instead of index). diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 3f9fe42115..123e2f7e54 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -2260,7 +2260,7 @@ def any( def reindex( self, axis: int, - labels: Union[pandas.Index, "pd.Index", list[Any]], + labels: Union[pandas.Index, "pd.Index", list[Any], "SnowflakeQueryCompiler"], **kwargs: dict[str, Any], ) -> "SnowflakeQueryCompiler": """ @@ -2270,7 +2270,7 @@ def reindex( ---------- axis : {0, 1} Axis to align labels along. 0 is for index, 1 is for columns. - labels : list-like + labels : list-like, SnowflakeQueryCompiler Index-labels to align with. method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} Method to use for filling holes in reindexed frame. @@ -2468,7 +2468,7 @@ def _add_columns_for_monotonicity_checks( def _reindex_axis_0( self, - labels: Union[pandas.Index, "pd.Index", list[Any]], + labels: Union[pandas.Index, "pd.Index", list[Any], "SnowflakeQueryCompiler"], **kwargs: dict[str, Any], ) -> "SnowflakeQueryCompiler": """ @@ -2476,7 +2476,7 @@ def _reindex_axis_0( Parameters ---------- - labels : list-like + labels : list-like, SnowflakeQueryCompiler Index-labels to align with. method : {None, "backfill"/"bfill", "pad"/"ffill", "nearest"} Method to use for filling holes in reindexed frame. @@ -2494,12 +2494,15 @@ def _reindex_axis_0( """ self._raise_not_implemented_error_for_timedelta() - if isinstance(labels, native_pd.Index): - labels = pd.Index(labels) - if isinstance(labels, pd.Index): - new_index_qc = labels.to_series()._query_compiler + if isinstance(labels, SnowflakeQueryCompiler): + new_index_qc = labels else: - new_index_qc = pd.Series(labels)._query_compiler + if isinstance(labels, native_pd.Index): + labels = pd.Index(labels) + if isinstance(labels, pd.Index): + new_index_qc = labels.to_series()._query_compiler + else: + new_index_qc = pd.Series(labels)._query_compiler new_index_modin_frame = new_index_qc._modin_frame modin_frame = self._modin_frame diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index 4e5cff3517..ea830561cd 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -49,7 +49,10 @@ from pandas.core.dtypes.inference import is_hashable from snowflake.snowpark.modin.pandas import DataFrame, Series -from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native +from snowflake.snowpark.modin.pandas.utils import ( + from_pandas, + try_convert_index_to_native, +) from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta from snowflake.snowpark.modin.plugin._internal.timestamp_utils import DateTimeOrigin from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( @@ -214,8 +217,8 @@ def _init_query_compiler( elif isinstance(data, Index): query_compiler = data._query_compiler else: - query_compiler = DataFrame( - index=cls._NATIVE_INDEX_TYPE(data=data, **kwargs) + query_compiler = from_pandas( + native_pd.DataFrame(index=cls._NATIVE_INDEX_TYPE(data=data, **kwargs)) )._query_compiler if len(query_compiler.columns): diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 391c3c71d9..d1bcb56651 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -47,7 +47,7 @@ def obj_type_helper(obj_type: str) -> tuple: ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=0) def test_create_with_index_as_data(native_idx, obj_type): """ Creating a Series where the data is an Index. @@ -471,7 +471,7 @@ def test_create_df_with_df_index_negative(): ValueError, match=re.escape("Shape of passed values is (3, 1), indices imply (2, 1)"), ): - pd.DataFrame([1, 2, 3], index=[[1, 2], [3, 4], [5, 6]]) + native_pd.DataFrame([1, 2, 3], index=[[1, 2], [3, 4], [5, 6]]) @sql_count_checker(query_count=2, join_count=1) From 145368080974e024d47035e048d3e468c621aa71 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 4 Sep 2024 17:51:57 -0700 Subject: [PATCH 19/42] fix docstring tests --- .../snowpark/modin/pandas/dataframe.py | 26 ++++++++++++++++--- src/snowflake/snowpark/modin/pandas/series.py | 19 +++++++++++--- .../snowpark/modin/plugin/docstrings/base.py | 2 +- tests/integ/modin/frame/test_loc.py | 6 +++++ .../integ/modin/groupby/test_groupby_apply.py | 2 +- 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 72e6d04019..6d487fd4bc 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -220,6 +220,7 @@ def __init__( else: # CASE 5: Non-Snowpark pandas data + dummy_index = None # used in a special dict case if isinstance(data, pandas.Index): # CASE 5.B: data is a pandas Index pass @@ -291,9 +292,15 @@ def __init__( )._query_compiler return + if all(is_scalar(k) and is_scalar(v) for k, v in data.items()): + # Special case: All keys and values in the dict are all scalars, an index needs to be provided. + # pd.DataFrame({'a': 1, 'b': 2}, index=[0]) + dummy_index = index + query_compiler = from_pandas( pandas.DataFrame( data=data, + index=dummy_index, columns=try_convert_index_to_native(columns), dtype=dtype, copy=copy, @@ -315,10 +322,21 @@ def __init__( else: # Performing set index to directly set the index column (joining on row-position instead of index). - index_qc = ( - index if isinstance(index, Series) else Series(index) - )._query_compiler - query_compiler = query_compiler.set_index_from_series(index_qc) + if isinstance(index, Series): + index_qc_list = [index._query_compiler] + elif isinstance(index, Index): + index_qc_list = [index.to_series()._query_compiler] + elif isinstance(index, pd.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + pd.Series(index.get_level_values(level)) + for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [Series(index)._query_compiler] + query_compiler = query_compiler.set_index(index_qc_list) if isinstance(data, DataFrame): # To select the required index and columns for the resultant DataFrame, diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 01c84699e4..439591be0a 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -188,10 +188,21 @@ def __init__( else: # Performing set index to directly set the index column (joining on row-position instead of index). - index_qc = ( - index if isinstance(index, Series) else Series(index) - )._query_compiler - query_compiler = query_compiler.set_index_from_series(index_qc) + if isinstance(index, Series): + index_qc_list = [index._query_compiler] + elif isinstance(index, Index): + index_qc_list = [index.to_series()._query_compiler] + elif isinstance(index, pd.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + pd.Series(index.get_level_values(level)) + for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [Series(index)._query_compiler] + query_compiler = query_compiler.set_index(index_qc_list) # Set the query compiler and name fields. self._query_compiler = query_compiler.columnarize() diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/base.py b/src/snowflake/snowpark/modin/plugin/docstrings/base.py index 3ba4f2f2da..52696fc64d 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/base.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/base.py @@ -1649,7 +1649,7 @@ def last_valid_index(): >>> df.last_valid_index() 12 >>> df = pd.DataFrame([5, 6, 7, 8], index=["i", "am", "iron", "man"]) - >>> df.last_valid_index() + >>> df.last_valid_index() # doctest: +SKIP 'man' """ diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 68991b3cf1..a826d13a39 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -3924,3 +3924,9 @@ def test_raise_set_cell_with_list_like_value_error(): s.loc[0] = [0, 0] with pytest.raises(NotImplementedError): s.to_frame().loc[0, 0] = [0, 0] + + +def test_v(): + df = pd.DataFrame([5, 6, 7, 8], index=["i", "am", "iron", "man"]) + print(df) + print(df.last_valid_index()) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index e83fcbe00b..82d21987cb 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -631,7 +631,7 @@ def test_apply_transfform_to_subset( ) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, - join_count=JOIN_COUNT, + join_count=3, udtf_count=UDTF_COUNT, ) def test_numpy_ints_in_result(self, grouping_dfs_with_multiindexes, result): From b73f027fd4d2e2bfdf0d19b4fe52e28d38c563d4 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 6 Sep 2024 13:31:37 -0700 Subject: [PATCH 20/42] fix some tests --- .../snowpark/modin/pandas/dataframe.py | 2 +- src/snowflake/snowpark/modin/pandas/series.py | 22 +++++---- tests/integ/modin/frame/test_idxmax_idxmin.py | 8 ++-- tests/integ/modin/frame/test_insert.py | 18 +++---- .../integ/modin/groupby/test_groupby_apply.py | 26 +++++----- .../modin/groupby/test_groupby_transform.py | 8 ++-- .../test_df_series_creation_with_index.py | 14 ++++++ tests/integ/modin/series/test_reindex.py | 4 +- tests/integ/modin/series/test_setitem.py | 24 +++++----- tests/integ/modin/series/test_size.py | 21 ++++---- tests/integ/modin/series/test_take.py | 6 +-- tests/integ/modin/series/test_transpose.py | 4 +- tests/integ/modin/series/test_where.py | 40 +++++++++------- tests/integ/modin/test_merge.py | 31 ++++++------ .../modin/types/test_timedelta_indexing.py | 48 +++++++++---------- 15 files changed, 153 insertions(+), 123 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 6d487fd4bc..fb477bd82c 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -168,7 +168,7 @@ def __init__( # The logic followed here is: # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. - # 2. If an index is provided, set the index through reindex. + # 2. If an index is provided, set the index through set_index or reindex. # 3. If the data is a DataFrame, perform loc to select the required index and columns from the DataFrame. # 4. The resultant query_compiler is then set as the query_compiler for the DataFrame. diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 5bf72262d6..57206ad5bf 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -196,16 +196,20 @@ def __init__( index_qc_list = [index._query_compiler] elif isinstance(index, Index): index_qc_list = [index.to_series()._query_compiler] - elif isinstance(index, pd.MultiIndex): - index_qc_list = [ - s._query_compiler - for s in [ - pd.Series(index.get_level_values(level)) - for level in range(index.nlevels) - ] - ] else: - index_qc_list = [Series(index)._query_compiler] + if is_list_like(index) and is_list_like(index[0]): + # If given a list of lists, convert it to a MultiIndex. + index = pandas.MultiIndex.from_arrays(index) + if isinstance(index, pandas.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + pd.Series(index.get_level_values(level)) + for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [Series(index)._query_compiler] query_compiler = query_compiler.set_index(index_qc_list) # Set the query compiler and name fields. diff --git a/tests/integ/modin/frame/test_idxmax_idxmin.py b/tests/integ/modin/frame/test_idxmax_idxmin.py index 72fe88968b..94ca1d55b9 100644 --- a/tests/integ/modin/frame/test_idxmax_idxmin.py +++ b/tests/integ/modin/frame/test_idxmax_idxmin.py @@ -13,7 +13,7 @@ from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize( "data, index", [ @@ -83,7 +83,7 @@ def test_idxmax_idxmin_df(data, index, func, axis, skipna): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize( "data, index", [ @@ -173,7 +173,7 @@ def test_idxmax_idxmin_df_numeric_only_axis_1_different_column_dtypes( ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax_idxmin_with_dates(func, axis): @@ -194,7 +194,7 @@ def test_idxmax_idxmin_with_dates(func, axis): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.xfail(reason="SNOW-1625380 TODO") diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index 282f0fb3e7..ecc1ec19db 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -277,16 +277,16 @@ def test_insert_loc_negative(native_df, loc, expected_query_count): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "d"]), 2, 1), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 1), # numpy array of shape (N, 1) - (["a", "b", "c", "d"], 2, 1), # python list - (("a", "b", "c", "d"), 2, 1), # python tuple - ({(3, 1): 1}, 1, 1), # python dict - ("abc", 1, 0), # sting scalar - (1, 1, 0), # int scalar + (np.array(["a", "b", "c", "d"]), 2, 5), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 5), # numpy array of shape (N, 1) + (["a", "b", "c", "d"], 2, 5), # python list + (("a", "b", "c", "d"), 2, 5), # python tuple + ({(3, 1): 1}, 1, 3), # python dict + ("abc", 1, 2), # sting scalar + (1, 1, 2), # int scalar ], ) -def test_insert_multiindex_array_like_and_scaler( +def test_insert_multiindex_array_like_and_scalar( value, expected_query_count, expected_join_count ): arrays = [[3, 4, 5, 6], [1, 2, 1, 2]] @@ -310,7 +310,7 @@ def test_insert_multiindex_array_like_and_scaler( ("a", "b", "c", "d"), # python tuple ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=5) def test_insert_empty_multiindex_frame(value): mi = pd.MultiIndex.from_arrays([np.array([], dtype=int), np.array([], dtype=int)]) snow_df = pd.DataFrame([], index=mi) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index 82d21987cb..7c43b00a7b 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -191,7 +191,7 @@ class TestFuncReturnsDataFrame: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_group_by_one_column_and_one_level_with_default_kwargs( self, grouping_dfs_with_multiindexes, func @@ -206,7 +206,7 @@ def test_group_by_one_column_and_one_level_with_default_kwargs( @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_df_with_default_index(self, grouping_dfs_with_multiindexes): eval_snowpark_pandas_result( @@ -232,7 +232,7 @@ def test_func_returns_empty_frame(self): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): def func(df, num1, str1): @@ -258,7 +258,7 @@ def func(df, num1, str1): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_group_by_level(self, grouping_dfs_with_multiindexes, level): eval_snowpark_pandas_result( @@ -281,7 +281,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: # When dropna=False, we can skip the dropna query query_count=4, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ): snow_result = operation(snow_df) pandas_result = operation(pandas_df) @@ -332,7 +332,7 @@ def test_group_dataframe_with_column_of_all_nulls_snow_1233832(self, null_value) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) @pytest.mark.parametrize( "by, expected_output", @@ -417,7 +417,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) @pytest.mark.parametrize("by", ["level_0", ("a", "string_col_1")]) @pytest.mark.parametrize( @@ -444,7 +444,7 @@ def test_as_index_false(self, grouping_dfs_with_multiindexes, by, func): # transform because we only reindex to the original ordering if query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_group_keys_false(self, grouping_dfs_with_multiindexes, as_index): eval_snowpark_pandas_result( @@ -598,7 +598,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: @sql_count_checker( # we need a transform check because group_keys=False. query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, - join_count=JOIN_COUNT, + join_count=3, udtf_count=UDTF_COUNT, ) def test_apply_transfform_to_subset( @@ -800,7 +800,7 @@ def test_root_mean_squared_error(self): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_multiindex_df(self, grouping_dfs_with_multiindexes, by, sort, as_index): eval_snowpark_pandas_result( @@ -836,7 +836,7 @@ def test_multiindex_df(self, grouping_dfs_with_multiindexes, by, sort, as_index) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_non_series_or_dataframe_return_types( self, return_value, grouping_dfs_with_multiindexes @@ -918,7 +918,7 @@ class TestFuncReturnsSeries: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_return_series_with_two_columns( self, grouping_dfs_with_multiindexes, by, level, as_index, sort, group_keys @@ -943,7 +943,7 @@ def test_return_series_with_two_columns( @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=JOIN_COUNT, + join_count=3, ) def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): eval_snowpark_pandas_result( diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py index 5f2339f2e4..46ef42f4f4 100644 --- a/tests/integ/modin/groupby/test_groupby_transform.py +++ b/tests/integ/modin/groupby/test_groupby_transform.py @@ -39,7 +39,7 @@ def test_dataframe_groupby_transform( # temporary function's resultant table. # - A second join is performed only when the groupby object specifies dropna=True. # This is because a loc set operation is being performed to correctly set NA values. - with SqlCounter(query_count=6, join_count=1 + (1 if dropna else 0), udtf_count=1): + with SqlCounter(query_count=6, join_count=2 + (2 if dropna else 0), udtf_count=1): eval_snowpark_pandas_result( *df_with_multiple_columns, lambda df: df.groupby( @@ -85,11 +85,11 @@ def test_dataframe_groupby_transform_with_func_args_and_kwargs( Test DataFrameGroupby.transform with functions that require *args and **kwargs. """ # - A UDTF is created to run `groupby.transform(func)` on every group via `apply`. - # - One join always occurs when joining the original DataFrame's table with the + # - Two joins always occurs when joining the original DataFrame's table with the # temporary function's resultant table. - # - A second join is performed only when the groupby object specifies dropna=True. + # - Another two joins are performed only when the groupby object specifies dropna=True. # This is because a loc set operation is being performed to correctly set NA values. - with SqlCounter(query_count=6, join_count=1 + (1 if dropna else 0), udtf_count=1): + with SqlCounter(query_count=6, join_count=2 + (2 if dropna else 0), udtf_count=1): eval_snowpark_pandas_result( *df_with_multiple_columns, lambda df: df.groupby( diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index d1bcb56651..f9c2cf173c 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -4,6 +4,7 @@ import re import modin.pandas as pd +import numpy as np import pandas as native_pd import pytest @@ -491,3 +492,16 @@ def test_create_df_with_dict_as_data_and_index_as_index(): native_df = native_pd.DataFrame(data, index=native_index) snow_df = pd.DataFrame(data, index=snow_index) assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=1, join_count=2) +def test_create_series_with_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + np.array(["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"]), + np.array(["two", "one", "two", "one", "two", "one", "two", "one"]), + ] + data = [1, 2, 3, 4, 5, 6, 7, 8] + native_series = native_pd.Series(data, index=arrays) + snow_series = pd.Series(data, index=arrays) + assert_series_equal(snow_series, native_series) diff --git a/tests/integ/modin/series/test_reindex.py b/tests/integ/modin/series/test_reindex.py index b8bf2875ac..3f902f96df 100644 --- a/tests/integ/modin/series/test_reindex.py +++ b/tests/integ/modin/series/test_reindex.py @@ -259,7 +259,7 @@ def perform_reindex(series): ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(limit, method): @@ -300,7 +300,7 @@ def test_reindex_index_non_overlapping_index(): ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=2) def test_reindex_index_non_overlapping_datetime_index(): # TODO: SNOW-1638397 See if it's possible to use data={"prices": [100, 101, np.nan, 100, 89, 88]} instead. date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 50405643bc..39358b9870 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -175,7 +175,7 @@ (None, 35), # None scalar ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_setitem_scalar_key_and_scalar_item( key, item, default_index_native_int_series ): @@ -276,7 +276,7 @@ def test_series_setitem_none_key_and_scalar_item_mixed_type_series( (3.14, "a"), ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_setitem_scalar_key_and_scalar_item_mixed_type_series_type_coercion( key, item, mixed_type_index_native_series_mixed_type_index ): @@ -341,7 +341,7 @@ def test_series_setitem_scalar_key_and_scalar_item_mixed_type_series_type_coerci # TODO: SNOW-986548 fix where key is False, row is missed in this case @pytest.mark.parametrize("key", [True, False]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_setitem_boolean_key_and_scalar_item_label_updated(key, item): # series[scalar boolean key] = scalar item # ---------------------------------------- @@ -493,14 +493,14 @@ def test_series_setitem_boolean_key_and_scalar_item_case2_numeric_index(key, ite expected_ser = native_pd.Series(data=data, index=index) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): # verify that the result is correct assert_series_equal(snowpark_ser, expected_ser) @pytest.mark.parametrize("key", [True, False]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_setitem_boolean_key_and_scalar_item_case2_non_numeric_index(key, item): # series[scalar boolean key] = scalar item # ---------------------------------------- @@ -559,7 +559,7 @@ def test_series_setitem_boolean_key_and_scalar_item_case2_non_numeric_index(key, @pytest.mark.parametrize("key", [0, 1]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_setitem_boolean_key_and_scalar_item_case3( key, item, native_series_with_duplicate_boolean_index ): @@ -1601,7 +1601,7 @@ def test_series_setitem_with_empty_key_and_empty_series_item( else: snowpark_key = key - with SqlCounter(query_count=1): + with SqlCounter(query_count=4): native_ser[key] = item snowpark_ser[ pd.Series(snowpark_key) @@ -1835,7 +1835,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_number_scalar_it assert_series_equal(snowpark_ser, native_ser, check_dtype=False) else: # All other cases match native pandas behavior - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): assert_series_equal(snowpark_ser, native_ser, check_dtype=False) @@ -1886,7 +1886,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_boolean_scalar_i # b True # c True # dtype: bool - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): err_msg = "Series are different" with pytest.raises(AssertionError, match=err_msg): assert_series_equal(snowpark_ser, native_ser, check_dtype=False) @@ -1997,7 +1997,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_string_scalar_it expected_data = [str(val) for val in native_ser] expected_ser = native_pd.Series(data=expected_data, index=index) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): assert_series_equal(snowpark_ser, expected_ser, check_dtype=False) @@ -2093,7 +2093,7 @@ def set_loc_helper(ser): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_setitem_boolean_key(key, index): item = 99 @@ -2435,7 +2435,7 @@ def test_behavior_table_is_up_to_date(): prev_err_msg = expected_err_msg -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2, join_count=6) def test_series_setitem_int_key(): # pandas series setitem with int key is similar to loc set in most cases: # E.g., set index with label 3 to 100 diff --git a/tests/integ/modin/series/test_size.py b/tests/integ/modin/series/test_size.py index 4543525b2d..65730da0fd 100644 --- a/tests/integ/modin/series/test_size.py +++ b/tests/integ/modin/series/test_size.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -36,11 +36,16 @@ "multi index", ], ) -@sql_count_checker(query_count=1) def test_series_size(args, kwargs): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.size, - comparator=lambda x, y: x == y, - ) + with SqlCounter( + query_count=1, + join_count=2 + if isinstance(kwargs.get("index", None), native_pd.MultiIndex) + else 0, + ): + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.size, + comparator=lambda x, y: x == y, + ) diff --git a/tests/integ/modin/series/test_take.py b/tests/integ/modin/series/test_take.py index 211a89968d..b21dc4295b 100644 --- a/tests/integ/modin/series/test_take.py +++ b/tests/integ/modin/series/test_take.py @@ -20,16 +20,16 @@ def test_series_take(): actual = ser.take([-1, 3, 4]) expected = pd.Series([4, 2, 4], index=[4, 3, 4]) - with SqlCounter(query_count=2, join_count=2): + with SqlCounter(query_count=2, join_count=3): assert_series_equal(actual, expected) # Out-of-bounds testing - valid because .iloc is used in backend. actual = ser.take([1, 10]) expected = pd.Series([5], index=[1]) - with SqlCounter(query_count=2, join_count=2): + with SqlCounter(query_count=2, join_count=3): assert_series_equal(actual, expected) actual = ser.take([2, 5]) expected = pd.Series([6], index=[2]) - with SqlCounter(query_count=2, join_count=2): + with SqlCounter(query_count=2, join_count=3): assert_series_equal(actual, expected) diff --git a/tests/integ/modin/series/test_transpose.py b/tests/integ/modin/series/test_transpose.py index 1e733a5226..ae2a076171 100644 --- a/tests/integ/modin/series/test_transpose.py +++ b/tests/integ/modin/series/test_transpose.py @@ -51,7 +51,7 @@ def test_series_transpose_empty(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_series_transpose_multi_index(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -66,7 +66,7 @@ def test_series_transpose_multi_index(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_series_transpose_index_no_names(): data = [1, 2, 3, 4, 5] index = [None, None, None, None, None] diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py index 9f0c6d0f80..cff58d4a82 100644 --- a/tests/integ/modin/series/test_where.py +++ b/tests/integ/modin/series/test_where.py @@ -76,7 +76,7 @@ def test_series_where_duplicate_labels(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda ser: ser.where(ser > 3)) -@sql_count_checker(query_count=1, join_count=0) +@sql_count_checker(query_count=1, join_count=1) def test_series_where_multi_index(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -234,7 +234,7 @@ def test_series_where_with_scalar_cond(cond): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_where_series_cond_unmatched_index(): data = [1, 2, 3, 4] index1 = [0, 1, 2, 3] @@ -259,9 +259,10 @@ def perform_where(series): ) -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) -def test_series_where_short_series_cond(index): +@pytest.mark.parametrize( + "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] +) +def test_series_where_short_series_cond(index, join_count): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9] @@ -280,16 +281,18 @@ def perform_where(series): else: return series.where(native_cond, -1) - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_where, - ) + with SqlCounter(query_count=1, join_count=join_count): + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_where, + ) -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) -def test_series_where_long_series_cond(index): +@pytest.mark.parametrize( + "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] +) +def test_series_where_long_series_cond(index, join_count): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9, 10, 11] @@ -308,8 +311,9 @@ def perform_where(series): else: return series.where(native_cond, -1) - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_where, - ) + with SqlCounter(query_count=1, join_count=join_count): + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_where, + ) diff --git a/tests/integ/modin/test_merge.py b/tests/integ/modin/test_merge.py index 7ba4a79152..5b265f5e3d 100644 --- a/tests/integ/modin/test_merge.py +++ b/tests/integ/modin/test_merge.py @@ -19,7 +19,7 @@ @pytest.fixture(scope="function") def left_df(): - return pd.DataFrame( + return native_pd.DataFrame( { "A": [3, 2, 1, 4, 4], "B": [2, 3, 1, 2, 1], @@ -30,7 +30,7 @@ def left_df(): @pytest.fixture(scope="function") def right_df(): - return pd.DataFrame( + return native_pd.DataFrame( { "A": [4, 3, 1, 4, 4], "C": [3, 4, 2, 1, 1], @@ -41,12 +41,12 @@ def right_df(): @pytest.fixture(scope="function") def unnamed_series(): - return pd.Series([1, 2, 3]) + return native_pd.Series([1, 2, 3]) @pytest.fixture(scope="function") def named_series(): - return pd.Series([1, 2, 3], name="S") + return native_pd.Series([1, 2, 3], name="S") @pytest.fixture(params=["left", "inner", "right", "outer"]) @@ -59,6 +59,7 @@ def how(request): @sql_count_checker(query_count=2, join_count=2) def test_merge(left_df, right_df, how): + left_df, right_df = pd.DataFrame(left_df), pd.DataFrame(right_df) res = pd.merge(left_df, right_df, on="A", how=how) expected = left_df.merge(right_df, on="A", how=how) assert_frame_equal(res, expected) @@ -66,6 +67,7 @@ def test_merge(left_df, right_df, how): @sql_count_checker(query_count=2, join_count=2) def test_merge_series_on_left(named_series, right_df, how): + named_series, right_df = pd.Series(named_series), pd.DataFrame(right_df) res = pd.merge(named_series, right_df, left_on="S", right_on="A", how=how) expected = named_series.to_frame().merge( right_df, left_on="S", right_on="A", how=how @@ -73,36 +75,37 @@ def test_merge_series_on_left(named_series, right_df, how): assert_frame_equal(res, expected) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=0) def test_merge_unnamed_series_negative(unnamed_series, right_df): with pytest.raises(ValueError) as pd_e: - native_pd.merge(unnamed_series.to_pandas(), right_df.to_pandas()) + native_pd.merge(unnamed_series, right_df) + unnamed_series, right_df = pd.Series(unnamed_series), pd.DataFrame(right_df) with pytest.raises(ValueError) as snow_e: pd.merge(unnamed_series, right_df) assert str(pd_e.value) == str(snow_e.value) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_merge_native_pandas_object_negative(left_df, right_df): - left_native = left_df.to_pandas() + right_df = pd.DataFrame(right_df) msg = ( - f"{type(left_native)} is not supported as 'value' argument. Please convert this to Snowpark pandas" + f"{type(left_df)} is not supported as 'value' argument. Please convert this to Snowpark pandas" r" objects by calling modin.pandas.Series\(\)/DataFrame\(\)" ) # Left frame as native pandas object with pytest.raises(TypeError, match=msg): - pd.merge(left_native, right_df, on="A") + pd.merge(left_df, right_df, on="A") # right frame as native pandas object with pytest.raises(TypeError, match=msg): - pd.merge(right_df, left_native, on="A") + pd.merge(right_df, left_df, on="A") -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=0) def test_merge_invalid_object_type_negative(left_df): right_df = "abc" with pytest.raises(TypeError) as pd_e: - native_pd.merge(left_df.to_pandas(), right_df) + native_pd.merge(left_df, right_df) with pytest.raises(TypeError) as snow_e: - pd.merge(left_df, right_df) + pd.merge(pd.DataFrame(left_df), right_df) assert str(pd_e.value) == str(snow_e.value) diff --git a/tests/integ/modin/types/test_timedelta_indexing.py b/tests/integ/modin/types/test_timedelta_indexing.py index 3840d11cc1..62f98107b9 100644 --- a/tests/integ/modin/types/test_timedelta_indexing.py +++ b/tests/integ/modin/types/test_timedelta_indexing.py @@ -264,7 +264,7 @@ def loc_set(key, item, df): df.loc[key] = item return df - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): # single value key = (1, "a") run_test(key, item, api=loc_set) @@ -346,7 +346,7 @@ def loc_set(key, item, df): run_test(key, item, api=loc_set) item = 1000 - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): # single value key = (1, "b") td_int = td.copy() @@ -383,7 +383,7 @@ def setitem_enlargement(key, item, df): ) key = 10 - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( snow_td["a"].copy(), td["a"].copy(), @@ -402,7 +402,7 @@ def loc_enlargement(key, item, df): ) key = 10 - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( snow_td["a"].copy(), td["a"].copy(), @@ -412,7 +412,7 @@ def loc_enlargement(key, item, df): # single row key = (10, slice(None, None, None)) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=2): if pd.isna(item): eval_snowpark_pandas_result( snow_td.copy(), td.copy(), functools.partial(loc_enlargement, key, item) @@ -450,9 +450,9 @@ def test_index_get_timedelta(key, join_count): @pytest.mark.parametrize( "key, api, query_count, join_count", [ - [2, "iat", 1, 2], - [native_pd.Timedelta("1 days 1 hour"), "at", 2, 2], - [[2, 1], "iloc", 1, 2], + [2, "iat", 1, 4], + [native_pd.Timedelta("1 days 1 hour"), "at", 2, 4], + [[2, 1], "iloc", 1, 4], [ [ native_pd.Timedelta("1 days 1 hour"), @@ -460,11 +460,11 @@ def test_index_get_timedelta(key, join_count): ], "loc", 1, - 1, + 2, ], - [slice(1, None), "iloc", 1, 0], - [[True, False, False, True], "iloc", 1, 1], - [[True, False, False, True], "loc", 1, 1], + [slice(1, None), "iloc", 1, 1], + [[True, False, False, True], "iloc", 1, 2], + [[True, False, False, True], "loc", 1, 2], ], ) def test_series_with_timedelta_index(key, api, query_count, join_count): @@ -494,9 +494,9 @@ def test_series_with_timedelta_index(key, api, query_count, join_count): @pytest.mark.parametrize( "key, api, query_count, join_count", [ - [2, "iat", 1, 2], - [native_pd.Timedelta("1 days 1 hour"), "at", 2, 2], - [[2, 1], "iloc", 1, 2], + [2, "iat", 1, 4], + [native_pd.Timedelta("1 days 1 hour"), "at", 2, 4], + [[2, 1], "iloc", 1, 4], [ [ native_pd.Timedelta("1 days 1 hour"), @@ -504,11 +504,11 @@ def test_series_with_timedelta_index(key, api, query_count, join_count): ], "loc", 1, - 1, + 2, ], - [slice(1, None), "iloc", 1, 0], - [[True, False, False, True], "iloc", 1, 1], - [[True, False, False, True], "loc", 1, 1], + [slice(1, None), "iloc", 1, 1], + [[True, False, False, True], "iloc", 1, 2], + [[True, False, False, True], "loc", 1, 2], ], ) def test_df_with_timedelta_index(key, api, query_count, join_count): @@ -558,7 +558,7 @@ def setitem_enlargement(key, item, df): item = 23 key = native_pd.Timedelta("2 days") - with SqlCounter(query_count=1, join_count=0): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( snow_df.copy(), native_df.copy(), @@ -566,7 +566,7 @@ def setitem_enlargement(key, item, df): ) key = native_pd.Timedelta("2 days 45 minutes") - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=3): eval_snowpark_pandas_result( snow_df["a"].copy(), native_df["a"].copy(), @@ -579,7 +579,7 @@ def loc_enlargement(key, item, df): key = (slice(None, None, None), "x") - with SqlCounter(query_count=1, join_count=0): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( snow_df.copy(), native_df.copy(), @@ -587,7 +587,7 @@ def loc_enlargement(key, item, df): ) key = native_pd.Timedelta("2 days 25 minutes") - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=3): eval_snowpark_pandas_result( snow_df["a"].copy(), native_df["a"].copy(), @@ -597,7 +597,7 @@ def loc_enlargement(key, item, df): # single row key = (native_pd.Timedelta("2 days 45 minutes"), slice(None, None, None)) - with SqlCounter(query_count=1, join_count=1): + with SqlCounter(query_count=1, join_count=3): eval_snowpark_pandas_result( snow_df.copy(), native_df.copy(), From d422f86ae5ec554d73f033e0a12b25ce69cd404a Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 6 Sep 2024 14:34:33 -0700 Subject: [PATCH 21/42] replace series constructor --- .../plugin/extensions/series_overrides.py | 105 ++++++++++++------ 1 file changed, 72 insertions(+), 33 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 645109120c..c442ecf995 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -50,11 +50,7 @@ from snowflake.snowpark.modin import pandas as spd # noqa: F401 from snowflake.snowpark.modin.pandas.api.extensions import register_series_accessor -from snowflake.snowpark.modin.pandas.utils import ( - from_pandas, - is_scalar, - try_convert_index_to_native, -) +from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar from snowflake.snowpark.modin.plugin._internal.telemetry import ( snowpark_pandas_telemetry_method_decorator, try_add_telemetry_to_attribute, @@ -382,44 +378,87 @@ def __init__( # use this list to update inplace when there is a shallow copy. self._siblings = [] - # modified: - # Engine.subscribe(_update_engine) + from snowflake.snowpark.modin.plugin.extensions.index import Index + + if query_compiler: + # CASE 1: query_compiler + # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. + self._query_compiler = query_compiler.columnarize() + if name is not None: + self.name = name + return + + # The logic followed here is: + # 1. Create a query_compiler from the provided data. + # 2. If an index is provided, set the index. This is either through set_index or reindex. + # 3. The resultant query_compiler is columnarized and set as the query_compiler for the Series. + # 4. If a name is provided, set the name. + + if isinstance(data, Index): + # CASE 2: Index + # If the data is an Index object, convert it to a Series, and get the query_compiler. + query_compiler = ( + data.to_series(index=None, name=name).reset_index(drop=True)._query_compiler + ) - # Convert lazy index to Series without pulling the data to client. - if isinstance(data, pd.Index): - query_compiler = data.to_series(index=index, name=name)._query_compiler - query_compiler = query_compiler.reset_index(drop=True) elif isinstance(data, type(self)): + # CASE 3: Series + # If the data is a Series object, copy the query_compiler. query_compiler = data._query_compiler.copy() - if index is not None: - if any(i not in data.index for i in index): - ErrorMessage.not_implemented( - "Passing non-existent columns or index values to constructor " - + "not yet implemented." - ) # pragma: no cover - query_compiler = data.loc[index]._query_compiler - if query_compiler is None: - # Defaulting to pandas - if name is None: - name = MODIN_UNNAMED_SERIES_LABEL - if ( - isinstance(data, (native_pd.Series, native_pd.Index, pd.Index)) - and data.name is not None - ): - name = data.name + else: + # CASE 4: Non-Snowpark pandas data + # If the data is not a Snowpark pandas object, convert it to a query compiler. + name = MODIN_UNNAMED_SERIES_LABEL if name is None else name + if ( + isinstance(data, (native_pd.Series, native_pd.Index)) + and data.name is not None + ): + name = data.name query_compiler = from_pandas( native_pd.DataFrame( native_pd.Series( - data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, + data=data, dtype=dtype, name=name, copy=copy, fastpath=fastpath ) ) )._query_compiler + + if index is not None: + if is_dict_like(data) or isinstance(data, (type(self))): + # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + labels = index + if isinstance(labels, Index): + labels = labels.to_series()._query_compiler + elif isinstance(labels, Series): + labels = labels._query_compiler + else: + labels = Index(labels).to_series()._query_compiler + query_compiler = query_compiler.reindex(axis=0, labels=labels) + + else: + # Performing set index to directly set the index column (joining on row-position instead of index). + if isinstance(index, Series): + index_qc_list = [index._query_compiler] + elif isinstance(index, Index): + index_qc_list = [index.to_series()._query_compiler] + else: + if is_list_like(index) and is_list_like(index[0]): + # If given a list of lists, convert it to a MultiIndex. + index = native_pd.MultiIndex.from_arrays(index) + if isinstance(index, native_pd.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + pd.Series(index.get_level_values(level)) + for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [Series(index)._query_compiler] + query_compiler = query_compiler.set_index(index_qc_list) + + # Set the query compiler and name fields. self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name From 1ea5d00a76875efa956588a28f528a756b88db49 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Sun, 8 Sep 2024 22:35:51 -0700 Subject: [PATCH 22/42] fix tests --- .../snowpark/modin/pandas/dataframe.py | 35 ++++++--- .../plugin/extensions/series_overrides.py | 23 +++++- tests/integ/modin/frame/test_add_prefix.py | 2 +- tests/integ/modin/frame/test_add_suffix.py | 2 +- tests/integ/modin/frame/test_assign.py | 6 +- tests/integ/modin/frame/test_astype.py | 2 +- tests/integ/modin/frame/test_at.py | 8 +- tests/integ/modin/frame/test_axis.py | 2 +- tests/integ/modin/frame/test_cache_result.py | 2 + tests/integ/modin/frame/test_copy.py | 6 +- tests/integ/modin/frame/test_describe.py | 4 +- tests/integ/modin/frame/test_drop.py | 6 +- tests/integ/modin/frame/test_dtypes.py | 40 +++++----- tests/integ/modin/frame/test_iat.py | 4 +- tests/integ/modin/frame/test_idxmax_idxmin.py | 23 +++--- tests/integ/modin/frame/test_insert.py | 76 ++++++++++--------- tests/integ/modin/frame/test_join.py | 10 +-- tests/integ/modin/frame/test_mask.py | 4 +- tests/integ/modin/frame/test_nunique.py | 10 +-- tests/integ/modin/frame/test_rank.py | 40 +++++----- tests/integ/modin/frame/test_reindex.py | 6 +- tests/integ/modin/frame/test_rename.py | 4 +- tests/integ/modin/frame/test_repr.py | 2 +- tests/integ/modin/frame/test_setitem.py | 6 +- tests/integ/modin/frame/test_stack.py | 2 +- .../groupby/test_groupby_dataframe_rank.py | 46 ++++++----- .../groupby/test_groupby_default2pandas.py | 2 +- .../modin/groupby/test_groupby_head_tail.py | 4 +- .../groupby/test_groupby_idxmax_idxmin.py | 4 +- .../modin/groupby/test_groupby_ngroups.py | 2 +- .../modin/groupby/test_groupby_series.py | 16 ++-- .../test_df_series_creation_with_index.py | 9 +++ tests/integ/modin/index/test_index_methods.py | 2 +- tests/integ/modin/resample/test_resample.py | 6 +- .../modin/resample/test_resample_fillna.py | 4 +- tests/integ/modin/series/test_add_prefix.py | 2 +- tests/integ/modin/series/test_add_suffix.py | 2 +- tests/integ/modin/series/test_at.py | 8 +- .../modin/series/test_bitwise_operators.py | 62 ++++++++------- tests/integ/modin/series/test_compare.py | 2 +- tests/integ/modin/series/test_describe.py | 23 +++--- tests/integ/modin/series/test_empty.py | 2 +- tests/integ/modin/series/test_iat.py | 4 +- tests/integ/modin/series/test_mask.py | 40 +++++----- .../modin/series/test_nlargest_nsmallest.py | 2 +- tests/integ/modin/series/test_nunique.py | 14 ++-- tests/integ/modin/series/test_rank.py | 22 +++--- tests/integ/modin/series/test_rename.py | 6 +- tests/integ/modin/series/test_setitem.py | 2 +- tests/integ/modin/series/test_shape.py | 18 +++-- tests/integ/modin/series/test_take.py | 2 +- tests/integ/modin/series/test_to_snowflake.py | 2 +- tests/integ/modin/test_concat.py | 5 +- .../integ/modin/test_from_pandas_to_pandas.py | 4 +- tests/integ/modin/test_internal_frame.py | 2 +- tests/integ/modin/test_numpy.py | 6 +- 56 files changed, 370 insertions(+), 280 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 815f522bbe..9aa1b1fb26 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -297,6 +297,8 @@ def __init__( # pd.DataFrame({'a': 1, 'b': 2}, index=[0]) dummy_index = index + if is_scalar(data) and not isinstance(index, type(None)): + dummy_index = index query_compiler = from_pandas( pandas.DataFrame( data=data, @@ -308,9 +310,10 @@ def __init__( )._query_compiler if index is not None: - if isinstance(data, (type(self), Series)): + if isinstance(data, (type(self), Series, type(None))): # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + # If data is None and an index is provided, set the index. labels = index if isinstance(labels, Index): labels = labels.to_series()._query_compiler @@ -326,16 +329,28 @@ def __init__( index_qc_list = [index._query_compiler] elif isinstance(index, Index): index_qc_list = [index.to_series()._query_compiler] - elif isinstance(index, pd.MultiIndex): - index_qc_list = [ - s._query_compiler - for s in [ - pd.Series(index.get_level_values(level)) - for level in range(index.nlevels) - ] - ] else: - index_qc_list = [Series(index)._query_compiler] + if ( + not isinstance(index, pandas.MultiIndex) + and is_list_like(index) + and len(index) > 0 + and all( + (not isinstance(i, tuple) and is_list_like(i)) + for i in index + ) + ): + # If given a list of lists, convert it to a MultiIndex. + index = pandas.MultiIndex.from_arrays(index) + if isinstance(index, pandas.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + pd.Series(index.get_level_values(level)) + for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [Series(index)._query_compiler] query_compiler = query_compiler.set_index(index_qc_list) if isinstance(data, DataFrame): diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index c442ecf995..492098c2b6 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -410,6 +410,9 @@ def __init__( # CASE 4: Non-Snowpark pandas data # If the data is not a Snowpark pandas object, convert it to a query compiler. name = MODIN_UNNAMED_SERIES_LABEL if name is None else name + dummy_index = None + if is_scalar(data) and not isinstance(index, type(None)): + dummy_index = index if ( isinstance(data, (native_pd.Series, native_pd.Index)) and data.name is not None @@ -418,15 +421,22 @@ def __init__( query_compiler = from_pandas( native_pd.DataFrame( native_pd.Series( - data=data, dtype=dtype, name=name, copy=copy, fastpath=fastpath + data=data, + dtype=dtype, + index=dummy_index, + name=name, + copy=copy, + fastpath=fastpath, ) ) )._query_compiler if index is not None: - if is_dict_like(data) or isinstance(data, (type(self))): + if is_dict_like(data) or isinstance(data, (type(self), type(None))): # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + # If data is None and an index is provided, all the values in the Series will be NaN and the index + # will be the provided index. labels = index if isinstance(labels, Index): labels = labels.to_series()._query_compiler @@ -443,7 +453,14 @@ def __init__( elif isinstance(index, Index): index_qc_list = [index.to_series()._query_compiler] else: - if is_list_like(index) and is_list_like(index[0]): + if ( + not isinstance(index, native_pd.MultiIndex) + and is_list_like(index) + and len(index) > 0 + and all( + (not isinstance(i, tuple) and is_list_like(i)) for i in index + ) + ): # If given a list of lists, convert it to a MultiIndex. index = native_pd.MultiIndex.from_arrays(index) if isinstance(index, native_pd.MultiIndex): diff --git a/tests/integ/modin/frame/test_add_prefix.py b/tests/integ/modin/frame/test_add_prefix.py index 5ac652ea92..8cf30f4913 100644 --- a/tests/integ/modin/frame/test_add_prefix.py +++ b/tests/integ/modin/frame/test_add_prefix.py @@ -46,7 +46,7 @@ def test_df_add_prefix_multiindex(prefix, native_df_with_multiindex_columns): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("prefix", TEST_ADD_PREFIX_DATA) def test_df_add_prefix_time_column_df( prefix, time_column_snowpark_pandas_df, time_column_native_df diff --git a/tests/integ/modin/frame/test_add_suffix.py b/tests/integ/modin/frame/test_add_suffix.py index 4fbaf1e319..0dceff54d7 100644 --- a/tests/integ/modin/frame/test_add_suffix.py +++ b/tests/integ/modin/frame/test_add_suffix.py @@ -46,7 +46,7 @@ def test_df_add_suffix_multiindex(suffix, native_df_with_multiindex_columns): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("suffix", TEST_ADD_SUFFIX_DATA) def test_df_add_suffix_time_column_df( suffix, time_column_snowpark_pandas_df, time_column_native_df diff --git a/tests/integ/modin/frame/test_assign.py b/tests/integ/modin/frame/test_assign.py index 2f4ab8da44..f60107057e 100644 --- a/tests/integ/modin/frame/test_assign.py +++ b/tests/integ/modin/frame/test_assign.py @@ -36,7 +36,7 @@ def assign_func(df): eval_snowpark_pandas_result(snow_df, native_df, assign_func) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=3) @pytest.mark.parametrize( "index", [[2, 1, 0], [4, 5, 6]], ids=["reversed_index", "different_index"] ) @@ -136,7 +136,7 @@ def test_assign_short_series(): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=3) @pytest.mark.parametrize( "index", [[1, 0], [4, 5]], ids=["reversed_index", "different_index"] ) @@ -240,7 +240,7 @@ def test_overwrite_columns_via_assign(): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_assign_basic_timedelta_series(): snow_df, native_df = create_test_dfs( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], diff --git a/tests/integ/modin/frame/test_astype.py b/tests/integ/modin/frame/test_astype.py index 8007b264b4..dbd267b307 100644 --- a/tests/integ/modin/frame/test_astype.py +++ b/tests/integ/modin/frame/test_astype.py @@ -35,7 +35,7 @@ def test_series_input(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_input_negative(): df = pd.DataFrame({"a": [1, 2, 3], "b": [2.4, 2.5, 3.1]}) with pytest.raises(KeyError, match="not found in columns"): diff --git a/tests/integ/modin/frame/test_at.py b/tests/integ/modin/frame/test_at.py index f43270ff53..9194416648 100644 --- a/tests/integ/modin/frame/test_at.py +++ b/tests/integ/modin/frame/test_at.py @@ -20,7 +20,7 @@ def test_at_get_default_index_str_columns( ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_at_set_default_index_str_columns( default_index_snowpark_pandas_df, default_index_native_df, @@ -44,7 +44,7 @@ def test_at_get_str_index_str_columns( assert str_index_snowpark_pandas_df.at["b", "B"] == str_index_native_df.at["b", "B"] -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_at_set_str_index_str_columns( str_index_snowpark_pandas_df, str_index_native_df, @@ -57,7 +57,7 @@ def at_set_helper(df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_at_get_time_index_time_columns( time_index_snowpark_pandas_df, time_index_native_df, @@ -68,7 +68,7 @@ def test_at_get_time_index_time_columns( ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_at_set_time_index_time_columns( time_index_snowpark_pandas_df, time_index_native_df, diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index a6a156a05f..0fb3fa2c5f 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -244,7 +244,7 @@ def test_set_columns_index_name(index_name): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_duplicate_labels_assignment(): # Duplicate data labels snow_df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py index c78cefaa3a..c26b28e4ab 100644 --- a/tests/integ/modin/frame/test_cache_result.py +++ b/tests/integ/modin/frame/test_cache_result.py @@ -81,6 +81,8 @@ def perform_chained_operations(df, module): @pytest.mark.parametrize("inplace", [True, False]) def test_cache_result_empty_dataframe(init_kwargs, inplace): snow_df, native_df = create_test_dfs(**init_kwargs) + print(snow_df) + print(native_df) snow_df_copy = snow_df.copy(deep=True) with SqlCounter(query_count=1): cached_snow_df = cache_and_return_df(snow_df, inplace) diff --git a/tests/integ/modin/frame/test_copy.py b/tests/integ/modin/frame/test_copy.py index b4c5f4f2a5..7844ca321a 100644 --- a/tests/integ/modin/frame/test_copy.py +++ b/tests/integ/modin/frame/test_copy.py @@ -28,7 +28,7 @@ def native_df(snow_df): @pytest.mark.parametrize("deep", [None, True, False]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_copy(deep, snow_df, native_df): # Verify copy is same as original assert_snowpark_pandas_equal_to_pandas(snow_df.copy(deep=deep), native_df) @@ -61,7 +61,7 @@ def test_copy_deep_false_column_names(snow_df): lambda df: df.rename(columns={"a": "new_a"}, inplace=True), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_copy_inplace_operations_on_deep_copy(snow_df, native_df, operation): snow_df_copy = snow_df.copy(deep=True) operation(snow_df_copy) @@ -79,7 +79,7 @@ def test_copy_inplace_operations_on_deep_copy(snow_df, native_df, operation): lambda df: df.rename(columns={"a": "new_a"}, inplace=True), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_copy_inplace_operations_on_shallow_copy(snow_df, operation): snow_df_copy = snow_df.copy(deep=False) operation(snow_df_copy) diff --git a/tests/integ/modin/frame/test_describe.py b/tests/integ/modin/frame/test_describe.py index a9668c5794..28425ab695 100644 --- a/tests/integ/modin/frame/test_describe.py +++ b/tests/integ/modin/frame/test_describe.py @@ -255,8 +255,8 @@ def timestamp_describe_comparator(snow_res, native_res): @pytest.mark.parametrize( "index", [ - pytest.param(None, id="default_index"), - pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), + # pytest.param(None, id="default_index"), + # pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), pytest.param( [ np.array(["bar", "bar", "baz", "baz", "foo", "foo"]), diff --git a/tests/integ/modin/frame/test_drop.py b/tests/integ/modin/frame/test_drop.py index cc1a1a203d..4dcae76af7 100644 --- a/tests/integ/modin/frame/test_drop.py +++ b/tests/integ/modin/frame/test_drop.py @@ -209,7 +209,7 @@ def test_drop_invalid_labels_axis0_negative( ([], None), # empty labels ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_drop_invalid_axis1_labels_errors_ignore(labels, level, multiindex_snow_df): result = multiindex_snow_df.drop(labels, level=level, axis=1, errors="ignore") assert_frame_equal(multiindex_snow_df, result) @@ -231,7 +231,7 @@ def test_drop_invalid_axis1_labels_errors_ignore(labels, level, multiindex_snow_ ([], None), # empty labels ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_drop_invalid_axis0_labels_errors_ignore(labels, level, multiindex_snow_df): result = multiindex_snow_df.drop(labels, level=level, errors="ignore") assert_frame_equal(multiindex_snow_df, result) @@ -263,7 +263,7 @@ def test_empty_tuple_multiindex(multiindex_snow_df, axis): assert len(result.index) == 0 -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_drop_preserve_index_names(multiindex_snow_df): df_dropped_e = multiindex_snow_df.drop("red", axis=1) df_inplace_e = multiindex_snow_df.copy() diff --git a/tests/integ/modin/frame/test_dtypes.py b/tests/integ/modin/frame/test_dtypes.py index c3773bdd6d..49d8abfe2a 100644 --- a/tests/integ/modin/frame/test_dtypes.py +++ b/tests/integ/modin/frame/test_dtypes.py @@ -18,7 +18,7 @@ StringType, VariantType, ) -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_frame_equal, assert_series_equal, @@ -77,7 +77,7 @@ def validate_series_snowpark_dtype(series: pd.Series, snowpark_type: DataType) - ), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_integer(dataframe_input, input_dtype, logical_dtype): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input, dtype=input_dtype) @@ -218,7 +218,7 @@ def test_extended_float64_with_nan(): ), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_float(dataframe_input, input_dtype, expected_dtype, logical_dtype): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input, dtype=input_dtype) @@ -256,7 +256,7 @@ def test_float(dataframe_input, input_dtype, expected_dtype, logical_dtype): ), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_string(dataframe_input, input_dtype, index): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input) @@ -305,7 +305,7 @@ def test_string_explicit(dataframe_input, input_dtype, index): (["level0"], ["col1", "col2", "col1"]), ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_insert_multiindex_multi_label(label1, label2): arrays = [["apple", "apple", "banana", "banana"], [1, 2, 1, 2]] index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) @@ -452,24 +452,24 @@ def test_empty(input_dtype, expected_dtype, snowpark_dtype, to_pandas_dtype): @pytest.mark.parametrize( - "index, expected_index_dtype", + "index, expected_index_dtype, join_count", [ - (None, np.dtype("int64")), - (native_pd.Index([]), np.dtype("object")), - (native_pd.Index([], dtype="float64"), np.dtype("float64")), + (None, np.dtype("int64"), 0), + (native_pd.Index([]), np.dtype("object"), 1), + (native_pd.Index([], dtype="float64"), np.dtype("float64"), 1), ], ) -@sql_count_checker(query_count=1) -def test_empty_index(index, expected_index_dtype): - expected = native_pd.Series(data=[], index=index) - assert expected.dtype == np.dtype("object") - assert expected.index.dtype == expected_index_dtype - created = pd.Series(data=[], index=index) - assert created.dtype == np.dtype("object") - assert created.index.dtype == expected_index_dtype - roundtripped = created.to_pandas() - assert roundtripped.dtype == np.dtype("object") - assert roundtripped.index.dtype == expected_index_dtype +def test_empty_index(index, expected_index_dtype, join_count): + with SqlCounter(query_count=1, join_count=join_count): + expected = native_pd.Series(data=[], index=index) + assert expected.dtype == np.dtype("object") + assert expected.index.dtype == expected_index_dtype + created = pd.Series(data=[], index=index) + assert created.dtype == np.dtype("object") + assert created.index.dtype == expected_index_dtype + roundtripped = created.to_pandas() + assert roundtripped.dtype == np.dtype("object") + assert roundtripped.index.dtype == expected_index_dtype @pytest.mark.parametrize( diff --git a/tests/integ/modin/frame/test_iat.py b/tests/integ/modin/frame/test_iat.py index 2191fb8db8..dbf3d50759 100644 --- a/tests/integ/modin/frame/test_iat.py +++ b/tests/integ/modin/frame/test_iat.py @@ -103,7 +103,7 @@ def iat_set_helper(df): (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_iat_get_time_index_time_columns( key, time_index_snowpark_pandas_df, @@ -121,7 +121,7 @@ def test_iat_get_time_index_time_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_iat_set_time_index_time_columns( key, time_index_snowpark_pandas_df, diff --git a/tests/integ/modin/frame/test_idxmax_idxmin.py b/tests/integ/modin/frame/test_idxmax_idxmin.py index 94ca1d55b9..f9dc28bba9 100644 --- a/tests/integ/modin/frame/test_idxmax_idxmin.py +++ b/tests/integ/modin/frame/test_idxmax_idxmin.py @@ -13,7 +13,6 @@ from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result -@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize( "data, index", [ @@ -74,13 +73,17 @@ def test_idxmax_idxmin_df(data, index, func, axis, skipna): pytest.xfail( "Snowpark pandas returns a Series with None whereas pandas throws a ValueError" ) - eval_snowpark_pandas_result( - *create_test_dfs( - data=data, - index=index, - ), - lambda df: getattr(df, func)(axis=axis, skipna=skipna), - ) + with SqlCounter( + query_count=1, + join_count=0 if index is None or (data == {} and index == []) else 1, + ): + eval_snowpark_pandas_result( + *create_test_dfs( + data=data, + index=index, + ), + lambda df: getattr(df, func)(axis=axis, skipna=skipna), + ) @sql_count_checker(query_count=1, join_count=1) @@ -173,7 +176,7 @@ def test_idxmax_idxmin_df_numeric_only_axis_1_different_column_dtypes( ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax_idxmin_with_dates(func, axis): @@ -214,7 +217,7 @@ def test_idxmax_idxmin_with_timedelta(func, axis): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax_idxmin_with_strings(func, axis): diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index ecc1ec19db..c7a1c980c9 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -344,55 +344,61 @@ def test_insert_multiindex_dict_negative(): @pytest.mark.parametrize( - "df_index, value_index", + "df_index, value_index, join_count", [ - ([3, 0, 4], [1, 2, 3]), - ([(1, 0), (1, 2), (2, 2)], [(1, 1), (1, 2), (2, 2)]), - ([1.0, 2.5, 3.0], [1, 2, 3]), # Long and Double can be joined + ([3, 0, 4], [1, 2, 3], 6), + ([(1, 0), (1, 2), (2, 2)], [(1, 1), (1, 2), (2, 2)], 11), + ([1.0, 2.5, 3.0], [1, 2, 3], 6), # Long and Double can be joined ], ) -@sql_count_checker(query_count=4, join_count=1) -def test_insert_compatible_index(df_index, value_index): +def test_insert_compatible_index(df_index, value_index, join_count): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["x", "y", "z"]}, index=native_pd.Index(value_index)) - eval_snowpark_pandas_result( - snow_df, - snow_df.to_pandas(), - lambda df: df.insert( - 0, "col3", value if isinstance(df, pd.DataFrame) else value.to_pandas() - ), - inplace=True, # insert operation is always inplace - ) + with SqlCounter(query_count=4, join_count=join_count): + eval_snowpark_pandas_result( + snow_df, + snow_df.to_pandas(), + lambda df: df.insert( + 0, "col3", value if isinstance(df, pd.DataFrame) else value.to_pandas() + ), + inplace=True, # insert operation is always inplace + ) @pytest.mark.parametrize( - "df_index, value_index", + "df_index, value_index, join_count", [ - ([3, 2, 1], [(1, 0, 1), (1, 2, 3), (2, 1, 0)]), # length mismatch 1 != 3 + ([3, 2, 1], [(1, 0, 1), (1, 2, 3), (2, 1, 0)], 3), # length mismatch 1 != 3 ( [(3, 1), (2, 1), (1, 2)], [(1, 0, 1), (1, 2, 3), (2, 1, 0)], + 3, ), # length mismatch 2 != 3 - ([1, 2, 3], [(1, 0), (1, 2), (2, 2)]), # 1 != 2 - ([(1, 0), (1, 2), (2, 2)], [(1, 2, 3), (3, 4, 5), (6, 5, 4)]), # 2 != 3 - ([(1, 2, 3), (3, 4, 5), (6, 5, 4)], [3, 1, 2]), # length mismatch 3 != 1 + ([1, 2, 3], [(1, 0), (1, 2), (2, 2)], 2), # 1 != 2 + ([(1, 0), (1, 2), (2, 2)], [(1, 2, 3), (3, 4, 5), (6, 5, 4)], 3), # 2 != 3 + ([(1, 2, 3), (3, 4, 5), (6, 5, 4)], [3, 1, 2], 1), # length mismatch 3 != 1 ( [(1, 1), (1, 2), (2, 2)], ["(1, 0)", "(1, 2)", "(2, 2)"], + 1, ), # length and type mismatch ], ) -@sql_count_checker(query_count=1) -def test_insert_index_num_levels_mismatch_negative(df_index, value_index): - snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) - value = pd.DataFrame({"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index)) - # This is different behavior from native pandas. Native pandas in some cases - # insert new column with null values but in Snowpark pandas we always raise error. - with pytest.raises( - ValueError, - match="Number of index levels of inserted column are different from frame index", - ): - snow_df.insert(0, "col3", value) +def test_insert_index_num_levels_mismatch_negative(df_index, value_index, join_count): + with SqlCounter(query_count=1, join_count=join_count): + snow_df = pd.DataFrame( + {"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index) + ) + value = pd.DataFrame( + {"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index) + ) + # This is different behavior from native pandas. Native pandas in some cases + # insert new column with null values but in Snowpark pandas we always raise error. + with pytest.raises( + ValueError, + match="Number of index levels of inserted column are different from frame index", + ): + snow_df.insert(0, "col3", value) @pytest.mark.parametrize( @@ -407,7 +413,7 @@ def test_insert_index_num_levels_mismatch_negative(df_index, value_index): ), # type mismatch boolean != long ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=4) def test_insert_index_type_mismatch(df_index, value_index, expected_index): # Note: This is different behavior than native pandas. In native pandas when # index datatype mismatch new columns in inserted will all NULL values. @@ -424,7 +430,7 @@ def test_insert_index_type_mismatch(df_index, value_index, expected_index): assert_snowpark_pandas_equal_to_pandas(snow_df, expected_df) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_insert_with_null_index_values(): snow_df = pd.DataFrame( {"A": ["p", "q", "r", "s"]}, native_pd.Index(["a", None, "b", None]) @@ -440,7 +446,7 @@ def test_insert_with_null_index_values(): ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_insert_multiple_null(): snow_df = pd.DataFrame( {"A": ["p", "q", "r", "s"]}, native_pd.Index(["a", "b", "c", "d"]) @@ -465,8 +471,8 @@ def test_insert_multiple_null(): @pytest.mark.parametrize( "index, value, expected_query_count, expected_join_count", [ - ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 1), - ([1, 2], [3, 4], 2, 1), + ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 3), + ([1, 2], [3, 4], 2, 3), ], ) def test_insert_into_empty_dataframe_with_index( diff --git a/tests/integ/modin/frame/test_join.py b/tests/integ/modin/frame/test_join.py index f37011065b..2721a8f6aa 100644 --- a/tests/integ/modin/frame/test_join.py +++ b/tests/integ/modin/frame/test_join.py @@ -269,21 +269,21 @@ def test_join_validate_negative(lvalues, rvalues, validate): left.join(right, validate=validate) -@sql_count_checker(query_count=6, join_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_join_timedelta(left, right): right = right.astype("timedelta64[ns]") eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), lambda df: df.join( - right if isinstance(df, pd.DataFrame) else right.to_pandas() + pd.DataFrame(right) if isinstance(df, pd.DataFrame) else right ), ) left = left.astype("timedelta64[ns]") eval_snowpark_pandas_result( + pd.DataFrame(left), left, - left.to_pandas(), lambda df: df.join( - right if isinstance(df, pd.DataFrame) else right.to_pandas() + pd.DataFrame(right) if isinstance(df, pd.DataFrame) else right ), ) diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 53afbd7bf8..7b47880557 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -864,7 +864,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=4) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -909,7 +909,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=3, union_count=1) +@sql_count_checker(query_count=2, join_count=5, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/frame/test_nunique.py b/tests/integ/modin/frame/test_nunique.py index d0cad8ec2a..6fd1751e3a 100644 --- a/tests/integ/modin/frame/test_nunique.py +++ b/tests/integ/modin/frame/test_nunique.py @@ -85,12 +85,12 @@ def test_dataframe_nunique_no_columns(native_df): ), ], ) -@sql_count_checker(query_count=1) def test_dataframe_nunique_multiindex(index, columns): - eval_snowpark_pandas_result( - *create_test_dfs(TEST_DATA, index=index, columns=columns), - lambda df: df.nunique(axis=0), - ) + with SqlCounter(query_count=1, join_count=0 if index is None else 2): + eval_snowpark_pandas_result( + *create_test_dfs(TEST_DATA, index=index, columns=columns), + lambda df: df.nunique(axis=0), + ) @sql_count_checker(query_count=0) diff --git a/tests/integ/modin/frame/test_rank.py b/tests/integ/modin/frame/test_rank.py index 1687ce4905..05fa47b99b 100644 --- a/tests/integ/modin/frame/test_rank.py +++ b/tests/integ/modin/frame/test_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -40,7 +40,6 @@ ] -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -56,13 +55,16 @@ ) # test df.rank with all method, na_option, ascending parameter combinations def test_df_rank(data, index, method, ascending, na_option): - snow_df = pd.DataFrame(data, index=index) - native_df = native_pd.DataFrame(data, index=index) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.rank(method=method, na_option=na_option, ascending=ascending), - ) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 + ): + snow_df = pd.DataFrame(data, index=index) + native_df = native_pd.DataFrame(data, index=index) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.rank(method=method, na_option=na_option, ascending=ascending), + ) @sql_count_checker(query_count=1) @@ -118,7 +120,6 @@ def test_rank_unsupported_args_negative(method, ascending, na_option): snow_df.rank(axis=1, method=method, ascending=ascending, na_option=na_option) -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -134,10 +135,15 @@ def test_rank_unsupported_args_negative(method, ascending, na_option): ) # test df percentile rank def test_df_rank_pct(data, index, method, ascending, na_option): - snow_df = pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - native_df = native_pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 + ): + snow_df = pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + native_df = native_pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( + snow_df, native_df + ) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index 98d0a41e7a..1f7a7e3966 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -454,7 +454,7 @@ def test_reindex_columns_fill_method_with_old_na_values_negative( lambda df: df.reindex(columns=list("CEBFGA"), method=method), ) - @sql_count_checker(query_count=5) + @sql_count_checker(query_count=5, join_count=1) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_columns_datetime_with_fill(self, limit, method): @@ -495,7 +495,7 @@ def test_reindex_columns_non_overlapping_columns(self): snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("EFG")) ) - @sql_count_checker(query_count=5) + @sql_count_checker(query_count=5, join_count=1) def test_reindex_columns_non_overlapping_datetime_columns(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( @@ -520,7 +520,7 @@ def perform_reindex(df): snow_df, native_df, perform_reindex, check_freq=False ) - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=2, join_count=1) def test_reindex_columns_non_overlapping_different_types_columns(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index a5595ec716..15351ec6fa 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -294,7 +294,7 @@ def test_rename_objects(self, snow_float_string_frame): assert "FOO" in renamed assert "foo" not in renamed - @sql_count_checker(query_count=6, join_count=2) + @sql_count_checker(query_count=6, join_count=8) def test_rename_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) @@ -420,7 +420,7 @@ def test_rename_mapper_and_positional_arguments_raises(self): with pytest.raises(TypeError, match=msg): df.rename({}, columns={}, index={}) - @sql_count_checker(query_count=1, join_count=1) + @sql_count_checker(query_count=1, join_count=5) def test_rename_with_duplicate_columns(self): # GH#4403 df4 = DataFrame( diff --git a/tests/integ/modin/frame/test_repr.py b/tests/integ/modin/frame/test_repr.py index 2109bdccb5..f499146806 100644 --- a/tests/integ/modin/frame/test_repr.py +++ b/tests/integ/modin/frame/test_repr.py @@ -227,7 +227,7 @@ def test_repr_deviating_behavior(): assert native_str[:N] == snow_str[:N] -@sql_count_checker(query_count=2, union_count=1) +@sql_count_checker(query_count=2, union_count=1, join_count=6) def test_repr_of_multiindex_df(): tuples = [ ("cobra", "mark i"), diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 6152089f39..6bbdc30fa0 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -141,7 +141,7 @@ def setitem(df): else: df[key] = val - expected_join_count = 3 if isinstance(key.start, int) else 4 + expected_join_count = 6 if isinstance(key.start, int) else 7 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result(snow_df, native_df, setitem, inplace=True) @@ -361,9 +361,7 @@ def func_insert_new_column(df, column): df[key] = column expected_join_count = 2 - if isinstance(column, native_pd.Series): - expected_join_count = 1 - elif isinstance(column, native_pd.Index) and not isinstance( + if isinstance(column, native_pd.Index) and not isinstance( column, native_pd.DatetimeIndex ): expected_join_count = 4 diff --git a/tests/integ/modin/frame/test_stack.py b/tests/integ/modin/frame/test_stack.py index 9b06c32ff0..80c437dea7 100644 --- a/tests/integ/modin/frame/test_stack.py +++ b/tests/integ/modin/frame/test_stack.py @@ -20,7 +20,7 @@ ) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_stack(data, index, columns, dropna, sort): eval_snowpark_pandas_result( *create_test_dfs(data=data, index=index, columns=columns), diff --git a/tests/integ/modin/groupby/test_groupby_dataframe_rank.py b/tests/integ/modin/groupby/test_groupby_dataframe_rank.py index 3bb4a4b455..78443c3bbf 100644 --- a/tests/integ/modin/groupby/test_groupby_dataframe_rank.py +++ b/tests/integ/modin/groupby/test_groupby_dataframe_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -211,7 +211,6 @@ ] -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -233,16 +232,18 @@ def test_df_groupby_rank(data, index, method, ascending, na_option, dropna): snow_df = pd.DataFrame(data, index=index) native_df = native_pd.DataFrame(data, index=index) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.groupby("group", dropna=dropna).rank( - method=method, na_option=na_option, ascending=ascending - ), - ) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 + ): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.groupby("group", dropna=dropna).rank( + method=method, na_option=na_option, ascending=ascending + ), + ) -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -272,10 +273,14 @@ def test_df_rank_pct(data, index, method, ascending, na_option, dropna): .groupby("group", dropna=dropna) .rank(method=method, ascending=ascending, na_option=na_option, pct=True) ) - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 + ): + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( + snow_df, native_df + ) -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA_MUL) @pytest.mark.parametrize( "method", @@ -293,13 +298,16 @@ def test_df_rank_pct(data, index, method, ascending, na_option, dropna): def test_df_groupby_rank_by_list(data, index, method, ascending, na_option): snow_df = pd.DataFrame(data, index=index) native_df = native_pd.DataFrame(data, index=index) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.groupby(["group", "a"]).rank( - method=method, na_option=na_option, ascending=ascending - ), - ) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 + ): + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.groupby(["group", "a"]).rank( + method=method, na_option=na_option, ascending=ascending + ), + ) @pytest.mark.parametrize( diff --git a/tests/integ/modin/groupby/test_groupby_default2pandas.py b/tests/integ/modin/groupby/test_groupby_default2pandas.py index 49d45a1009..74aac8f77c 100644 --- a/tests/integ/modin/groupby/test_groupby_default2pandas.py +++ b/tests/integ/modin/groupby/test_groupby_default2pandas.py @@ -124,7 +124,7 @@ def test_groupby_with_numpy_array(basic_snowpark_pandas_df) -> None: "by_list", [[2, 1, 1, 2, 3, 3], [[2, 1, 1, 2, 3, 3], "a"]], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_groupby_series_with_numpy_array(series_multi_numeric, by_list) -> None: with pytest.raises( NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN diff --git a/tests/integ/modin/groupby/test_groupby_head_tail.py b/tests/integ/modin/groupby/test_groupby_head_tail.py index 90819ec2d6..d462b89150 100644 --- a/tests/integ/modin/groupby/test_groupby_head_tail.py +++ b/tests/integ/modin/groupby/test_groupby_head_tail.py @@ -45,7 +45,7 @@ class TestDataFrameGroupByHeadTail: ["lion", 1234, 456, 78, 9], ] - @sql_count_checker(query_count=1) + @sql_count_checker(query_count=1, join_count=1) def test_df_groupby_head_tail(self, op_type, n, dropna, as_index, sort, group_keys): """ Test DataFrameGroupBy.head and DataFrameGroupBy.tail with a small df with no NA values. @@ -66,7 +66,7 @@ def test_df_groupby_head_tail(self, op_type, n, dropna, as_index, sort, group_ke check_index_type=False, ) - @sql_count_checker(query_count=6) + @sql_count_checker(query_count=6, join_count=1) def test_df_groupby_head_tail_large_data( self, op_type, n, dropna, as_index, sort, group_keys, large_df_with_na_values ): diff --git a/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py b/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py index ec1e36d1e3..e87b6327bc 100644 --- a/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py +++ b/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py @@ -20,7 +20,7 @@ @pytest.mark.parametrize("grouping_columns", ["B", ["A", "B"]]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_df_groupby_idxmax_idxmin_on_axis_0( df_with_multiple_columns, grouping_columns, skipna, func ): @@ -73,7 +73,7 @@ def test_df_groupby_idxmax_idxmin_on_axis_1_negative(df_with_multiple_columns, f @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("numeric_only", [True, False]) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_groupby_idxmax_idxmin_with_different_column_dtypes_on_axis_0( func, numeric_only ): diff --git a/tests/integ/modin/groupby/test_groupby_ngroups.py b/tests/integ/modin/groupby/test_groupby_ngroups.py index 332e4c88eb..6216c4c223 100644 --- a/tests/integ/modin/groupby/test_groupby_ngroups.py +++ b/tests/integ/modin/groupby/test_groupby_ngroups.py @@ -17,7 +17,7 @@ def assert_ngroups_equal(snow_res, pd_res): @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_groupby_sort_multiindex_series(series_multi_numeric, by): snow_ser = series_multi_numeric diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py index ae8ae0926d..10dd08b6fd 100644 --- a/tests/integ/modin/groupby/test_groupby_series.py +++ b/tests/integ/modin/groupby/test_groupby_series.py @@ -19,14 +19,14 @@ @pytest.mark.parametrize("by", ["a", ["b"], ["a", "b"]]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_groupby_sort_multiindex_series(series_multi_numeric, agg_method, by): native_mseries_group = series_multi_numeric.to_pandas().groupby(by=by, sort=True) mseries_group = series_multi_numeric.groupby(by=by, sort=True) eval_snowpark_pandas_result(mseries_group, native_mseries_group, agg_method) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=3, join_count=6) def test_groupby_sort_false_multiindex_series(series_multi_numeric): # it is known that groupby sort=False is buggy with multiIndex, it is always # sorting when only part of the level is used. @@ -48,7 +48,7 @@ def test_groupby_sort_false_multiindex_series(series_multi_numeric): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_groupby_series_count_with_nan(): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -75,7 +75,7 @@ def test_groupby_series_count_with_nan(): ], ) @pytest.mark.parametrize("sort", [True, False]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_groupby_agg_series(agg_func, sort): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -113,7 +113,7 @@ def test_groupby_agg_series_dict_func_negative(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize( "agg_func, type_str", [({"x": ("y", "sum")}, "tuple"), ({"x": pd.NamedAgg("y", "sum")}, "NamedAgg")], @@ -139,7 +139,7 @@ def test_groupby_agg_series_raises_for_2_tuple_agg(agg_func, type_str): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("aggs", [{"minimum": min}, {"minimum": min, "maximum": max}]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_groupby_agg_series_named_agg(aggs, sort): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -164,7 +164,7 @@ def test_groupby_series_numeric_only(series_str, numeric_only): @pytest.mark.parametrize("level", [0, 1, [1, 0], "b", [1, 1], [0, "b"], [-1]]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_groupby_sort_multiindex_series_level(series_multi_numeric, level): native_series = series_multi_numeric.to_pandas() @@ -173,7 +173,7 @@ def test_groupby_sort_multiindex_series_level(series_multi_numeric, level): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_groupby_series_single_index(): snow_ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) native_ser = native_pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index f9c2cf173c..9a629101f3 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -505,3 +505,12 @@ def test_create_series_with_list_of_lists_index(): native_series = native_pd.Series(data, index=arrays) snow_series = pd.Series(data, index=arrays) assert_series_equal(snow_series, native_series) + + +@sql_count_checker(query_count=1, join_count=1) +def test_create_series_with_none_data_and_non_empty_index(): + # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. + index = ["A", "B", "C", "D"] + native_series = native_pd.Series(None, index=index, dtype=object) + snow_series = pd.Series(None, index=index, dtype=object) + assert_series_equal(snow_series, native_series) diff --git a/tests/integ/modin/index/test_index_methods.py b/tests/integ/modin/index/test_index_methods.py index 8f6f5b9f59..d8c3646d97 100644 --- a/tests/integ/modin/index/test_index_methods.py +++ b/tests/integ/modin/index/test_index_methods.py @@ -359,7 +359,7 @@ def test_has_duplicates(index): assert index.has_duplicates == snow_index.has_duplicates -@sql_count_checker(query_count=6) +@sql_count_checker(query_count=6, join_count=6) def test_index_parent(): """ Check whether the parent field in Index is updated properly. diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index af99185294..b4e8858273 100644 --- a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -145,7 +145,7 @@ def test_resample_duplicated_timestamps(): @freq @interval @agg_func -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_series(freq, interval, agg_func): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -188,7 +188,7 @@ def test_resample_df_with_nan(agg_func): @agg_func -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ser_with_nan(agg_func): # 1 resample bin of all NaN, 1 resample bin partially NaN, 1 resample bin no NaNs eval_snowpark_pandas_result( @@ -242,7 +242,7 @@ def test_resample_df_getitem(): ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ser_getitem(): eval_snowpark_pandas_result( *create_test_series( diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index d4e959123a..96ad514a2b 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -44,7 +44,7 @@ def test_resample_fill(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=5) +@sql_count_checker(query_count=2, join_count=3) def test_resample_fill_ser(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -139,7 +139,7 @@ def test_resample_ffill_missing_in_middle(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=5) +@sql_count_checker(query_count=2, join_count=3) def test_resample_ffill_ser_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ diff --git a/tests/integ/modin/series/test_add_prefix.py b/tests/integ/modin/series/test_add_prefix.py index 4d05f78d94..6bba930c43 100644 --- a/tests/integ/modin/series/test_add_prefix.py +++ b/tests/integ/modin/series/test_add_prefix.py @@ -46,7 +46,7 @@ def test_series_add_prefix_multiindex(prefix, multiindex_native_int_series): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("prefix", TEST_ADD_PREFIX_DATA) def test_series_add_prefix_time_column_df(prefix, time_index_series_data): series_data, kwargs = time_index_series_data diff --git a/tests/integ/modin/series/test_add_suffix.py b/tests/integ/modin/series/test_add_suffix.py index 43a98ab951..f3329c6789 100644 --- a/tests/integ/modin/series/test_add_suffix.py +++ b/tests/integ/modin/series/test_add_suffix.py @@ -46,7 +46,7 @@ def test_add_suffix_multiindex(suffix, multiindex_native_int_series): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("suffix", TEST_ADD_SUFFIX_DATA) def test_add_suffix_time_column_df(suffix, time_index_series_data): series_data, kwargs = time_index_series_data diff --git a/tests/integ/modin/series/test_at.py b/tests/integ/modin/series/test_at.py index 9452a0d736..4533c20d35 100644 --- a/tests/integ/modin/series/test_at.py +++ b/tests/integ/modin/series/test_at.py @@ -18,7 +18,7 @@ def test_at_get_default_index( ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_at_set_default_index( default_index_snowpark_pandas_series, default_index_native_series, @@ -42,7 +42,7 @@ def test_at_get_str_index( assert str_index_snowpark_pandas_series.at["b"] == str_index_native_series.at["b"] -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_at_set_str_index( str_index_snowpark_pandas_series, str_index_native_series, @@ -58,7 +58,7 @@ def at_set_helper(series): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_at_get_time_index( time_index_snowpark_pandas_series, time_index_native_series, @@ -69,7 +69,7 @@ def test_at_get_time_index( ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_at_set_time_index( time_index_snowpark_pandas_series, time_index_native_series, diff --git a/tests/integ/modin/series/test_bitwise_operators.py b/tests/integ/modin/series/test_bitwise_operators.py index eda9c536c9..ad542fd223 100644 --- a/tests/integ/modin/series/test_bitwise_operators.py +++ b/tests/integ/modin/series/test_bitwise_operators.py @@ -11,7 +11,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, @@ -47,15 +47,16 @@ def try_cast_to_snow_series(value: Any) -> Any: @pytest.mark.parametrize("value", BITWISE_TEST_DATA) -@sql_count_checker(query_count=1) def test_bitwise_unary(value): # Note: In pandas, using NaN values without specfiying a null-compatible dtype will yield an error. # SnowPandas will allow this behavior. # Note: NaN values like pd.NA, pd.NaT, np.nan will raise a TypeError: boolean value of NA is ambiguous - snow_value = try_cast_to_snow_series(value) - - eval_snowpark_pandas_result(snow_value, native_pd.Series(value), lambda s: ~s) + with SqlCounter( + query_count=1, join_count=1 if isinstance(value, native_pd.Series) else 0 + ): + snow_value = try_cast_to_snow_series(value) + eval_snowpark_pandas_result(snow_value, native_pd.Series(value), lambda s: ~s) @pytest.mark.parametrize("series", SERIES_BITWISE_TEST_DATA) @@ -121,7 +122,6 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): @pytest.mark.parametrize( "op", [operator.or_, operator.and_] ) # |, &. ^ is not supported in Snowflake -@sql_count_checker(query_count=2, join_count=2) def test_bitwise_binary_between_series(lhs, rhs, op): def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans = op(snow_lhs, snow_rhs) @@ -131,10 +131,14 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans, native_ans, lambda s: s, check_index_type=False ) - check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) + with SqlCounter( + query_count=2, + join_count=10 if isinstance(lhs.index, native_pd.MultiIndex) else 6, + ): + check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) - # commute series - check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) + # commute series + check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) # Due to differences in logical or/and in SQL and pandas' |,& implementation, behavior doesn't match here, in particular @@ -230,18 +234,21 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): ), ], ) -@sql_count_checker(query_count=1, join_count=1) def test_bitwise_binary_between_series_with_deviating_behavior_or( lhs, rhs, expected_pandas, expected_snowpark_pandas ): - snow_ans = try_cast_to_snow_series(lhs) | try_cast_to_snow_series(rhs) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_ans, expected_snowpark_pandas - ) + with SqlCounter( + query_count=1, + join_count=5 if isinstance(lhs.index, native_pd.MultiIndex) else 3, + ): + snow_ans = try_cast_to_snow_series(lhs) | try_cast_to_snow_series(rhs) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_ans, expected_snowpark_pandas + ) - # test here pandas to track any version regressions - native_ans = lhs | rhs - tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) + # test here pandas to track any version regressions + native_ans = lhs | rhs + tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) @pytest.mark.parametrize( @@ -315,16 +322,19 @@ def test_bitwise_binary_between_series_with_deviating_behavior_or( ), ], ) -@sql_count_checker(query_count=1, join_count=1) def test_bitwise_binary_between_series_with_deviating_behavior_and( lhs, rhs, expected_pandas, expected_snowpark_pandas ): - snow_ans = try_cast_to_snow_series(lhs) & try_cast_to_snow_series(rhs) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_ans, expected_snowpark_pandas - ) + with SqlCounter( + query_count=1, + join_count=5 if isinstance(lhs.index, native_pd.MultiIndex) else 3, + ): + snow_ans = try_cast_to_snow_series(lhs) & try_cast_to_snow_series(rhs) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_ans, expected_snowpark_pandas + ) - # test here pandas to track any version regressions - native_ans = lhs & rhs - print(native_ans.index) - tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) + # test here pandas to track any version regressions + native_ans = lhs & rhs + print(native_ans.index) + tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) diff --git a/tests/integ/modin/series/test_compare.py b/tests/integ/modin/series/test_compare.py index c5c927343e..8d60d7f75a 100644 --- a/tests/integ/modin/series/test_compare.py +++ b/tests/integ/modin/series/test_compare.py @@ -50,7 +50,7 @@ class TestDefaultParameters: # copying the original series's index to the final resulting dataframe # adds 1 extra query to materialize the index. query_count=QUERY_COUNT + 1, - join_count=JOIN_COUNT, + join_count=5, ) def test_no_diff(self, base_series): other_series = base_series.copy() diff --git a/tests/integ/modin/series/test_describe.py b/tests/integ/modin/series/test_describe.py index 9ecd2e33a3..32876f1608 100644 --- a/tests/integ/modin/series/test_describe.py +++ b/tests/integ/modin/series/test_describe.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_series_equal, create_test_series, @@ -129,15 +129,18 @@ def timestamp_describe_comparator(snow_res, native_res): @pytest.mark.parametrize( - "index", + "index, join_count", [ - pytest.param(None, id="default_index"), - pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), + pytest.param(None, 0, id="default_index"), + pytest.param( + ["one", "two", "three", "four", "five", "six"], 6, id="flat_index" + ), pytest.param( [ np.array(["bar", "bar", "baz", "baz", "foo", "foo"]), np.array(["one", "two", "one", "two", "one", "two"]), ], + 12, id="2D_index", ), ], @@ -151,8 +154,10 @@ def timestamp_describe_comparator(snow_res, native_res): ], ids=["ints", "floats", "objects"], ) -@sql_count_checker(query_count=1, union_count=5) -def test_describe_multiindex(data, index): - eval_snowpark_pandas_result( - *create_test_series(data, index=index), lambda ser: ser.describe() - ) +def test_describe_multiindex(data, index, join_count): + if isinstance(data[0], str) and index is not None: + join_count = 8 if len(index) == 2 else 4 + with SqlCounter(query_count=1, union_count=5, join_count=join_count): + eval_snowpark_pandas_result( + *create_test_series(data, index=index), lambda ser: ser.describe() + ) diff --git a/tests/integ/modin/series/test_empty.py b/tests/integ/modin/series/test_empty.py index a30a69116c..8e7aa9d915 100644 --- a/tests/integ/modin/series/test_empty.py +++ b/tests/integ/modin/series/test_empty.py @@ -34,7 +34,7 @@ "empty series with only index", ], ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_series_empty(args, kwargs): eval_snowpark_pandas_result( pd.Series(*args, **kwargs), diff --git a/tests/integ/modin/series/test_iat.py b/tests/integ/modin/series/test_iat.py index b3e2255403..7b9a4d4c06 100644 --- a/tests/integ/modin/series/test_iat.py +++ b/tests/integ/modin/series/test_iat.py @@ -103,7 +103,7 @@ def iat_set_helper(series): (0,), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_iat_get_time_index( key, time_index_snowpark_pandas_series, @@ -122,7 +122,7 @@ def test_iat_get_time_index( (0,), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=4) def test_iat_set_time_index( key, time_index_snowpark_pandas_series, diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py index 2ef2465b58..0d3680cff4 100644 --- a/tests/integ/modin/series/test_mask.py +++ b/tests/integ/modin/series/test_mask.py @@ -76,7 +76,7 @@ def test_series_mask_duplicate_labels(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda ser: ser.mask(ser > 3)) -@sql_count_checker(query_count=1, join_count=0) +@sql_count_checker(query_count=1, join_count=1) def test_series_mask_multi_index(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -233,7 +233,7 @@ def test_series_mask_with_scalar_cond(cond): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_mask_series_cond_unmatched_index(): data = [1, 2, 3, 4] index1 = [0, 1, 2, 3] @@ -258,9 +258,10 @@ def perform_mask(series): ) -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) -def test_series_mask_short_series_cond(index): +@pytest.mark.parametrize( + "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] +) +def test_series_mask_short_series_cond(index, join_count): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9] @@ -279,16 +280,18 @@ def perform_mask(series): else: return series.mask(native_cond, -1) - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_mask, - ) + with SqlCounter(query_count=1, join_count=join_count): + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_mask, + ) -@sql_count_checker(query_count=1, join_count=1) -@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) -def test_series_mask_long_series_cond(index): +@pytest.mark.parametrize( + "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] +) +def test_series_mask_long_series_cond(index, join_count): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9, 10, 11] @@ -307,8 +310,9 @@ def perform_mask(series): else: return series.mask(native_cond, -1) - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_mask, - ) + with SqlCounter(query_count=1, join_count=join_count): + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_mask, + ) diff --git a/tests/integ/modin/series/test_nlargest_nsmallest.py b/tests/integ/modin/series/test_nlargest_nsmallest.py index a15cc5dfb2..253230156b 100644 --- a/tests/integ/modin/series/test_nlargest_nsmallest.py +++ b/tests/integ/modin/series/test_nlargest_nsmallest.py @@ -88,7 +88,7 @@ def test_nlargest_nsmallest_non_numeric_types(method, data): assert_series_equal(getattr(snow_s, method)(n), expected_s) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=3, join_count=2) def test_nlargest_nsmallest_no_columns(method): snow_s = pd.Series(query_compiler=pd.DataFrame(index=[1, 2])._query_compiler) snow_s = snow_s diff --git a/tests/integ/modin/series/test_nunique.py b/tests/integ/modin/series/test_nunique.py index bb20e9e4a5..f2aba15ada 100644 --- a/tests/integ/modin/series/test_nunique.py +++ b/tests/integ/modin/series/test_nunique.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_values_equal, create_test_series, @@ -63,11 +63,11 @@ def test_series_nunique_deviating_nan_behavior(input_data, expected): ), ], ) -@sql_count_checker(query_count=1) def test_dataframe_nunique_multiindex(index): data = [0.1, 0.2, 0.1, 0] - eval_snowpark_pandas_result( - *create_test_series(data, index=index), - lambda ser: ser.nunique(), - comparator=assert_values_equal, - ) + with SqlCounter(query_count=1, join_count=0 if index is None else 2): + eval_snowpark_pandas_result( + *create_test_series(data, index=index), + lambda ser: ser.nunique(), + comparator=assert_values_equal, + ) diff --git a/tests/integ/modin/series/test_rank.py b/tests/integ/modin/series/test_rank.py index 2544f12e43..24801b581f 100644 --- a/tests/integ/modin/series/test_rank.py +++ b/tests/integ/modin/series/test_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -83,7 +83,6 @@ def test_series_rank_numeric_only(method, ascending, na_option): ) -@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -99,10 +98,15 @@ def test_series_rank_numeric_only(method, ascending, na_option): ) # test Series percentile rank def test_df_rank_pct(data, index, method, ascending, na_option): - snow_df = pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - native_df = native_pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) + with SqlCounter( + query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 + ): + snow_df = pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + native_df = native_pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( + snow_df, native_df + ) diff --git a/tests/integ/modin/series/test_rename.py b/tests/integ/modin/series/test_rename.py index 4ccf29706f..53873e0b2f 100644 --- a/tests/integ/modin/series/test_rename.py +++ b/tests/integ/modin/series/test_rename.py @@ -45,7 +45,7 @@ def renamer(x): # values in the variant column will be quoted assert_index_equal(renamed.index, renamed2.index.str.replace('"', "")) - @sql_count_checker(query_count=1, join_count=1) + @sql_count_checker(query_count=1, join_count=2) def test_rename_partial_dict(self): # partial dict ser = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") @@ -63,7 +63,7 @@ def test_rename_retain_index_name(self): renamed = renamer.rename({}) assert renamed.index.name == renamer.index.name - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=2, join_count=2) def test_rename_by_series(self): ser = Series(range(5), name="foo") renamer = Series({1: 10, 2: 20}) @@ -80,7 +80,7 @@ def test_rename_set_name(self): tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - @sql_count_checker(query_count=5) + @sql_count_checker(query_count=5, join_count=5) def test_rename_set_name_inplace(self): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 39358b9870..929226bc89 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -1601,7 +1601,7 @@ def test_series_setitem_with_empty_key_and_empty_series_item( else: snowpark_key = key - with SqlCounter(query_count=4): + with SqlCounter(query_count=1): native_ser[key] = item snowpark_ser[ pd.Series(snowpark_key) diff --git a/tests/integ/modin/series/test_shape.py b/tests/integ/modin/series/test_shape.py index 7bbc1270a0..ba62dfde67 100644 --- a/tests/integ/modin/series/test_shape.py +++ b/tests/integ/modin/series/test_shape.py @@ -9,7 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -34,11 +34,13 @@ "empty series with only index", ], ) -@sql_count_checker(query_count=1) def test_series_shape(args, kwargs): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.shape, - comparator=lambda x, y: x == y, - ) + with SqlCounter( + query_count=1, join_count=1 if kwargs.get("index", None) == [] else 0 + ): + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.shape, + comparator=lambda x, y: x == y, + ) diff --git a/tests/integ/modin/series/test_take.py b/tests/integ/modin/series/test_take.py index 7861686a02..2ba09be1b8 100644 --- a/tests/integ/modin/series/test_take.py +++ b/tests/integ/modin/series/test_take.py @@ -16,7 +16,7 @@ def test_series_take(): actual = ser.take([1, 3, 4]) expected = pd.Series([5, 2, 4], index=[1, 3, 4]) - with SqlCounter(query_count=2, join_count=2): + with SqlCounter(query_count=2, join_count=3): assert_series_equal(actual, expected) actual = ser.take([-1, 3, 4]) diff --git a/tests/integ/modin/series/test_to_snowflake.py b/tests/integ/modin/series/test_to_snowflake.py index 92b428f70e..f542edfa17 100644 --- a/tests/integ/modin/series/test_to_snowflake.py +++ b/tests/integ/modin/series/test_to_snowflake.py @@ -68,7 +68,7 @@ def test_to_snowflake_index_label_none_raises(test_table_name): snow_series.to_snowflake(test_table_name, if_exists="replace", index=True) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_to_snowflake_multiindex(test_table_name, snow_series): index = native_pd.MultiIndex.from_arrays( [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], names=("number", "color") diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index d82d0266e3..5c236731a0 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -1058,7 +1058,7 @@ def test_concat_sorted_frames(): ), # duplicate in frame2 ], ) -@sql_count_checker(query_count=2, union_count=1) +@sql_count_checker(query_count=2, union_count=1, join_count=1) def test_concat_duplicate_columns(columns1, columns2, expected_rows, expected_cols): df1 = pd.DataFrame([[1, 2, 3]], columns=columns1) df2 = pd.DataFrame([[4, 5, 6]], columns=columns2) @@ -1123,7 +1123,7 @@ def test_concat_from_file(resources_path): ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=5) def test_concat_keys(): native_data = { "one": native_pd.Series([1, 2, 3], index=["a", "b", "c"]), @@ -1180,5 +1180,6 @@ def test_df_creation_from_series_from_same_df(): @sql_count_checker(query_count=0) def test_concat_timedelta_not_implemented(df1): + df1 = pd.DataFrame(df1) with pytest.raises(NotImplementedError): pd.concat([df1, df1, df1.astype({"C": "timedelta64[ns]"})]) diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 8490bb808f..c9e29147ae 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -572,7 +572,7 @@ def test_from_pandas_series_with_tuple_name(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_series_to_pandas(): array = ["a", "b", "c"] pandas_series = native_pd.Series(data=array, index=array) @@ -632,7 +632,7 @@ def test_snowpark_pandas_statement_params(): assert "efg" == mock_to_pandas.call_args.kwargs["statement_params"]["abc"] -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=5) def test_create_df_from_series(): native_data = { "one": native_pd.Series([1, 2, 3], index=["a", "b", "c"]), diff --git a/tests/integ/modin/test_internal_frame.py b/tests/integ/modin/test_internal_frame.py index da38322b9a..c7a95fa601 100644 --- a/tests/integ/modin/test_internal_frame.py +++ b/tests/integ/modin/test_internal_frame.py @@ -38,7 +38,7 @@ def test_strip_duplicates(input, expected): assert_frame_equal(result, pd.DataFrame(expected)) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_strip_duplicates_after_sort(): df = pd.DataFrame({"A": [0, 1, 0, 1, 2], "B": [1, 2, 3, 4, 5]}) df = df.sort_values(by="B", ascending=False) diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py index cafbd08f36..43b9ef263f 100644 --- a/tests/integ/modin/test_numpy.py +++ b/tests/integ/modin/test_numpy.py @@ -113,7 +113,7 @@ def test_np_where_notimplemented(): ) -@sql_count_checker(query_count=5, join_count=4) +@sql_count_checker(query_count=5, join_count=7) def test_scalar(): pdf_scalar = native_pd.DataFrame([[99, 99], [99, 99]]) sdf_scalar = pd.DataFrame([[99, 99], [99, 99]]) @@ -172,7 +172,7 @@ def test_different_inputs(cond, x, y): assert_array_equal(sp_result, np_orig_result) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2, join_count=3) def test_broadcast_scalar_x_df(): input_df = native_pd.DataFrame([[False, True], [False, True]]) input_df2 = native_pd.DataFrame([[1, 0], [0, 1]]) @@ -183,7 +183,7 @@ def test_broadcast_scalar_x_df(): assert_array_equal(snow_result, np_result) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2, join_count=3) def test_broadcast_scalar_x_ser(): input_ser = native_pd.Series([False, True]) input_ser2 = native_pd.Series([1, 0]) From f4a80f385f800c8b529ef1b6c452a78bb9731280 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 10:48:24 -0700 Subject: [PATCH 23/42] fix loc and iloc tests --- tests/integ/modin/binary/test_binary_op.py | 24 +++++--- tests/integ/modin/frame/test_fillna.py | 2 +- tests/integ/modin/frame/test_getitem.py | 2 +- tests/integ/modin/frame/test_iloc.py | 46 ++++++++------- tests/integ/modin/frame/test_loc.py | 56 +++++++++---------- tests/integ/modin/frame/test_merge.py | 6 +- tests/integ/modin/frame/test_transpose.py | 2 +- tests/integ/modin/frame/test_where.py | 4 +- .../index/test_datetime_index_methods.py | 2 +- .../modin/resample/test_resample_negative.py | 2 +- tests/integ/modin/series/test_all_any.py | 2 +- tests/integ/modin/series/test_empty.py | 19 ++++--- tests/integ/modin/series/test_iloc.py | 34 +++++------ tests/integ/modin/series/test_loc.py | 50 +++++++++-------- tests/integ/modin/test_telemetry.py | 2 +- 15 files changed, 135 insertions(+), 118 deletions(-) diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index cd036bcb04..4f50096ce4 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -1289,20 +1289,26 @@ def test_other_with_native_pandas_object_raises(op): ], ) @pytest.mark.parametrize("op", [operator.add]) -@sql_count_checker(query_count=2, join_count=2) def test_binary_add_between_series_for_index_alignment(lhs, rhs, op): def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans = op(snow_lhs, snow_rhs) native_ans = op(native_lhs, native_rhs) - # for one multi-index test case (marked with comment) the "inferred_type" doesn't match (Snowpark: float vs. pandas integer) - eval_snowpark_pandas_result( - snow_ans, native_ans, lambda s: s, check_index_type=False - ) + with SqlCounter( + query_count=2, join_count=10 if isinstance(lhs.index, pd.MultiIndex) else 6 + ): + # for one multi-index test case (marked with comment) the "inferred_type" doesn't match (Snowpark: float vs. pandas integer) + eval_snowpark_pandas_result( + snow_ans, native_ans, lambda s: s, check_index_type=False + ) - check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) + check_op( + lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs) + ) - # commute series - check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) + # commute series + check_op( + rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs) + ) # MOD TESTS @@ -1872,7 +1878,7 @@ def test_binary_rpow_between_df_and_list_like_on_axis_1(rhs): "rmod", ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_generated_docstring_examples(opname): # test for operators that correct examples are generated and match up with pandas. # if this test passes, this ensures that all the examples generated in utils.py will be correct. diff --git a/tests/integ/modin/frame/test_fillna.py b/tests/integ/modin/frame/test_fillna.py index 677c8d3ddc..6ae668d694 100644 --- a/tests/integ/modin/frame/test_fillna.py +++ b/tests/integ/modin/frame/test_fillna.py @@ -426,7 +426,7 @@ def test_multiindex_df_values_dict_various_levels(test_fillna_multiindex_df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=3) def test_multiindex_df_values_series(test_fillna_multiindex_df, test_fillna_multiindex): values = pd.Series([10, 1, 2, 3], index=test_fillna_multiindex) native_values = native_pd.Series([10, 1, 2, 3], index=test_fillna_multiindex) diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index fd4ede77d7..e08e25513a 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -343,7 +343,7 @@ def test_df_getitem_with_slice( slice("z", "a", -1), ], ) -@sql_count_checker(query_count=1, join_count=0) +@sql_count_checker(query_count=1, join_count=1) def test_df_getitem_with_non_int_slice(key): data = {"a": [1, 2, 3], "b": [4, 5, 6]} index = ["x", "y", "z"] diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index cb69f78172..d8b874b1cf 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -118,7 +118,7 @@ ("RangeIndex", 0), ("Index[bool]", 1), ("emptyFloatSeries", 2), - ("multi_index_Series", 2), + ("multi_index_Series", 6), ] # Snowflake type checking will fail if the item values aren't type compatible, so we normalize to int to stay compatible. @@ -315,7 +315,10 @@ def eval_func(df): if key == "RangeIndex": expected_query_count = 1 - with SqlCounter(query_count=expected_query_count, join_count=0): + with SqlCounter( + query_count=expected_query_count, + join_count=4 if key == "multi_index_Series" else 0, + ): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, eval_func ) @@ -445,7 +448,7 @@ def test_df_iloc_get_diff2native( ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2, join_count=8) def test_df_iloc_get_with_conflict(): # index and data columns have conflict in get_by_col df = DataFrame({"A": [0, 1]}, index=native_pd.Index([2, 3], name="A")).rename( @@ -2619,34 +2622,34 @@ def perform_iloc(df): @pytest.mark.parametrize( - "row_key, row_key_index", + "row_key, row_key_index, row_add_joins", [ - [1, None], - [[3, 0], None], - [[1, 2], [("A",), ("B",)]], - [[2, 1], [("A", 1), ("B", 2)]], + [1, None, 0], + [[3, 0], None, 0], + [[1, 2], [("A",), ("B",)], 1], + [[2, 1], [("A", 1), ("B", 2)], 2], ], ) @pytest.mark.parametrize( - "col_key, col_key_index", + "col_key, col_key_index, col_add_joins", [ - [2, None], - [[2, 1], None], - [[1, 2], [("X",), ("Y",)]], - [[2, 1], [("X", 11), ("Y", 21)]], + [2, None, 0], + [[2, 1], None, 0], + [[1, 2], [("X",), ("Y",)], 1], + [[2, 1], [("X", 11), ("Y", 21)], 2], ], ) @pytest.mark.parametrize( "item_values, item_index, item_columns, expected_join_count", [ - [999, None, None, 2], - [TEST_ITEMS_DATA_2X2, None, None, 3], - [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], None, 5], - [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], [("e", 5), ("f", 6)], 5], - [TEST_ITEMS_DATA_2X2, None, [("e", 5), ("f", 6)], 3], + [999, None, None, 6], + [TEST_ITEMS_DATA_2X2, None, None, 7], + [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], None, 9], + [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], [("e", 5), ("f", 6)], 9], + [TEST_ITEMS_DATA_2X2, None, [("e", 5), ("f", 6)], 7], ], ) -def test_df_iloc_set_with_multi_index( +def test_df_iloc_set_with_multiindex( row_key, row_key_index, col_key, @@ -2655,6 +2658,8 @@ def test_df_iloc_set_with_multi_index( item_index, item_columns, expected_join_count, + row_add_joins, + col_add_joins, ): df_data = [ [1, 2, 3, 4, 5], @@ -2733,6 +2738,7 @@ def helper_iloc(df): if isinstance(snow_col_key, pd.Series): expected_query_count += 1 + expected_join_count += row_add_joins + col_add_joins with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): eval_snowpark_pandas_result(snow_df, native_df, helper_iloc, inplace=True) @@ -2808,7 +2814,7 @@ def iloc_helper(df: Union[pd.DataFrame, native_pd.DataFrame]) -> None: # For a Series row key, the key is joined with the df to derive the iloc results. For column keys, a select # statement is used instead of a join. - join_count = 2 if axis == "row" else 0 + join_count = 4 if axis == "row" else 2 query_count = 1 if axis == "row" else 2 # Evaluate with MultiIndex created from tuples. diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index ec9e93a77f..d94f9f21d0 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -215,7 +215,7 @@ def test_df_loc_get_col_non_boolean_key( "key", boolean_indexer, ) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_df_loc_get_col_boolean_indexer( key, str_index_snowpark_pandas_df, str_index_native_df ): @@ -243,7 +243,7 @@ def test_df_loc_get_col_boolean_indexer( "key", list_like_time_col_inputs, ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_get_col_time_df( key, time_column_snowpark_pandas_df, time_column_native_df ): @@ -258,7 +258,7 @@ def test_df_loc_get_col_time_df( "key", snowpark_pandas_int_index_row_inputs, ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_int_index_row_snowpark_pandas_input( key, default_index_snowpark_pandas_df, @@ -606,7 +606,7 @@ def test_mi_df_loc_get_non_boolean_list_tuple_key(mi_table_df, row, col): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2, join_count=4) def test_mi_df_loc_get_boolean_series_row_key(mi_table_df): df = pd.DataFrame(mi_table_df) bool_indexer = [False, True, True, False, False, True] @@ -639,7 +639,7 @@ def test_mi_df_loc_get_boolean_series_row_key(mi_table_df): ) -@sql_count_checker(query_count=3, join_count=0) +@sql_count_checker(query_count=3, join_count=2) def test_mi_df_loc_get_boolean_series_col_key(mi_table_df): df = pd.DataFrame(mi_table_df) bool_indexer = [False, True] @@ -1448,11 +1448,9 @@ def helper(df): snow_df.to_pandas() else: expected_query_count = 1 - expected_join_count = 1 + expected_join_count = 2 if key == slice(None): expected_join_count = 0 - elif isinstance(key, slice) and key.step == 2: - expected_join_count += 1 with SqlCounter( query_count=expected_query_count, join_count=expected_join_count @@ -1672,7 +1670,7 @@ def test_df_loc_get_key_bool_self_series(): [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index): # aligned indices means both row_pos and index are exactly match if use_default_index: @@ -1701,7 +1699,7 @@ def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index) [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_key_bool_series_with_unaligned_and_distinct_indices( key, use_default_index ): @@ -1778,7 +1776,7 @@ def test_df_loc_get_key_bool_series_with_unaligned_and_duplicate_indices(): ], # larger length ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_key_bool_series_with_mismatch_index_len(key, use_default_index): if use_default_index: index = None @@ -2406,7 +2404,7 @@ def loc_set_helper(df): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_df_loc_set_scalar_row_key_enlargement( row_key, col_key, item_values, data_index ): @@ -2478,7 +2476,7 @@ def set_loc_helper(df): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_df_loc_set_scalar_row_key_enlargement_deviates_from_native_pandas( row_key, col_key, item_values, data_index ): @@ -3203,7 +3201,7 @@ def test_df_loc_set_boolean_series_with_non_default_index_key_and_scalar_item(): ["duplicate", [1, 1, 2, 3]], ], ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=5) def test_df_loc_set_duplicate_index( self_index_type, self_index_val, index, columns, item ): @@ -3784,7 +3782,7 @@ def loc_set_helper(df): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_setitem_boolean_key(key, index): item = 99 num_columns = 3 @@ -3862,7 +3860,7 @@ def test_df_single_value_with_slice_key(): eval_snowpark_pandas_result(snowpark_df, native_df, lambda df: df.loc[0:1]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_df_loc_set_none(): native_df = native_pd.DataFrame({"a": [1, 2, 3]}) @@ -3885,7 +3883,7 @@ def loc_set_helper(df): ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=4) def test_df_loc_set_with_index_and_column_labels(): """ Create a DataFrame using 3 Series objects and perform loc set with a scalar. @@ -3932,25 +3930,25 @@ def test_raise_set_cell_with_list_like_value_error(): pytest.param( "1 day", 2, - 3, + 4, marks=pytest.mark.xfail( reason="SNOW-1652608 result series name incorrectly set" ), - ), # 1 join from squeeze, 2 joins from to_pandas during eval + ), pytest.param( native_pd.to_timedelta("1 day"), 2, - 3, + 4, marks=pytest.mark.xfail( reason="SNOW-1652608 result series name incorrectly set" ), - ), # 1 join from squeeze, 2 joins from to_pandas during eval - (["1 day", "3 days"], 1, 1), - ([True, False, False], 1, 1), - (slice(None, "4 days"), 1, 0), - (slice(None, "4 days", 2), 1, 0), - (slice("1 day", "2 days"), 1, 0), - (slice("1 day 1 hour", "2 days 2 hours", -1), 1, 0), + ), + (["1 day", "3 days"], 1, 2), + ([True, False, False], 1, 2), + (slice(None, "4 days"), 1, 1), + (slice(None, "4 days", 2), 1, 1), + (slice("1 day", "2 days"), 1, 1), + (slice("1 day 1 hour", "2 days 2 hours", -1), 1, 1), ], ) def test_df_loc_get_with_timedelta(key, query_count, join_count): @@ -4017,7 +4015,7 @@ def test_df_loc_get_with_timedelta(key, query_count, join_count): ), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result): # In these test cases, native pandas raises a KeyError but Snowpark pandas works correctly. data = { @@ -4037,7 +4035,7 @@ def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result): assert_frame_equal(actual_result, expected_result) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_df_loc_get_with_timedeltaindex_key(): data = { "A": [1, 2, 3], diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 8b9b5472e3..15ad41a580 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -302,7 +302,7 @@ def test_merge_on_index_columns(left_df, right_df, how, on, sort): @pytest.mark.parametrize("index1", [[3, 4], [1.5, 8.0], [None, None]]) @pytest.mark.parametrize("index2", [[7, 8], [1.5, 3.0], [None, None]]) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3, join_count=5) def test_join_type_mismatch(index1, index2): df1 = pd.DataFrame({"A": [1, 2]}, index=index1) df2 = pd.DataFrame({"B": [3, 4]}, index=index2) @@ -351,7 +351,7 @@ def test_join_type_mismatch_negative(index1, index2): ), ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_join_type_mismatch_diff_with_native_pandas(index1, index2, expected_res): df1 = pd.DataFrame({"A": [1, 2]}, index=index1) df2 = pd.DataFrame({"B": [3, 4]}, index=index2) @@ -960,7 +960,7 @@ def test_merge_no_join_keys_negative(left_name, right_name, left_df, right_df): ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_merge_no_join_keys_common_index_negative(left_df, right_df): left_df = pd.DataFrame({"A": [1, 2, 3]}, native_pd.Index([7, 8, 9], name="KEY")) right_df = pd.DataFrame({"B": [1, 2, 3]}, native_pd.Index([7, 8, 9], name="KEY")) diff --git a/tests/integ/modin/frame/test_transpose.py b/tests/integ/modin/frame/test_transpose.py index 894bbbbc1b..469a66dd51 100644 --- a/tests/integ/modin/frame/test_transpose.py +++ b/tests/integ/modin/frame/test_transpose.py @@ -242,7 +242,7 @@ def test_dataframe_transpose_preserve_float_dtypes(): assert all([dtype == "float64" for dtype in snow_df.T.dtypes]) -@sql_count_checker(query_count=1, union_count=1) +@sql_count_checker(query_count=1, union_count=1, join_count=2) def test_dataframe_transpose_single_numeric_column(): single_column_data = ({0: "A", 1: "B", 2: "C", 3: "D"},) native_df = native_pd.DataFrame(single_column_data, index=(0,)) diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 006b7e76fb..75a5d6db7a 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -902,7 +902,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=4) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -947,7 +947,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=3, union_count=1) +@sql_count_checker(query_count=2, join_count=5, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py index 56fd40a6cb..b727b4750e 100644 --- a/tests/integ/modin/index/test_datetime_index_methods.py +++ b/tests/integ/modin/index/test_datetime_index_methods.py @@ -89,7 +89,7 @@ def test_non_default_args(kwargs): pd.DatetimeIndex(query_compiler=idx._query_compiler, **kwargs) -@sql_count_checker(query_count=6) +@sql_count_checker(query_count=6, join_count=6) def test_index_parent(): """ Check whether the parent field in Index is updated properly. diff --git a/tests/integ/modin/resample/test_resample_negative.py b/tests/integ/modin/resample/test_resample_negative.py index e20fc397ef..44319c120b 100644 --- a/tests/integ/modin/resample/test_resample_negative.py +++ b/tests/integ/modin/resample/test_resample_negative.py @@ -137,7 +137,7 @@ def test_resample_fillna_invalid_method(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_resample_tz_negative(): snow_df = pd.DataFrame( {"a": range(3)}, diff --git a/tests/integ/modin/series/test_all_any.py b/tests/integ/modin/series/test_all_any.py index 0f78b320fe..d0d1c0987a 100644 --- a/tests/integ/modin/series/test_all_any.py +++ b/tests/integ/modin/series/test_all_any.py @@ -65,7 +65,7 @@ def test_any_int(data): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_all_named_index(): data = [1, 0, 3] index_name = ["a", "b", "c"] diff --git a/tests/integ/modin/series/test_empty.py b/tests/integ/modin/series/test_empty.py index 8e7aa9d915..d53cd6e3d5 100644 --- a/tests/integ/modin/series/test_empty.py +++ b/tests/integ/modin/series/test_empty.py @@ -9,7 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import sql_count_checker +from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -34,14 +34,17 @@ "empty series with only index", ], ) -@sql_count_checker(query_count=1, join_count=1) def test_series_empty(args, kwargs): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.empty, - comparator=lambda x, y: x == y, - ) + with SqlCounter( + query_count=1, + join_count=1 if (args == [] and kwargs.get("index", None) == []) else 0, + ): + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.empty, + comparator=lambda x, y: x == y, + ) @sql_count_checker(query_count=5, join_count=2) diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index b35681e4ee..eea764af40 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -78,7 +78,7 @@ def test_diff2native(default_index_snowpark_pandas_series, default_index_native_ "key, val", setitem_key_val_pair, ) -def test_setitem( +def test_series_iloc_setitem( key, val, default_index_native_int_snowpark_pandas_series, @@ -89,7 +89,7 @@ def operation(ser): # Based on snowflake type results, the result becomes 'str' type so we normalize to float for comparison. return ser.astype("float") - expected_join_count = 3 if isinstance(val, list) else 2 + expected_join_count = 5 if isinstance(val, list) else 4 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( default_index_native_int_snowpark_pandas_series, @@ -777,25 +777,25 @@ def perform_iloc(df): @pytest.mark.parametrize( - "row_key, row_key_index", + "row_key, row_key_index, add_joins", [ - [1, None], - [[3, 0], None], - [[1, 2], [("A",), ("B",)]], - [[2, 1], [("A", 1), ("B", 2)]], + [1, None, 0], + [[3, 0], None, 0], + [[1, 2], [("A",), ("B",)], 1], + [[2, 1], [("A", 1), ("B", 2)], 2], ], ) @pytest.mark.parametrize( "item_values, item_index, expected_join_count", [ - [999, None, 2], - [TEST_ITEMS_DATA_2X1, None, 3], - [TEST_ITEMS_DATA_2X1, [("r",), ("s",)], 4], - [TEST_ITEMS_DATA_2X1, [("r", 20), ("s", 25)], 5], + [999, None, 6], + [TEST_ITEMS_DATA_2X1, None, 7], + [TEST_ITEMS_DATA_2X1, [("r",), ("s",)], 8], + [TEST_ITEMS_DATA_2X1, [("r", 20), ("s", 25)], 9], ], ) -def test_df_iloc_set_with_multi_index( - row_key, row_key_index, item_values, item_index, expected_join_count +def test_df_iloc_set_with_multiindex( + row_key, row_key_index, item_values, item_index, expected_join_count, add_joins ): ser_data = [10, 11, 12, 13, 14] row_index = pd.MultiIndex.from_tuples( @@ -835,7 +835,7 @@ def helper_iloc(ser): else: ser.iloc[snow_row_key] = snow_items - with SqlCounter(query_count=1, join_count=expected_join_count): + with SqlCounter(query_count=1, join_count=expected_join_count + add_joins): eval_snowpark_pandas_result(snow_ser, native_ser, helper_iloc, inplace=True) @@ -851,7 +851,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with default index - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=4): eval_snowpark_pandas_result( default_index_int_series, default_index_native_int_series, @@ -859,7 +859,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with non default index - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=4): eval_snowpark_pandas_result( int_series_with_non_default_index, native_int_series_with_non_default_index, @@ -867,7 +867,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with MultiIndex - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=4): eval_snowpark_pandas_result( int_series_with_multiindex, multiindex_native_int_series, diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 70af8247bd..da13247cd7 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -224,7 +224,6 @@ def apply_loc(df): [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=1, join_count=1) def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_index): # aligned indices means both row_pos and index are exactly match if use_default_index: @@ -234,13 +233,14 @@ def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_in index = native_pd.Index(["a", "a", None, "b", "b"], name="index") native_series = native_pd.Series([1, 2, 3, 4, 5], index=index) snow_series = pd.Series(native_series) - eval_snowpark_pandas_result( - snow_series, - native_series, - lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=index, dtype="bool")], - ) + with SqlCounter(query_count=1, join_count=1 if use_default_index else 2): + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=index, dtype="bool")], + ) @pytest.mark.parametrize( @@ -861,7 +861,7 @@ def loc_set_helper(s): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=3) def test_series_loc_set_scalar_row_key_enlargement(row_key, item_values, ser_index): data = [1, 2, 3, 4] @@ -1407,7 +1407,7 @@ def test_series_loc_set_slice_item_negative(key, default_index_native_series): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_loc_set_boolean_key(key, index): # series.loc[True/False key] = scalar item # ---------------------------------------- @@ -1596,7 +1596,7 @@ def test_series_loc_set_with_scalar_key_and_list_like_item( assert_series_equal(snowpark_ser, native_ser) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize("key", SCALAR_LIKE_VALUES) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) def test_series_loc_set_with_scalar_key_and_scalar_item( @@ -1776,7 +1776,7 @@ def test_series_partial_string_indexing_behavior_diff(): assert len(series_minute["2022"]) == 0 -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_series_loc_set_none(): # Note that pandas does not support df.loc[None,:] like the series does here. native_s = native_pd.Series([1, 2, 3]) @@ -1792,18 +1792,22 @@ def loc_set_helper(s): @pytest.mark.parametrize( "key, query_count, join_count", [ - ("1 day", 2, 2), # 1 join from squeeze, 1 join from to_pandas during eval ( - native_pd.to_timedelta("1 day"), + "1 day", 2, + 4, + ), # 1 join from series creation (double counted), 1 join from squeeze, 1 join from to_pandas during eval + ( + native_pd.to_timedelta("1 day"), 2, - ), # 1 join from squeeze, 1 join from to_pandas during eval - (["1 day", "3 days"], 1, 1), - ([True, False, False], 1, 1), - (slice(None, "4 days"), 1, 0), - (slice(None, "4 days", 2), 1, 0), - (slice("1 day", "2 days"), 1, 0), - (slice("1 day 1 hour", "2 days 2 hours", 1), 1, 0), + 4, + ), # 1 join from series creation (double counted), 1 join from squeeze, 1 join from to_pandas during eval + (["1 day", "3 days"], 1, 2), + ([True, False, False], 1, 2), + (slice(None, "4 days"), 1, 1), + (slice(None, "4 days", 2), 1, 1), + (slice("1 day", "2 days"), 1, 1), + (slice("1 day 1 hour", "2 days 2 hours", 1), 1, 1), ], ) def test_series_loc_get_with_timedelta(key, query_count, join_count): @@ -1854,7 +1858,7 @@ def test_series_loc_get_with_timedelta(key, query_count, join_count): ), ], ) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_loc_get_with_timedelta_behavior_difference(key, expected_result): data = ["A", "B", "C", "D"] idx = ["1 days", "2 days", "3 days", "25 hours"] @@ -1869,7 +1873,7 @@ def test_series_loc_get_with_timedelta_behavior_difference(key, expected_result) assert_series_equal(actual_result, expected_result) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=2, join_count=2) def test_series_loc_get_with_timedeltaindex_key(): data = ["A", "B", "C"] idx = ["1 days", "2 days", "3 days"] diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index 80317357af..ec1710980a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -342,7 +342,7 @@ def test_telemetry_with_update_inplace(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_telemetry_with_resample(): # verify api_calls have been collected correctly for Resample APIs index = pandas.date_range("1/1/2000", periods=9, freq="min") From ce1ffa65b8e7795266aeec8f343f22e0b40e5c84 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 11:24:59 -0700 Subject: [PATCH 24/42] fix test --- tests/integ/modin/binary/test_binary_op.py | 24 ++++++++----------- tests/integ/modin/frame/test_loc.py | 18 +++++++------- .../modin/groupby/test_groupby_basic_agg.py | 2 +- 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index 4f50096ce4..ff83a1b8c7 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -1293,22 +1293,18 @@ def test_binary_add_between_series_for_index_alignment(lhs, rhs, op): def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans = op(snow_lhs, snow_rhs) native_ans = op(native_lhs, native_rhs) - with SqlCounter( - query_count=2, join_count=10 if isinstance(lhs.index, pd.MultiIndex) else 6 - ): - # for one multi-index test case (marked with comment) the "inferred_type" doesn't match (Snowpark: float vs. pandas integer) - eval_snowpark_pandas_result( - snow_ans, native_ans, lambda s: s, check_index_type=False - ) + # for one multi-index test case (marked with comment) the "inferred_type" doesn't match (Snowpark: float vs. pandas integer) + eval_snowpark_pandas_result( + snow_ans, native_ans, lambda s: s, check_index_type=False + ) - check_op( - lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs) - ) + with SqlCounter( + query_count=2, join_count=10 if isinstance(lhs.index, pd.MultiIndex) else 6 + ): + check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) - # commute series - check_op( - rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs) - ) + # commute series + check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) # MOD TESTS diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index d94f9f21d0..d75b16658d 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -1670,7 +1670,6 @@ def test_df_loc_get_key_bool_self_series(): [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=1, join_count=2) def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index): # aligned indices means both row_pos and index are exactly match if use_default_index: @@ -1681,14 +1680,15 @@ def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index) native_df = native_pd.DataFrame( {"c1": [1, 2, 3, 4, 5], "c2": ["x", "y", "z", "d", "e"]}, index=index ) - snow_df = pd.DataFrame(native_df) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.loc[pd.Series(key, index=index, dtype="bool")] - if isinstance(df, pd.DataFrame) - else df.loc[native_pd.Series(key, index=index, dtype="bool")], - ) + with SqlCounter(query_count=1, join_count=1 if use_default_index else 2): + snow_df = pd.DataFrame(native_df) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.loc[pd.Series(key, index=index, dtype="bool")] + if isinstance(df, pd.DataFrame) + else df.loc[native_pd.Series(key, index=index, dtype="bool")], + ) @pytest.mark.parametrize( diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index 09acd49bb2..f3002901d0 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -951,7 +951,7 @@ def test_groupby_with_level(df_multi, level): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=2) def test_groupby_with_hier_columns(): tuples = list( zip( From 00d2a8b6af28792b3dbc58b663cc51e91740280e Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 12:04:42 -0700 Subject: [PATCH 25/42] fix test --- tests/integ/modin/series/test_all_any.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/modin/series/test_all_any.py b/tests/integ/modin/series/test_all_any.py index d0d1c0987a..517252e7af 100644 --- a/tests/integ/modin/series/test_all_any.py +++ b/tests/integ/modin/series/test_all_any.py @@ -77,7 +77,7 @@ def test_all_named_index(): ) -@sql_count_checker(query_count=1) +@sql_count_checker(query_count=1, join_count=1) def test_any_named_index(): data = [1, 0, 3] index_name = ["a", "b", "c"] From cb918495a48615d5a19747c4476be00e3f6db60e Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 12:22:47 -0700 Subject: [PATCH 26/42] fix last valid index error --- src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py index c2c224e404..d121baf823 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py @@ -128,6 +128,7 @@ def get_valid_index_values( ------- Optional[Row]: The desired index (a Snowpark Row) if it exists, else None. """ + frame = frame.ensure_row_position_column() index_quoted_identifier = frame.index_column_snowflake_quoted_identifiers data_quoted_identifier = frame.data_column_snowflake_quoted_identifiers row_position_quoted_identifier = frame.row_position_snowflake_quoted_identifier From d9fdbb06127b0cacd16b5bb15a0edf15af700676 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 12:34:00 -0700 Subject: [PATCH 27/42] remove stuff unnecessarily commented out --- src/snowflake/snowpark/modin/plugin/docstrings/base.py | 2 +- tests/integ/modin/frame/test_describe.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/base.py b/src/snowflake/snowpark/modin/plugin/docstrings/base.py index 657da3e528..4eb1bd1584 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/base.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/base.py @@ -1649,7 +1649,7 @@ def last_valid_index(): >>> df.last_valid_index() 12 >>> df = pd.DataFrame([5, 6, 7, 8], index=["i", "am", "iron", "man"]) - >>> df.last_valid_index() # doctest: +SKIP + >>> df.last_valid_index() 'man' """ diff --git a/tests/integ/modin/frame/test_describe.py b/tests/integ/modin/frame/test_describe.py index 28425ab695..a9668c5794 100644 --- a/tests/integ/modin/frame/test_describe.py +++ b/tests/integ/modin/frame/test_describe.py @@ -255,8 +255,8 @@ def timestamp_describe_comparator(snow_res, native_res): @pytest.mark.parametrize( "index", [ - # pytest.param(None, id="default_index"), - # pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), + pytest.param(None, id="default_index"), + pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), pytest.param( [ np.array(["bar", "bar", "baz", "baz", "foo", "foo"]), From 3d5b785ef43707ca26bffa7aaaa928be3729487d Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 12:37:49 -0700 Subject: [PATCH 28/42] explain high query count --- tests/integ/modin/binary/test_binary_op.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index ff83a1b8c7..5732eca6cc 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -1298,6 +1298,13 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans, native_ans, lambda s: s, check_index_type=False ) + # The join count is high because: + # - When creating a single index Series, 1 join is performed; four series are created. + # Therefore, 4 joins are performed. Each binary operation uses 1 join; two operations are performed. + # This results in 6 joins. + # - Similarly, when creating a MultiIndex Series, 1 join is performed per column in the MultiIndex, in our case + # there are two columns. Four Series are created, resulting in 8 joins. Each binary operation uses 1 join; + # two operations are performed. This results in 10 joins. with SqlCounter( query_count=2, join_count=10 if isinstance(lhs.index, pd.MultiIndex) else 6 ): From 7f9dbaa4c2937a1490fb0a8ac7db85acf357fe3e Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 9 Sep 2024 14:27:45 -0700 Subject: [PATCH 29/42] rewrite binary op test, fix coverage --- .../snowpark/modin/pandas/dataframe.py | 6 ++--- .../plugin/extensions/series_overrides.py | 2 +- tests/integ/modin/binary/test_binary_op.py | 26 ++++++------------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 9aa1b1fb26..1c8f9f084a 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -195,7 +195,7 @@ def __init__( # the DataFrame and sets columns to the columns provided. query_compiler = from_pandas( self.__constructor__(columns=columns) - )._query_compiler + )._query_compiler # pragma: no cover elif isinstance(data, DataFrame): # CASE 5: data is a Snowpark pandas DataFrame @@ -256,7 +256,7 @@ def __init__( if dtype is not None: new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) if index is not None: - if isinstance(index, Index): + if isinstance(index, Index): # pragma: no cover index = index.to_series()._query_compiler elif isinstance(index, Series): index = index._query_compiler @@ -318,7 +318,7 @@ def __init__( if isinstance(labels, Index): labels = labels.to_series()._query_compiler elif isinstance(labels, Series): - labels = labels._query_compiler + labels = labels._query_compiler # pragma: no cover else: labels = Index(labels).to_series()._query_compiler query_compiler = query_compiler.reindex(axis=0, labels=labels) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 492098c2b6..e4f3f4856d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -441,7 +441,7 @@ def __init__( if isinstance(labels, Index): labels = labels.to_series()._query_compiler elif isinstance(labels, Series): - labels = labels._query_compiler + labels = labels._query_compiler # pragma: no cover else: labels = Index(labels).to_series()._query_compiler query_compiler = query_compiler.reindex(axis=0, labels=labels) diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index 5732eca6cc..9bc8dd9f43 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -17,7 +17,6 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native -from tests.integ.modin.series.test_bitwise_operators import try_cast_to_snow_series from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equal_to_pandas, @@ -1294,24 +1293,15 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans = op(snow_lhs, snow_rhs) native_ans = op(native_lhs, native_rhs) # for one multi-index test case (marked with comment) the "inferred_type" doesn't match (Snowpark: float vs. pandas integer) - eval_snowpark_pandas_result( - snow_ans, native_ans, lambda s: s, check_index_type=False - ) - - # The join count is high because: - # - When creating a single index Series, 1 join is performed; four series are created. - # Therefore, 4 joins are performed. Each binary operation uses 1 join; two operations are performed. - # This results in 6 joins. - # - Similarly, when creating a MultiIndex Series, 1 join is performed per column in the MultiIndex, in our case - # there are two columns. Four Series are created, resulting in 8 joins. Each binary operation uses 1 join; - # two operations are performed. This results in 10 joins. - with SqlCounter( - query_count=2, join_count=10 if isinstance(lhs.index, pd.MultiIndex) else 6 - ): - check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) + with SqlCounter(query_count=1, join_count=1): + eval_snowpark_pandas_result( + snow_ans, native_ans, lambda s: s, check_index_type=False + ) - # commute series - check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) + snow_lhs, snow_rhs = pd.Series(lhs), pd.Series(rhs) + check_op(lhs, rhs, snow_lhs, snow_rhs) + # commute series + check_op(rhs, lhs, snow_rhs, snow_lhs) # MOD TESTS From 6de9f4924e30692e5815158e39a10aa63fcb43dc Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 11 Sep 2024 14:19:50 -0700 Subject: [PATCH 30/42] fix tests --- .../snowpark/modin/pandas/dataframe.py | 7 +- .../plugin/extensions/series_overrides.py | 6 +- tests/integ/modin/binary/test_binary_op.py | 2 +- tests/integ/modin/frame/test_add_prefix.py | 2 +- tests/integ/modin/frame/test_add_suffix.py | 2 +- tests/integ/modin/frame/test_assign.py | 4 +- tests/integ/modin/frame/test_astype.py | 2 +- tests/integ/modin/frame/test_at.py | 8 +- tests/integ/modin/frame/test_axis.py | 2 +- tests/integ/modin/frame/test_copy.py | 6 +- tests/integ/modin/frame/test_drop.py | 6 +- tests/integ/modin/frame/test_dtypes.py | 40 ++++---- tests/integ/modin/frame/test_fillna.py | 2 +- tests/integ/modin/frame/test_getitem.py | 2 +- tests/integ/modin/frame/test_iat.py | 4 +- tests/integ/modin/frame/test_idxmax_idxmin.py | 23 ++--- tests/integ/modin/frame/test_iloc.py | 46 +++++----- tests/integ/modin/frame/test_insert.py | 92 +++++++++---------- tests/integ/modin/frame/test_loc.py | 32 ++++--- tests/integ/modin/frame/test_mask.py | 4 +- tests/integ/modin/frame/test_merge.py | 6 +- tests/integ/modin/frame/test_nunique.py | 10 +- tests/integ/modin/frame/test_rank.py | 40 ++++---- tests/integ/modin/frame/test_reindex.py | 6 +- tests/integ/modin/frame/test_rename.py | 4 +- tests/integ/modin/frame/test_repr.py | 2 +- tests/integ/modin/frame/test_setitem.py | 6 +- tests/integ/modin/frame/test_stack.py | 2 +- tests/integ/modin/frame/test_transpose.py | 2 +- tests/integ/modin/frame/test_where.py | 4 +- .../integ/modin/groupby/test_groupby_apply.py | 28 +++--- .../modin/groupby/test_groupby_basic_agg.py | 4 +- .../groupby/test_groupby_dataframe_rank.py | 46 ++++------ .../groupby/test_groupby_default2pandas.py | 2 +- .../modin/groupby/test_groupby_head_tail.py | 4 +- .../groupby/test_groupby_idxmax_idxmin.py | 4 +- .../modin/groupby/test_groupby_ngroups.py | 2 +- .../modin/groupby/test_groupby_series.py | 18 ++-- .../modin/groupby/test_groupby_transform.py | 8 +- .../index/test_datetime_index_methods.py | 2 +- .../test_df_series_creation_with_index.py | 4 +- tests/integ/modin/index/test_index_methods.py | 2 +- tests/integ/modin/resample/test_resample.py | 6 +- .../modin/resample/test_resample_fillna.py | 4 +- .../modin/resample/test_resample_negative.py | 2 +- tests/integ/modin/series/test_add_prefix.py | 2 +- tests/integ/modin/series/test_add_suffix.py | 2 +- tests/integ/modin/series/test_all_any.py | 4 +- tests/integ/modin/series/test_at.py | 8 +- .../modin/series/test_bitwise_operators.py | 62 ++++++------- tests/integ/modin/series/test_compare.py | 2 +- tests/integ/modin/series/test_describe.py | 23 ++--- tests/integ/modin/series/test_empty.py | 19 ++-- tests/integ/modin/series/test_iat.py | 4 +- tests/integ/modin/series/test_iloc.py | 30 +++--- tests/integ/modin/series/test_loc.py | 24 ++--- tests/integ/modin/series/test_mask.py | 42 ++++----- .../modin/series/test_nlargest_nsmallest.py | 2 +- tests/integ/modin/series/test_nunique.py | 14 +-- tests/integ/modin/series/test_rank.py | 22 ++--- tests/integ/modin/series/test_rename.py | 6 +- tests/integ/modin/series/test_setitem.py | 22 ++--- tests/integ/modin/series/test_shape.py | 18 ++-- tests/integ/modin/series/test_size.py | 21 ++--- tests/integ/modin/series/test_take.py | 8 +- tests/integ/modin/series/test_to_snowflake.py | 2 +- tests/integ/modin/series/test_transpose.py | 6 +- tests/integ/modin/series/test_where.py | 42 ++++----- tests/integ/modin/test_concat.py | 4 +- tests/integ/modin/test_numpy.py | 6 +- .../modin/types/test_timedelta_indexing.py | 24 ++--- 71 files changed, 437 insertions(+), 492 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 1c8f9f084a..f83c1d7549 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -297,7 +297,7 @@ def __init__( # pd.DataFrame({'a': 1, 'b': 2}, index=[0]) dummy_index = index - if is_scalar(data) and not isinstance(index, type(None)): + if not isinstance(index, (Index, type(self))): dummy_index = index query_compiler = from_pandas( pandas.DataFrame( @@ -309,7 +309,10 @@ def __init__( ) )._query_compiler - if index is not None: + if index is not None and ( + isinstance(index, (Index, Series)) + or isinstance(data, (Index, Series, type(self))) + ): if isinstance(data, (type(self), Series, type(None))): # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index e4f3f4856d..7808f4050a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -411,7 +411,7 @@ def __init__( # If the data is not a Snowpark pandas object, convert it to a query compiler. name = MODIN_UNNAMED_SERIES_LABEL if name is None else name dummy_index = None - if is_scalar(data) and not isinstance(index, type(None)): + if not isinstance(index, (Index, type(self))): dummy_index = index if ( isinstance(data, (native_pd.Series, native_pd.Index)) @@ -431,7 +431,9 @@ def __init__( ) )._query_compiler - if index is not None: + if index is not None and ( + isinstance(index, (Index, type(self))) or isinstance(data, (Index, type(self))) + ): if is_dict_like(data) or isinstance(data, (type(self), type(None))): # The `index` parameter is used to select the rows from `data` that will be in the resultant Series. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py index 9bc8dd9f43..3190751887 100644 --- a/tests/integ/modin/binary/test_binary_op.py +++ b/tests/integ/modin/binary/test_binary_op.py @@ -1871,7 +1871,7 @@ def test_binary_rpow_between_df_and_list_like_on_axis_1(rhs): "rmod", ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_generated_docstring_examples(opname): # test for operators that correct examples are generated and match up with pandas. # if this test passes, this ensures that all the examples generated in utils.py will be correct. diff --git a/tests/integ/modin/frame/test_add_prefix.py b/tests/integ/modin/frame/test_add_prefix.py index 8cf30f4913..5ac652ea92 100644 --- a/tests/integ/modin/frame/test_add_prefix.py +++ b/tests/integ/modin/frame/test_add_prefix.py @@ -46,7 +46,7 @@ def test_df_add_prefix_multiindex(prefix, native_df_with_multiindex_columns): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("prefix", TEST_ADD_PREFIX_DATA) def test_df_add_prefix_time_column_df( prefix, time_column_snowpark_pandas_df, time_column_native_df diff --git a/tests/integ/modin/frame/test_add_suffix.py b/tests/integ/modin/frame/test_add_suffix.py index 0dceff54d7..4fbaf1e319 100644 --- a/tests/integ/modin/frame/test_add_suffix.py +++ b/tests/integ/modin/frame/test_add_suffix.py @@ -46,7 +46,7 @@ def test_df_add_suffix_multiindex(suffix, native_df_with_multiindex_columns): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("suffix", TEST_ADD_SUFFIX_DATA) def test_df_add_suffix_time_column_df( suffix, time_column_snowpark_pandas_df, time_column_native_df diff --git a/tests/integ/modin/frame/test_assign.py b/tests/integ/modin/frame/test_assign.py index f60107057e..8f1e1294e2 100644 --- a/tests/integ/modin/frame/test_assign.py +++ b/tests/integ/modin/frame/test_assign.py @@ -36,7 +36,7 @@ def assign_func(df): eval_snowpark_pandas_result(snow_df, native_df, assign_func) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize( "index", [[2, 1, 0], [4, 5, 6]], ids=["reversed_index", "different_index"] ) @@ -136,7 +136,7 @@ def test_assign_short_series(): assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=2) @pytest.mark.parametrize( "index", [[1, 0], [4, 5]], ids=["reversed_index", "different_index"] ) diff --git a/tests/integ/modin/frame/test_astype.py b/tests/integ/modin/frame/test_astype.py index dbd267b307..8007b264b4 100644 --- a/tests/integ/modin/frame/test_astype.py +++ b/tests/integ/modin/frame/test_astype.py @@ -35,7 +35,7 @@ def test_series_input(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_input_negative(): df = pd.DataFrame({"a": [1, 2, 3], "b": [2.4, 2.5, 3.1]}) with pytest.raises(KeyError, match="not found in columns"): diff --git a/tests/integ/modin/frame/test_at.py b/tests/integ/modin/frame/test_at.py index 9194416648..f43270ff53 100644 --- a/tests/integ/modin/frame/test_at.py +++ b/tests/integ/modin/frame/test_at.py @@ -20,7 +20,7 @@ def test_at_get_default_index_str_columns( ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_default_index_str_columns( default_index_snowpark_pandas_df, default_index_native_df, @@ -44,7 +44,7 @@ def test_at_get_str_index_str_columns( assert str_index_snowpark_pandas_df.at["b", "B"] == str_index_native_df.at["b", "B"] -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_str_index_str_columns( str_index_snowpark_pandas_df, str_index_native_df, @@ -57,7 +57,7 @@ def at_set_helper(df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_at_get_time_index_time_columns( time_index_snowpark_pandas_df, time_index_native_df, @@ -68,7 +68,7 @@ def test_at_get_time_index_time_columns( ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_time_index_time_columns( time_index_snowpark_pandas_df, time_index_native_df, diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index 0fb3fa2c5f..a6a156a05f 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -244,7 +244,7 @@ def test_set_columns_index_name(index_name): ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1) def test_duplicate_labels_assignment(): # Duplicate data labels snow_df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) diff --git a/tests/integ/modin/frame/test_copy.py b/tests/integ/modin/frame/test_copy.py index 7844ca321a..b4c5f4f2a5 100644 --- a/tests/integ/modin/frame/test_copy.py +++ b/tests/integ/modin/frame/test_copy.py @@ -28,7 +28,7 @@ def native_df(snow_df): @pytest.mark.parametrize("deep", [None, True, False]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_copy(deep, snow_df, native_df): # Verify copy is same as original assert_snowpark_pandas_equal_to_pandas(snow_df.copy(deep=deep), native_df) @@ -61,7 +61,7 @@ def test_copy_deep_false_column_names(snow_df): lambda df: df.rename(columns={"a": "new_a"}, inplace=True), ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_copy_inplace_operations_on_deep_copy(snow_df, native_df, operation): snow_df_copy = snow_df.copy(deep=True) operation(snow_df_copy) @@ -79,7 +79,7 @@ def test_copy_inplace_operations_on_deep_copy(snow_df, native_df, operation): lambda df: df.rename(columns={"a": "new_a"}, inplace=True), ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_copy_inplace_operations_on_shallow_copy(snow_df, operation): snow_df_copy = snow_df.copy(deep=False) operation(snow_df_copy) diff --git a/tests/integ/modin/frame/test_drop.py b/tests/integ/modin/frame/test_drop.py index 4dcae76af7..cc1a1a203d 100644 --- a/tests/integ/modin/frame/test_drop.py +++ b/tests/integ/modin/frame/test_drop.py @@ -209,7 +209,7 @@ def test_drop_invalid_labels_axis0_negative( ([], None), # empty labels ], ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_drop_invalid_axis1_labels_errors_ignore(labels, level, multiindex_snow_df): result = multiindex_snow_df.drop(labels, level=level, axis=1, errors="ignore") assert_frame_equal(multiindex_snow_df, result) @@ -231,7 +231,7 @@ def test_drop_invalid_axis1_labels_errors_ignore(labels, level, multiindex_snow_ ([], None), # empty labels ], ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_drop_invalid_axis0_labels_errors_ignore(labels, level, multiindex_snow_df): result = multiindex_snow_df.drop(labels, level=level, errors="ignore") assert_frame_equal(multiindex_snow_df, result) @@ -263,7 +263,7 @@ def test_empty_tuple_multiindex(multiindex_snow_df, axis): assert len(result.index) == 0 -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_drop_preserve_index_names(multiindex_snow_df): df_dropped_e = multiindex_snow_df.drop("red", axis=1) df_inplace_e = multiindex_snow_df.copy() diff --git a/tests/integ/modin/frame/test_dtypes.py b/tests/integ/modin/frame/test_dtypes.py index 49d8abfe2a..c3773bdd6d 100644 --- a/tests/integ/modin/frame/test_dtypes.py +++ b/tests/integ/modin/frame/test_dtypes.py @@ -18,7 +18,7 @@ StringType, VariantType, ) -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_frame_equal, assert_series_equal, @@ -77,7 +77,7 @@ def validate_series_snowpark_dtype(series: pd.Series, snowpark_type: DataType) - ), ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2) def test_integer(dataframe_input, input_dtype, logical_dtype): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input, dtype=input_dtype) @@ -218,7 +218,7 @@ def test_extended_float64_with_nan(): ), ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2) def test_float(dataframe_input, input_dtype, expected_dtype, logical_dtype): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input, dtype=input_dtype) @@ -256,7 +256,7 @@ def test_float(dataframe_input, input_dtype, expected_dtype, logical_dtype): ), ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2) def test_string(dataframe_input, input_dtype, index): expected = native_pd.Series(dataframe_input, dtype=input_dtype) created = pd.Series(dataframe_input) @@ -305,7 +305,7 @@ def test_string_explicit(dataframe_input, input_dtype, index): (["level0"], ["col1", "col2", "col1"]), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1) def test_insert_multiindex_multi_label(label1, label2): arrays = [["apple", "apple", "banana", "banana"], [1, 2, 1, 2]] index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) @@ -452,24 +452,24 @@ def test_empty(input_dtype, expected_dtype, snowpark_dtype, to_pandas_dtype): @pytest.mark.parametrize( - "index, expected_index_dtype, join_count", + "index, expected_index_dtype", [ - (None, np.dtype("int64"), 0), - (native_pd.Index([]), np.dtype("object"), 1), - (native_pd.Index([], dtype="float64"), np.dtype("float64"), 1), + (None, np.dtype("int64")), + (native_pd.Index([]), np.dtype("object")), + (native_pd.Index([], dtype="float64"), np.dtype("float64")), ], ) -def test_empty_index(index, expected_index_dtype, join_count): - with SqlCounter(query_count=1, join_count=join_count): - expected = native_pd.Series(data=[], index=index) - assert expected.dtype == np.dtype("object") - assert expected.index.dtype == expected_index_dtype - created = pd.Series(data=[], index=index) - assert created.dtype == np.dtype("object") - assert created.index.dtype == expected_index_dtype - roundtripped = created.to_pandas() - assert roundtripped.dtype == np.dtype("object") - assert roundtripped.index.dtype == expected_index_dtype +@sql_count_checker(query_count=1) +def test_empty_index(index, expected_index_dtype): + expected = native_pd.Series(data=[], index=index) + assert expected.dtype == np.dtype("object") + assert expected.index.dtype == expected_index_dtype + created = pd.Series(data=[], index=index) + assert created.dtype == np.dtype("object") + assert created.index.dtype == expected_index_dtype + roundtripped = created.to_pandas() + assert roundtripped.dtype == np.dtype("object") + assert roundtripped.index.dtype == expected_index_dtype @pytest.mark.parametrize( diff --git a/tests/integ/modin/frame/test_fillna.py b/tests/integ/modin/frame/test_fillna.py index 6ae668d694..677c8d3ddc 100644 --- a/tests/integ/modin/frame/test_fillna.py +++ b/tests/integ/modin/frame/test_fillna.py @@ -426,7 +426,7 @@ def test_multiindex_df_values_dict_various_levels(test_fillna_multiindex_df): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2) def test_multiindex_df_values_series(test_fillna_multiindex_df, test_fillna_multiindex): values = pd.Series([10, 1, 2, 3], index=test_fillna_multiindex) native_values = native_pd.Series([10, 1, 2, 3], index=test_fillna_multiindex) diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index e08e25513a..76a30f1e68 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -343,7 +343,7 @@ def test_df_getitem_with_slice( slice("z", "a", -1), ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_df_getitem_with_non_int_slice(key): data = {"a": [1, 2, 3], "b": [4, 5, 6]} index = ["x", "y", "z"] diff --git a/tests/integ/modin/frame/test_iat.py b/tests/integ/modin/frame/test_iat.py index dbf3d50759..2191fb8db8 100644 --- a/tests/integ/modin/frame/test_iat.py +++ b/tests/integ/modin/frame/test_iat.py @@ -103,7 +103,7 @@ def iat_set_helper(df): (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=2) def test_iat_get_time_index_time_columns( key, time_index_snowpark_pandas_df, @@ -121,7 +121,7 @@ def test_iat_get_time_index_time_columns( (-7, -7), ], ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_time_index_time_columns( key, time_index_snowpark_pandas_df, diff --git a/tests/integ/modin/frame/test_idxmax_idxmin.py b/tests/integ/modin/frame/test_idxmax_idxmin.py index f9dc28bba9..56159484a2 100644 --- a/tests/integ/modin/frame/test_idxmax_idxmin.py +++ b/tests/integ/modin/frame/test_idxmax_idxmin.py @@ -13,6 +13,7 @@ from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result +@sql_count_checker(query_count=1) @pytest.mark.parametrize( "data, index", [ @@ -73,20 +74,16 @@ def test_idxmax_idxmin_df(data, index, func, axis, skipna): pytest.xfail( "Snowpark pandas returns a Series with None whereas pandas throws a ValueError" ) - with SqlCounter( - query_count=1, - join_count=0 if index is None or (data == {} and index == []) else 1, - ): - eval_snowpark_pandas_result( - *create_test_dfs( - data=data, - index=index, - ), - lambda df: getattr(df, func)(axis=axis, skipna=skipna), - ) + eval_snowpark_pandas_result( + *create_test_dfs( + data=data, + index=index, + ), + lambda df: getattr(df, func)(axis=axis, skipna=skipna), + ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize( "data, index", [ @@ -217,7 +214,7 @@ def test_idxmax_idxmin_with_timedelta(func, axis): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax_idxmin_with_strings(func, axis): diff --git a/tests/integ/modin/frame/test_iloc.py b/tests/integ/modin/frame/test_iloc.py index d8b874b1cf..d9bb4c1bc8 100644 --- a/tests/integ/modin/frame/test_iloc.py +++ b/tests/integ/modin/frame/test_iloc.py @@ -118,7 +118,7 @@ ("RangeIndex", 0), ("Index[bool]", 1), ("emptyFloatSeries", 2), - ("multi_index_Series", 6), + ("multi_index_Series", 2), ] # Snowflake type checking will fail if the item values aren't type compatible, so we normalize to int to stay compatible. @@ -315,10 +315,7 @@ def eval_func(df): if key == "RangeIndex": expected_query_count = 1 - with SqlCounter( - query_count=expected_query_count, - join_count=4 if key == "multi_index_Series" else 0, - ): + with SqlCounter(query_count=expected_query_count): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, eval_func ) @@ -448,7 +445,7 @@ def test_df_iloc_get_diff2native( ) -@sql_count_checker(query_count=2, join_count=8) +@sql_count_checker(query_count=2, join_count=4) def test_df_iloc_get_with_conflict(): # index and data columns have conflict in get_by_col df = DataFrame({"A": [0, 1]}, index=native_pd.Index([2, 3], name="A")).rename( @@ -2622,31 +2619,31 @@ def perform_iloc(df): @pytest.mark.parametrize( - "row_key, row_key_index, row_add_joins", + "row_key, row_key_index", [ - [1, None, 0], - [[3, 0], None, 0], - [[1, 2], [("A",), ("B",)], 1], - [[2, 1], [("A", 1), ("B", 2)], 2], + [1, None], + [[3, 0], None], + [[1, 2], [("A",), ("B",)]], + [[2, 1], [("A", 1), ("B", 2)]], ], ) @pytest.mark.parametrize( - "col_key, col_key_index, col_add_joins", + "col_key, col_key_index", [ - [2, None, 0], - [[2, 1], None, 0], - [[1, 2], [("X",), ("Y",)], 1], - [[2, 1], [("X", 11), ("Y", 21)], 2], + [2, None], + [[2, 1], None], + [[1, 2], [("X",), ("Y",)]], + [[2, 1], [("X", 11), ("Y", 21)]], ], ) @pytest.mark.parametrize( "item_values, item_index, item_columns, expected_join_count", [ - [999, None, None, 6], - [TEST_ITEMS_DATA_2X2, None, None, 7], - [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], None, 9], - [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], [("e", 5), ("f", 6)], 9], - [TEST_ITEMS_DATA_2X2, None, [("e", 5), ("f", 6)], 7], + [999, None, None, 2], + [TEST_ITEMS_DATA_2X2, None, None, 3], + [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], None, 5], + [TEST_ITEMS_DATA_2X2, [("r", 20), ("s", 25)], [("e", 5), ("f", 6)], 5], + [TEST_ITEMS_DATA_2X2, None, [("e", 5), ("f", 6)], 3], ], ) def test_df_iloc_set_with_multiindex( @@ -2658,8 +2655,6 @@ def test_df_iloc_set_with_multiindex( item_index, item_columns, expected_join_count, - row_add_joins, - col_add_joins, ): df_data = [ [1, 2, 3, 4, 5], @@ -2715,6 +2710,7 @@ def test_df_iloc_set_with_multiindex( native_items.columns = pd.MultiIndex.from_tuples(item_columns) if row_key_index: + # Using native pandas index since row_key[2] is a MultiIndex object. snow_row_key = pd.Series(row_key, index=native_pd.Index(row_key_index)) native_row_key = native_pd.Series(row_key, index=native_pd.Index(row_key_index)) else: @@ -2722,6 +2718,7 @@ def test_df_iloc_set_with_multiindex( native_row_key = row_key if col_key_index: + # Using native pandas index since col_key[2] is a MultiIndex object. snow_col_key = pd.Series(col_key, index=native_pd.Index(col_key_index)) native_col_key = native_pd.Series(col_key, index=native_pd.Index(col_key_index)) else: @@ -2738,7 +2735,6 @@ def helper_iloc(df): if isinstance(snow_col_key, pd.Series): expected_query_count += 1 - expected_join_count += row_add_joins + col_add_joins with SqlCounter(query_count=expected_query_count, join_count=expected_join_count): eval_snowpark_pandas_result(snow_df, native_df, helper_iloc, inplace=True) @@ -2814,7 +2810,7 @@ def iloc_helper(df: Union[pd.DataFrame, native_pd.DataFrame]) -> None: # For a Series row key, the key is joined with the df to derive the iloc results. For column keys, a select # statement is used instead of a join. - join_count = 4 if axis == "row" else 2 + join_count = 2 if axis == "row" else 0 query_count = 1 if axis == "row" else 2 # Evaluate with MultiIndex created from tuples. diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index c7a1c980c9..414889d337 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -277,13 +277,13 @@ def test_insert_loc_negative(native_df, loc, expected_query_count): @pytest.mark.parametrize( "value, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "d"]), 2, 5), # numpy array of shape (N,) - (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 5), # numpy array of shape (N, 1) - (["a", "b", "c", "d"], 2, 5), # python list - (("a", "b", "c", "d"), 2, 5), # python tuple - ({(3, 1): 1}, 1, 3), # python dict - ("abc", 1, 2), # sting scalar - (1, 1, 2), # int scalar + (np.array(["a", "b", "c", "d"]), 2, 1), # numpy array of shape (N,) + (np.array([["a"], ["b"], ["c"], ["d"]]), 2, 1), # numpy array of shape (N, 1) + (["a", "b", "c", "d"], 2, 1), # python list + (("a", "b", "c", "d"), 2, 1), # python tuple + ({(3, 1): 1}, 1, 1), # python dict + ("abc", 1, 0), # sting scalar + (1, 1, 0), # int scalar ], ) def test_insert_multiindex_array_like_and_scalar( @@ -310,7 +310,7 @@ def test_insert_multiindex_array_like_and_scalar( ("a", "b", "c", "d"), # python tuple ], ) -@sql_count_checker(query_count=2, join_count=5) +@sql_count_checker(query_count=2, join_count=1) def test_insert_empty_multiindex_frame(value): mi = pd.MultiIndex.from_arrays([np.array([], dtype=int), np.array([], dtype=int)]) snow_df = pd.DataFrame([], index=mi) @@ -344,61 +344,55 @@ def test_insert_multiindex_dict_negative(): @pytest.mark.parametrize( - "df_index, value_index, join_count", + "df_index, value_index", [ - ([3, 0, 4], [1, 2, 3], 6), - ([(1, 0), (1, 2), (2, 2)], [(1, 1), (1, 2), (2, 2)], 11), - ([1.0, 2.5, 3.0], [1, 2, 3], 6), # Long and Double can be joined + ([3, 0, 4], [1, 2, 3]), + ([(1, 0), (1, 2), (2, 2)], [(1, 1), (1, 2), (2, 2)]), + ([1.0, 2.5, 3.0], [1, 2, 3]), # Long and Double can be joined ], ) -def test_insert_compatible_index(df_index, value_index, join_count): +@sql_count_checker(query_count=4, join_count=1) +def test_insert_compatible_index(df_index, value_index): snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) value = pd.DataFrame({"col2": ["x", "y", "z"]}, index=native_pd.Index(value_index)) - with SqlCounter(query_count=4, join_count=join_count): - eval_snowpark_pandas_result( - snow_df, - snow_df.to_pandas(), - lambda df: df.insert( - 0, "col3", value if isinstance(df, pd.DataFrame) else value.to_pandas() - ), - inplace=True, # insert operation is always inplace - ) + eval_snowpark_pandas_result( + snow_df, + snow_df.to_pandas(), + lambda df: df.insert( + 0, "col3", value if isinstance(df, pd.DataFrame) else value.to_pandas() + ), + inplace=True, # insert operation is always inplace + ) @pytest.mark.parametrize( - "df_index, value_index, join_count", + "df_index, value_index", [ - ([3, 2, 1], [(1, 0, 1), (1, 2, 3), (2, 1, 0)], 3), # length mismatch 1 != 3 + ([3, 2, 1], [(1, 0, 1), (1, 2, 3), (2, 1, 0)]), # length mismatch 1 != 3 ( [(3, 1), (2, 1), (1, 2)], [(1, 0, 1), (1, 2, 3), (2, 1, 0)], - 3, ), # length mismatch 2 != 3 - ([1, 2, 3], [(1, 0), (1, 2), (2, 2)], 2), # 1 != 2 - ([(1, 0), (1, 2), (2, 2)], [(1, 2, 3), (3, 4, 5), (6, 5, 4)], 3), # 2 != 3 - ([(1, 2, 3), (3, 4, 5), (6, 5, 4)], [3, 1, 2], 1), # length mismatch 3 != 1 + ([1, 2, 3], [(1, 0), (1, 2), (2, 2)]), # 1 != 2 + ([(1, 0), (1, 2), (2, 2)], [(1, 2, 3), (3, 4, 5), (6, 5, 4)]), # 2 != 3 + ([(1, 2, 3), (3, 4, 5), (6, 5, 4)], [3, 1, 2]), # length mismatch 3 != 1 ( [(1, 1), (1, 2), (2, 2)], ["(1, 0)", "(1, 2)", "(2, 2)"], - 1, ), # length and type mismatch ], ) -def test_insert_index_num_levels_mismatch_negative(df_index, value_index, join_count): - with SqlCounter(query_count=1, join_count=join_count): - snow_df = pd.DataFrame( - {"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index) - ) - value = pd.DataFrame( - {"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index) - ) - # This is different behavior from native pandas. Native pandas in some cases - # insert new column with null values but in Snowpark pandas we always raise error. - with pytest.raises( - ValueError, - match="Number of index levels of inserted column are different from frame index", - ): - snow_df.insert(0, "col3", value) +@sql_count_checker(query_count=1) +def test_insert_index_num_levels_mismatch_negative(df_index, value_index): + snow_df = pd.DataFrame({"col1": ["p", "q", "r"]}, index=native_pd.Index(df_index)) + value = pd.DataFrame({"col2": ["w", "x", "y"]}, index=native_pd.Index(value_index)) + # This is different behavior from native pandas. Native pandas in some cases + # insert new column with null values but in Snowpark pandas we always raise error. + with pytest.raises( + ValueError, + match="Number of index levels of inserted column are different from frame index", + ): + snow_df.insert(0, "col3", value) @pytest.mark.parametrize( @@ -413,7 +407,7 @@ def test_insert_index_num_levels_mismatch_negative(df_index, value_index, join_c ), # type mismatch boolean != long ], ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2, join_count=1) def test_insert_index_type_mismatch(df_index, value_index, expected_index): # Note: This is different behavior than native pandas. In native pandas when # index datatype mismatch new columns in inserted will all NULL values. @@ -430,7 +424,7 @@ def test_insert_index_type_mismatch(df_index, value_index, expected_index): assert_snowpark_pandas_equal_to_pandas(snow_df, expected_df) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_insert_with_null_index_values(): snow_df = pd.DataFrame( {"A": ["p", "q", "r", "s"]}, native_pd.Index(["a", None, "b", None]) @@ -446,7 +440,7 @@ def test_insert_with_null_index_values(): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_insert_multiple_null(): snow_df = pd.DataFrame( {"A": ["p", "q", "r", "s"]}, native_pd.Index(["a", "b", "c", "d"]) @@ -471,8 +465,8 @@ def test_insert_multiple_null(): @pytest.mark.parametrize( "index, value, expected_query_count, expected_join_count", [ - ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 3), - ([1, 2], [3, 4], 2, 3), + ([1, 2], native_pd.Series([1, 2], index=[2, 3]), 1, 1), + ([1, 2], [3, 4], 2, 1), ], ) def test_insert_into_empty_dataframe_with_index( diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index d75b16658d..33c1fb98e5 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -215,7 +215,7 @@ def test_df_loc_get_col_non_boolean_key( "key", boolean_indexer, ) -@sql_count_checker(query_count=3, join_count=1) +@sql_count_checker(query_count=3) def test_df_loc_get_col_boolean_indexer( key, str_index_snowpark_pandas_df, str_index_native_df ): @@ -243,7 +243,7 @@ def test_df_loc_get_col_boolean_indexer( "key", list_like_time_col_inputs, ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_df_loc_get_col_time_df( key, time_column_snowpark_pandas_df, time_column_native_df ): @@ -258,7 +258,7 @@ def test_df_loc_get_col_time_df( "key", snowpark_pandas_int_index_row_inputs, ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_get_int_index_row_snowpark_pandas_input( key, default_index_snowpark_pandas_df, @@ -606,7 +606,7 @@ def test_mi_df_loc_get_non_boolean_list_tuple_key(mi_table_df, row, col): ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2, join_count=2) def test_mi_df_loc_get_boolean_series_row_key(mi_table_df): df = pd.DataFrame(mi_table_df) bool_indexer = [False, True, True, False, False, True] @@ -639,7 +639,7 @@ def test_mi_df_loc_get_boolean_series_row_key(mi_table_df): ) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=3) def test_mi_df_loc_get_boolean_series_col_key(mi_table_df): df = pd.DataFrame(mi_table_df) bool_indexer = [False, True] @@ -1448,9 +1448,11 @@ def helper(df): snow_df.to_pandas() else: expected_query_count = 1 - expected_join_count = 2 + expected_join_count = 1 if key == slice(None): expected_join_count = 0 + elif isinstance(key, slice) and key.step == 2: + expected_join_count += 1 with SqlCounter( query_count=expected_query_count, join_count=expected_join_count @@ -1680,7 +1682,7 @@ def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index) native_df = native_pd.DataFrame( {"c1": [1, 2, 3, 4, 5], "c2": ["x", "y", "z", "d", "e"]}, index=index ) - with SqlCounter(query_count=1, join_count=1 if use_default_index else 2): + with SqlCounter(query_count=1, join_count=1): snow_df = pd.DataFrame(native_df) eval_snowpark_pandas_result( snow_df, @@ -1699,7 +1701,7 @@ def test_df_loc_get_key_bool_series_with_aligned_indices(key, use_default_index) [random.choice([True, False]) for _ in range(5)], ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_get_key_bool_series_with_unaligned_and_distinct_indices( key, use_default_index ): @@ -1776,7 +1778,7 @@ def test_df_loc_get_key_bool_series_with_unaligned_and_duplicate_indices(): ], # larger length ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_get_key_bool_series_with_mismatch_index_len(key, use_default_index): if use_default_index: index = None @@ -2404,7 +2406,7 @@ def loc_set_helper(df): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_set_scalar_row_key_enlargement( row_key, col_key, item_values, data_index ): @@ -2476,7 +2478,7 @@ def set_loc_helper(df): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_set_scalar_row_key_enlargement_deviates_from_native_pandas( row_key, col_key, item_values, data_index ): @@ -3201,7 +3203,7 @@ def test_df_loc_set_boolean_series_with_non_default_index_key_and_scalar_item(): ["duplicate", [1, 1, 2, 3]], ], ) -@sql_count_checker(query_count=1, join_count=5) +@sql_count_checker(query_count=1, join_count=4) def test_df_loc_set_duplicate_index( self_index_type, self_index_val, index, columns, item ): @@ -3782,7 +3784,7 @@ def loc_set_helper(df): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_setitem_boolean_key(key, index): item = 99 num_columns = 3 @@ -3860,7 +3862,7 @@ def test_df_single_value_with_slice_key(): eval_snowpark_pandas_result(snowpark_df, native_df, lambda df: df.loc[0:1]) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_loc_set_none(): native_df = native_pd.DataFrame({"a": [1, 2, 3]}) @@ -3883,7 +3885,7 @@ def loc_set_helper(df): ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=3) def test_df_loc_set_with_index_and_column_labels(): """ Create a DataFrame using 3 Series objects and perform loc set with a scalar. diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 7b47880557..53afbd7bf8 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -864,7 +864,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -909,7 +909,7 @@ def perform_mask(df): ) -@sql_count_checker(query_count=2, join_count=5, union_count=1) +@sql_count_checker(query_count=2, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 15ad41a580..8b9b5472e3 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -302,7 +302,7 @@ def test_merge_on_index_columns(left_df, right_df, how, on, sort): @pytest.mark.parametrize("index1", [[3, 4], [1.5, 8.0], [None, None]]) @pytest.mark.parametrize("index2", [[7, 8], [1.5, 3.0], [None, None]]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_join_type_mismatch(index1, index2): df1 = pd.DataFrame({"A": [1, 2]}, index=index1) df2 = pd.DataFrame({"B": [3, 4]}, index=index2) @@ -351,7 +351,7 @@ def test_join_type_mismatch_negative(index1, index2): ), ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_join_type_mismatch_diff_with_native_pandas(index1, index2, expected_res): df1 = pd.DataFrame({"A": [1, 2]}, index=index1) df2 = pd.DataFrame({"B": [3, 4]}, index=index2) @@ -960,7 +960,7 @@ def test_merge_no_join_keys_negative(left_name, right_name, left_df, right_df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_no_join_keys_common_index_negative(left_df, right_df): left_df = pd.DataFrame({"A": [1, 2, 3]}, native_pd.Index([7, 8, 9], name="KEY")) right_df = pd.DataFrame({"B": [1, 2, 3]}, native_pd.Index([7, 8, 9], name="KEY")) diff --git a/tests/integ/modin/frame/test_nunique.py b/tests/integ/modin/frame/test_nunique.py index 6fd1751e3a..d0cad8ec2a 100644 --- a/tests/integ/modin/frame/test_nunique.py +++ b/tests/integ/modin/frame/test_nunique.py @@ -85,12 +85,12 @@ def test_dataframe_nunique_no_columns(native_df): ), ], ) +@sql_count_checker(query_count=1) def test_dataframe_nunique_multiindex(index, columns): - with SqlCounter(query_count=1, join_count=0 if index is None else 2): - eval_snowpark_pandas_result( - *create_test_dfs(TEST_DATA, index=index, columns=columns), - lambda df: df.nunique(axis=0), - ) + eval_snowpark_pandas_result( + *create_test_dfs(TEST_DATA, index=index, columns=columns), + lambda df: df.nunique(axis=0), + ) @sql_count_checker(query_count=0) diff --git a/tests/integ/modin/frame/test_rank.py b/tests/integ/modin/frame/test_rank.py index 05fa47b99b..1687ce4905 100644 --- a/tests/integ/modin/frame/test_rank.py +++ b/tests/integ/modin/frame/test_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -40,6 +40,7 @@ ] +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -55,16 +56,13 @@ ) # test df.rank with all method, na_option, ascending parameter combinations def test_df_rank(data, index, method, ascending, na_option): - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 - ): - snow_df = pd.DataFrame(data, index=index) - native_df = native_pd.DataFrame(data, index=index) - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.rank(method=method, na_option=na_option, ascending=ascending), - ) + snow_df = pd.DataFrame(data, index=index) + native_df = native_pd.DataFrame(data, index=index) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.rank(method=method, na_option=na_option, ascending=ascending), + ) @sql_count_checker(query_count=1) @@ -120,6 +118,7 @@ def test_rank_unsupported_args_negative(method, ascending, na_option): snow_df.rank(axis=1, method=method, ascending=ascending, na_option=na_option) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -135,15 +134,10 @@ def test_rank_unsupported_args_negative(method, ascending, na_option): ) # test df percentile rank def test_df_rank_pct(data, index, method, ascending, na_option): - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 - ): - snow_df = pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - native_df = native_pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( - snow_df, native_df - ) + snow_df = pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + native_df = native_pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index 1f7a7e3966..98d0a41e7a 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -454,7 +454,7 @@ def test_reindex_columns_fill_method_with_old_na_values_negative( lambda df: df.reindex(columns=list("CEBFGA"), method=method), ) - @sql_count_checker(query_count=5, join_count=1) + @sql_count_checker(query_count=5) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_columns_datetime_with_fill(self, limit, method): @@ -495,7 +495,7 @@ def test_reindex_columns_non_overlapping_columns(self): snow_df, native_df, lambda df: df.reindex(axis=1, labels=list("EFG")) ) - @sql_count_checker(query_count=5, join_count=1) + @sql_count_checker(query_count=5) def test_reindex_columns_non_overlapping_datetime_columns(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( @@ -520,7 +520,7 @@ def perform_reindex(df): snow_df, native_df, perform_reindex, check_freq=False ) - @sql_count_checker(query_count=2, join_count=1) + @sql_count_checker(query_count=2) def test_reindex_columns_non_overlapping_different_types_columns(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index 15351ec6fa..a5595ec716 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -294,7 +294,7 @@ def test_rename_objects(self, snow_float_string_frame): assert "FOO" in renamed assert "foo" not in renamed - @sql_count_checker(query_count=6, join_count=8) + @sql_count_checker(query_count=6, join_count=2) def test_rename_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) @@ -420,7 +420,7 @@ def test_rename_mapper_and_positional_arguments_raises(self): with pytest.raises(TypeError, match=msg): df.rename({}, columns={}, index={}) - @sql_count_checker(query_count=1, join_count=5) + @sql_count_checker(query_count=1, join_count=1) def test_rename_with_duplicate_columns(self): # GH#4403 df4 = DataFrame( diff --git a/tests/integ/modin/frame/test_repr.py b/tests/integ/modin/frame/test_repr.py index f499146806..2109bdccb5 100644 --- a/tests/integ/modin/frame/test_repr.py +++ b/tests/integ/modin/frame/test_repr.py @@ -227,7 +227,7 @@ def test_repr_deviating_behavior(): assert native_str[:N] == snow_str[:N] -@sql_count_checker(query_count=2, union_count=1, join_count=6) +@sql_count_checker(query_count=2, union_count=1) def test_repr_of_multiindex_df(): tuples = [ ("cobra", "mark i"), diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 6bbdc30fa0..6152089f39 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -141,7 +141,7 @@ def setitem(df): else: df[key] = val - expected_join_count = 6 if isinstance(key.start, int) else 7 + expected_join_count = 3 if isinstance(key.start, int) else 4 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result(snow_df, native_df, setitem, inplace=True) @@ -361,7 +361,9 @@ def func_insert_new_column(df, column): df[key] = column expected_join_count = 2 - if isinstance(column, native_pd.Index) and not isinstance( + if isinstance(column, native_pd.Series): + expected_join_count = 1 + elif isinstance(column, native_pd.Index) and not isinstance( column, native_pd.DatetimeIndex ): expected_join_count = 4 diff --git a/tests/integ/modin/frame/test_stack.py b/tests/integ/modin/frame/test_stack.py index 80c437dea7..9b06c32ff0 100644 --- a/tests/integ/modin/frame/test_stack.py +++ b/tests/integ/modin/frame/test_stack.py @@ -20,7 +20,7 @@ ) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_stack(data, index, columns, dropna, sort): eval_snowpark_pandas_result( *create_test_dfs(data=data, index=index, columns=columns), diff --git a/tests/integ/modin/frame/test_transpose.py b/tests/integ/modin/frame/test_transpose.py index 469a66dd51..894bbbbc1b 100644 --- a/tests/integ/modin/frame/test_transpose.py +++ b/tests/integ/modin/frame/test_transpose.py @@ -242,7 +242,7 @@ def test_dataframe_transpose_preserve_float_dtypes(): assert all([dtype == "float64" for dtype in snow_df.T.dtypes]) -@sql_count_checker(query_count=1, union_count=1, join_count=2) +@sql_count_checker(query_count=1, union_count=1) def test_dataframe_transpose_single_numeric_column(): single_column_data = ({0: "A", 1: "B", 2: "C", 3: "D"},) native_df = native_pd.DataFrame(single_column_data, index=(0,)) diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 75a5d6db7a..006b7e76fb 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -902,7 +902,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2, join_count=3) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], @@ -947,7 +947,7 @@ def perform_where(df): ) -@sql_count_checker(query_count=2, join_count=5, union_count=1) +@sql_count_checker(query_count=2, join_count=3, union_count=1) @pytest.mark.parametrize( "data", [[10], [10, 11, 12], [10, 11, 12, 13]], diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index 7c43b00a7b..e83fcbe00b 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -191,7 +191,7 @@ class TestFuncReturnsDataFrame: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_group_by_one_column_and_one_level_with_default_kwargs( self, grouping_dfs_with_multiindexes, func @@ -206,7 +206,7 @@ def test_group_by_one_column_and_one_level_with_default_kwargs( @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_df_with_default_index(self, grouping_dfs_with_multiindexes): eval_snowpark_pandas_result( @@ -232,7 +232,7 @@ def test_func_returns_empty_frame(self): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): def func(df, num1, str1): @@ -258,7 +258,7 @@ def func(df, num1, str1): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_group_by_level(self, grouping_dfs_with_multiindexes, level): eval_snowpark_pandas_result( @@ -281,7 +281,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: # When dropna=False, we can skip the dropna query query_count=4, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ): snow_result = operation(snow_df) pandas_result = operation(pandas_df) @@ -332,7 +332,7 @@ def test_group_dataframe_with_column_of_all_nulls_snow_1233832(self, null_value) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) @pytest.mark.parametrize( "by, expected_output", @@ -417,7 +417,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) @pytest.mark.parametrize("by", ["level_0", ("a", "string_col_1")]) @pytest.mark.parametrize( @@ -444,7 +444,7 @@ def test_as_index_false(self, grouping_dfs_with_multiindexes, by, func): # transform because we only reindex to the original ordering if query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_group_keys_false(self, grouping_dfs_with_multiindexes, as_index): eval_snowpark_pandas_result( @@ -598,7 +598,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: @sql_count_checker( # we need a transform check because group_keys=False. query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, - join_count=3, + join_count=JOIN_COUNT, udtf_count=UDTF_COUNT, ) def test_apply_transfform_to_subset( @@ -631,7 +631,7 @@ def test_apply_transfform_to_subset( ) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, - join_count=3, + join_count=JOIN_COUNT, udtf_count=UDTF_COUNT, ) def test_numpy_ints_in_result(self, grouping_dfs_with_multiindexes, result): @@ -800,7 +800,7 @@ def test_root_mean_squared_error(self): @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_multiindex_df(self, grouping_dfs_with_multiindexes, by, sort, as_index): eval_snowpark_pandas_result( @@ -836,7 +836,7 @@ def test_multiindex_df(self, grouping_dfs_with_multiindexes, by, sort, as_index) @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_non_series_or_dataframe_return_types( self, return_value, grouping_dfs_with_multiindexes @@ -918,7 +918,7 @@ class TestFuncReturnsSeries: @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_return_series_with_two_columns( self, grouping_dfs_with_multiindexes, by, level, as_index, sort, group_keys @@ -943,7 +943,7 @@ def test_return_series_with_two_columns( @sql_count_checker( query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=3, + join_count=JOIN_COUNT, ) def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): eval_snowpark_pandas_result( diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index f3002901d0..d4211f2a41 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -951,8 +951,8 @@ def test_groupby_with_level(df_multi, level): ) -@sql_count_checker(query_count=1, join_count=2) -def test_groupby_with_hier_columns(): +@sql_count_checker(query_count=1) +def test_groupby_with_higher_columns(): tuples = list( zip( *[ diff --git a/tests/integ/modin/groupby/test_groupby_dataframe_rank.py b/tests/integ/modin/groupby/test_groupby_dataframe_rank.py index 78443c3bbf..3bb4a4b455 100644 --- a/tests/integ/modin/groupby/test_groupby_dataframe_rank.py +++ b/tests/integ/modin/groupby/test_groupby_dataframe_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -211,6 +211,7 @@ ] +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -232,18 +233,16 @@ def test_df_groupby_rank(data, index, method, ascending, na_option, dropna): snow_df = pd.DataFrame(data, index=index) native_df = native_pd.DataFrame(data, index=index) - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 - ): - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.groupby("group", dropna=dropna).rank( - method=method, na_option=na_option, ascending=ascending - ), - ) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.groupby("group", dropna=dropna).rank( + method=method, na_option=na_option, ascending=ascending + ), + ) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -273,14 +272,10 @@ def test_df_rank_pct(data, index, method, ascending, na_option, dropna): .groupby("group", dropna=dropna) .rank(method=method, ascending=ascending, na_option=na_option, pct=True) ) - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 - ): - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( - snow_df, native_df - ) + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA_MUL) @pytest.mark.parametrize( "method", @@ -298,16 +293,13 @@ def test_df_rank_pct(data, index, method, ascending, na_option, dropna): def test_df_groupby_rank_by_list(data, index, method, ascending, na_option): snow_df = pd.DataFrame(data, index=index) native_df = native_pd.DataFrame(data, index=index) - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, pd.MultiIndex) else 0 - ): - eval_snowpark_pandas_result( - snow_df, - native_df, - lambda df: df.groupby(["group", "a"]).rank( - method=method, na_option=na_option, ascending=ascending - ), - ) + eval_snowpark_pandas_result( + snow_df, + native_df, + lambda df: df.groupby(["group", "a"]).rank( + method=method, na_option=na_option, ascending=ascending + ), + ) @pytest.mark.parametrize( diff --git a/tests/integ/modin/groupby/test_groupby_default2pandas.py b/tests/integ/modin/groupby/test_groupby_default2pandas.py index 74aac8f77c..49d45a1009 100644 --- a/tests/integ/modin/groupby/test_groupby_default2pandas.py +++ b/tests/integ/modin/groupby/test_groupby_default2pandas.py @@ -124,7 +124,7 @@ def test_groupby_with_numpy_array(basic_snowpark_pandas_df) -> None: "by_list", [[2, 1, 1, 2, 3, 3], [[2, 1, 1, 2, 3, 3], "a"]], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1) def test_groupby_series_with_numpy_array(series_multi_numeric, by_list) -> None: with pytest.raises( NotImplementedError, match=AGGREGATE_UNSUPPORTED_GROUPING_ERROR_PATTERN diff --git a/tests/integ/modin/groupby/test_groupby_head_tail.py b/tests/integ/modin/groupby/test_groupby_head_tail.py index d462b89150..90819ec2d6 100644 --- a/tests/integ/modin/groupby/test_groupby_head_tail.py +++ b/tests/integ/modin/groupby/test_groupby_head_tail.py @@ -45,7 +45,7 @@ class TestDataFrameGroupByHeadTail: ["lion", 1234, 456, 78, 9], ] - @sql_count_checker(query_count=1, join_count=1) + @sql_count_checker(query_count=1) def test_df_groupby_head_tail(self, op_type, n, dropna, as_index, sort, group_keys): """ Test DataFrameGroupBy.head and DataFrameGroupBy.tail with a small df with no NA values. @@ -66,7 +66,7 @@ def test_df_groupby_head_tail(self, op_type, n, dropna, as_index, sort, group_ke check_index_type=False, ) - @sql_count_checker(query_count=6, join_count=1) + @sql_count_checker(query_count=6) def test_df_groupby_head_tail_large_data( self, op_type, n, dropna, as_index, sort, group_keys, large_df_with_na_values ): diff --git a/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py b/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py index e87b6327bc..ec1e36d1e3 100644 --- a/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py +++ b/tests/integ/modin/groupby/test_groupby_idxmax_idxmin.py @@ -20,7 +20,7 @@ @pytest.mark.parametrize("grouping_columns", ["B", ["A", "B"]]) @pytest.mark.parametrize("skipna", [False, True]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_df_groupby_idxmax_idxmin_on_axis_0( df_with_multiple_columns, grouping_columns, skipna, func ): @@ -73,7 +73,7 @@ def test_df_groupby_idxmax_idxmin_on_axis_1_negative(df_with_multiple_columns, f @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("numeric_only", [True, False]) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_groupby_idxmax_idxmin_with_different_column_dtypes_on_axis_0( func, numeric_only ): diff --git a/tests/integ/modin/groupby/test_groupby_ngroups.py b/tests/integ/modin/groupby/test_groupby_ngroups.py index 6216c4c223..332e4c88eb 100644 --- a/tests/integ/modin/groupby/test_groupby_ngroups.py +++ b/tests/integ/modin/groupby/test_groupby_ngroups.py @@ -17,7 +17,7 @@ def assert_ngroups_equal(snow_res, pd_res): @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_groupby_sort_multiindex_series(series_multi_numeric, by): snow_ser = series_multi_numeric diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py index 10dd08b6fd..7756f8b620 100644 --- a/tests/integ/modin/groupby/test_groupby_series.py +++ b/tests/integ/modin/groupby/test_groupby_series.py @@ -19,14 +19,14 @@ @pytest.mark.parametrize("by", ["a", ["b"], ["a", "b"]]) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_groupby_sort_multiindex_series(series_multi_numeric, agg_method, by): native_mseries_group = series_multi_numeric.to_pandas().groupby(by=by, sort=True) mseries_group = series_multi_numeric.groupby(by=by, sort=True) eval_snowpark_pandas_result(mseries_group, native_mseries_group, agg_method) -@sql_count_checker(query_count=3, join_count=6) +@sql_count_checker(query_count=3) def test_groupby_sort_false_multiindex_series(series_multi_numeric): # it is known that groupby sort=False is buggy with multiIndex, it is always # sorting when only part of the level is used. @@ -48,7 +48,7 @@ def test_groupby_sort_false_multiindex_series(series_multi_numeric): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_groupby_series_count_with_nan(): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -75,7 +75,7 @@ def test_groupby_series_count_with_nan(): ], ) @pytest.mark.parametrize("sort", [True, False]) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_groupby_agg_series(agg_func, sort): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -113,7 +113,7 @@ def test_groupby_agg_series_dict_func_negative(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize( "agg_func, type_str", [({"x": ("y", "sum")}, "tuple"), ({"x": pd.NamedAgg("y", "sum")}, "NamedAgg")], @@ -139,7 +139,7 @@ def test_groupby_agg_series_raises_for_2_tuple_agg(agg_func, type_str): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("aggs", [{"minimum": min}, {"minimum": min, "maximum": max}]) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_groupby_agg_series_named_agg(aggs, sort): index = native_pd.Index(["a", "b", "b", "a", "c"]) index.names = ["grp_col"] @@ -153,7 +153,7 @@ def test_groupby_agg_series_named_agg(aggs, sort): @pytest.mark.parametrize("numeric_only", [False, None]) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_groupby_series_numeric_only(series_str, numeric_only): native_series = series_str.to_pandas() eval_snowpark_pandas_result( @@ -164,7 +164,7 @@ def test_groupby_series_numeric_only(series_str, numeric_only): @pytest.mark.parametrize("level", [0, 1, [1, 0], "b", [1, 1], [0, "b"], [-1]]) -@sql_count_checker(query_count=2, join_count=4) +@sql_count_checker(query_count=2) def test_groupby_sort_multiindex_series_level(series_multi_numeric, level): native_series = series_multi_numeric.to_pandas() @@ -173,7 +173,7 @@ def test_groupby_sort_multiindex_series_level(series_multi_numeric, level): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_groupby_series_single_index(): snow_ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) native_ser = native_pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py index 46ef42f4f4..5f2339f2e4 100644 --- a/tests/integ/modin/groupby/test_groupby_transform.py +++ b/tests/integ/modin/groupby/test_groupby_transform.py @@ -39,7 +39,7 @@ def test_dataframe_groupby_transform( # temporary function's resultant table. # - A second join is performed only when the groupby object specifies dropna=True. # This is because a loc set operation is being performed to correctly set NA values. - with SqlCounter(query_count=6, join_count=2 + (2 if dropna else 0), udtf_count=1): + with SqlCounter(query_count=6, join_count=1 + (1 if dropna else 0), udtf_count=1): eval_snowpark_pandas_result( *df_with_multiple_columns, lambda df: df.groupby( @@ -85,11 +85,11 @@ def test_dataframe_groupby_transform_with_func_args_and_kwargs( Test DataFrameGroupby.transform with functions that require *args and **kwargs. """ # - A UDTF is created to run `groupby.transform(func)` on every group via `apply`. - # - Two joins always occurs when joining the original DataFrame's table with the + # - One join always occurs when joining the original DataFrame's table with the # temporary function's resultant table. - # - Another two joins are performed only when the groupby object specifies dropna=True. + # - A second join is performed only when the groupby object specifies dropna=True. # This is because a loc set operation is being performed to correctly set NA values. - with SqlCounter(query_count=6, join_count=2 + (2 if dropna else 0), udtf_count=1): + with SqlCounter(query_count=6, join_count=1 + (1 if dropna else 0), udtf_count=1): eval_snowpark_pandas_result( *df_with_multiple_columns, lambda df: df.groupby( diff --git a/tests/integ/modin/index/test_datetime_index_methods.py b/tests/integ/modin/index/test_datetime_index_methods.py index b727b4750e..56fd40a6cb 100644 --- a/tests/integ/modin/index/test_datetime_index_methods.py +++ b/tests/integ/modin/index/test_datetime_index_methods.py @@ -89,7 +89,7 @@ def test_non_default_args(kwargs): pd.DatetimeIndex(query_compiler=idx._query_compiler, **kwargs) -@sql_count_checker(query_count=6, join_count=6) +@sql_count_checker(query_count=6) def test_index_parent(): """ Check whether the parent field in Index is updated properly. diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 9a629101f3..1fd5701fda 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -494,7 +494,7 @@ def test_create_df_with_dict_as_data_and_index_as_index(): assert_frame_equal(snow_df, native_df) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1) def test_create_series_with_list_of_lists_index(): # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. arrays = [ @@ -507,7 +507,7 @@ def test_create_series_with_list_of_lists_index(): assert_series_equal(snow_series, native_series) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_create_series_with_none_data_and_non_empty_index(): # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. index = ["A", "B", "C", "D"] diff --git a/tests/integ/modin/index/test_index_methods.py b/tests/integ/modin/index/test_index_methods.py index d8c3646d97..8f6f5b9f59 100644 --- a/tests/integ/modin/index/test_index_methods.py +++ b/tests/integ/modin/index/test_index_methods.py @@ -359,7 +359,7 @@ def test_has_duplicates(index): assert index.has_duplicates == snow_index.has_duplicates -@sql_count_checker(query_count=6, join_count=6) +@sql_count_checker(query_count=6) def test_index_parent(): """ Check whether the parent field in Index is updated properly. diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index b4e8858273..af99185294 100644 --- a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -145,7 +145,7 @@ def test_resample_duplicated_timestamps(): @freq @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_resample_series(freq, interval, agg_func): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -188,7 +188,7 @@ def test_resample_df_with_nan(agg_func): @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_resample_ser_with_nan(agg_func): # 1 resample bin of all NaN, 1 resample bin partially NaN, 1 resample bin no NaNs eval_snowpark_pandas_result( @@ -242,7 +242,7 @@ def test_resample_df_getitem(): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_resample_ser_getitem(): eval_snowpark_pandas_result( *create_test_series( diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index 96ad514a2b..53352fd4ef 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -44,7 +44,7 @@ def test_resample_fill(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_resample_fill_ser(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -139,7 +139,7 @@ def test_resample_ffill_missing_in_middle(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=1) def test_resample_ffill_ser_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ diff --git a/tests/integ/modin/resample/test_resample_negative.py b/tests/integ/modin/resample/test_resample_negative.py index 44319c120b..e20fc397ef 100644 --- a/tests/integ/modin/resample/test_resample_negative.py +++ b/tests/integ/modin/resample/test_resample_negative.py @@ -137,7 +137,7 @@ def test_resample_fillna_invalid_method(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_resample_tz_negative(): snow_df = pd.DataFrame( {"a": range(3)}, diff --git a/tests/integ/modin/series/test_add_prefix.py b/tests/integ/modin/series/test_add_prefix.py index 6bba930c43..4d05f78d94 100644 --- a/tests/integ/modin/series/test_add_prefix.py +++ b/tests/integ/modin/series/test_add_prefix.py @@ -46,7 +46,7 @@ def test_series_add_prefix_multiindex(prefix, multiindex_native_int_series): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("prefix", TEST_ADD_PREFIX_DATA) def test_series_add_prefix_time_column_df(prefix, time_index_series_data): series_data, kwargs = time_index_series_data diff --git a/tests/integ/modin/series/test_add_suffix.py b/tests/integ/modin/series/test_add_suffix.py index f3329c6789..43a98ab951 100644 --- a/tests/integ/modin/series/test_add_suffix.py +++ b/tests/integ/modin/series/test_add_suffix.py @@ -46,7 +46,7 @@ def test_add_suffix_multiindex(suffix, multiindex_native_int_series): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("suffix", TEST_ADD_SUFFIX_DATA) def test_add_suffix_time_column_df(suffix, time_index_series_data): series_data, kwargs = time_index_series_data diff --git a/tests/integ/modin/series/test_all_any.py b/tests/integ/modin/series/test_all_any.py index 517252e7af..0f78b320fe 100644 --- a/tests/integ/modin/series/test_all_any.py +++ b/tests/integ/modin/series/test_all_any.py @@ -65,7 +65,7 @@ def test_any_int(data): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_all_named_index(): data = [1, 0, 3] index_name = ["a", "b", "c"] @@ -77,7 +77,7 @@ def test_all_named_index(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_any_named_index(): data = [1, 0, 3] index_name = ["a", "b", "c"] diff --git a/tests/integ/modin/series/test_at.py b/tests/integ/modin/series/test_at.py index 4533c20d35..9452a0d736 100644 --- a/tests/integ/modin/series/test_at.py +++ b/tests/integ/modin/series/test_at.py @@ -18,7 +18,7 @@ def test_at_get_default_index( ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_default_index( default_index_snowpark_pandas_series, default_index_native_series, @@ -42,7 +42,7 @@ def test_at_get_str_index( assert str_index_snowpark_pandas_series.at["b"] == str_index_native_series.at["b"] -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_str_index( str_index_snowpark_pandas_series, str_index_native_series, @@ -58,7 +58,7 @@ def at_set_helper(series): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_at_get_time_index( time_index_snowpark_pandas_series, time_index_native_series, @@ -69,7 +69,7 @@ def test_at_get_time_index( ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_at_set_time_index( time_index_snowpark_pandas_series, time_index_native_series, diff --git a/tests/integ/modin/series/test_bitwise_operators.py b/tests/integ/modin/series/test_bitwise_operators.py index ad542fd223..eda9c536c9 100644 --- a/tests/integ/modin/series/test_bitwise_operators.py +++ b/tests/integ/modin/series/test_bitwise_operators.py @@ -11,7 +11,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, eval_snowpark_pandas_result, @@ -47,16 +47,15 @@ def try_cast_to_snow_series(value: Any) -> Any: @pytest.mark.parametrize("value", BITWISE_TEST_DATA) +@sql_count_checker(query_count=1) def test_bitwise_unary(value): # Note: In pandas, using NaN values without specfiying a null-compatible dtype will yield an error. # SnowPandas will allow this behavior. # Note: NaN values like pd.NA, pd.NaT, np.nan will raise a TypeError: boolean value of NA is ambiguous - with SqlCounter( - query_count=1, join_count=1 if isinstance(value, native_pd.Series) else 0 - ): - snow_value = try_cast_to_snow_series(value) - eval_snowpark_pandas_result(snow_value, native_pd.Series(value), lambda s: ~s) + snow_value = try_cast_to_snow_series(value) + + eval_snowpark_pandas_result(snow_value, native_pd.Series(value), lambda s: ~s) @pytest.mark.parametrize("series", SERIES_BITWISE_TEST_DATA) @@ -122,6 +121,7 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): @pytest.mark.parametrize( "op", [operator.or_, operator.and_] ) # |, &. ^ is not supported in Snowflake +@sql_count_checker(query_count=2, join_count=2) def test_bitwise_binary_between_series(lhs, rhs, op): def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans = op(snow_lhs, snow_rhs) @@ -131,14 +131,10 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): snow_ans, native_ans, lambda s: s, check_index_type=False ) - with SqlCounter( - query_count=2, - join_count=10 if isinstance(lhs.index, native_pd.MultiIndex) else 6, - ): - check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) + check_op(lhs, rhs, try_cast_to_snow_series(lhs), try_cast_to_snow_series(rhs)) - # commute series - check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) + # commute series + check_op(rhs, lhs, try_cast_to_snow_series(rhs), try_cast_to_snow_series(lhs)) # Due to differences in logical or/and in SQL and pandas' |,& implementation, behavior doesn't match here, in particular @@ -234,21 +230,18 @@ def check_op(native_lhs, native_rhs, snow_lhs, snow_rhs): ), ], ) +@sql_count_checker(query_count=1, join_count=1) def test_bitwise_binary_between_series_with_deviating_behavior_or( lhs, rhs, expected_pandas, expected_snowpark_pandas ): - with SqlCounter( - query_count=1, - join_count=5 if isinstance(lhs.index, native_pd.MultiIndex) else 3, - ): - snow_ans = try_cast_to_snow_series(lhs) | try_cast_to_snow_series(rhs) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_ans, expected_snowpark_pandas - ) + snow_ans = try_cast_to_snow_series(lhs) | try_cast_to_snow_series(rhs) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_ans, expected_snowpark_pandas + ) - # test here pandas to track any version regressions - native_ans = lhs | rhs - tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) + # test here pandas to track any version regressions + native_ans = lhs | rhs + tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) @pytest.mark.parametrize( @@ -322,19 +315,16 @@ def test_bitwise_binary_between_series_with_deviating_behavior_or( ), ], ) +@sql_count_checker(query_count=1, join_count=1) def test_bitwise_binary_between_series_with_deviating_behavior_and( lhs, rhs, expected_pandas, expected_snowpark_pandas ): - with SqlCounter( - query_count=1, - join_count=5 if isinstance(lhs.index, native_pd.MultiIndex) else 3, - ): - snow_ans = try_cast_to_snow_series(lhs) & try_cast_to_snow_series(rhs) - assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( - snow_ans, expected_snowpark_pandas - ) + snow_ans = try_cast_to_snow_series(lhs) & try_cast_to_snow_series(rhs) + assert_snowpark_pandas_equals_to_pandas_without_dtypecheck( + snow_ans, expected_snowpark_pandas + ) - # test here pandas to track any version regressions - native_ans = lhs & rhs - print(native_ans.index) - tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) + # test here pandas to track any version regressions + native_ans = lhs & rhs + print(native_ans.index) + tm.assert_series_equal(native_ans, expected_pandas, check_index_type=False) diff --git a/tests/integ/modin/series/test_compare.py b/tests/integ/modin/series/test_compare.py index 8d60d7f75a..c5c927343e 100644 --- a/tests/integ/modin/series/test_compare.py +++ b/tests/integ/modin/series/test_compare.py @@ -50,7 +50,7 @@ class TestDefaultParameters: # copying the original series's index to the final resulting dataframe # adds 1 extra query to materialize the index. query_count=QUERY_COUNT + 1, - join_count=5, + join_count=JOIN_COUNT, ) def test_no_diff(self, base_series): other_series = base_series.copy() diff --git a/tests/integ/modin/series/test_describe.py b/tests/integ/modin/series/test_describe.py index 32876f1608..9ecd2e33a3 100644 --- a/tests/integ/modin/series/test_describe.py +++ b/tests/integ/modin/series/test_describe.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_series_equal, create_test_series, @@ -129,18 +129,15 @@ def timestamp_describe_comparator(snow_res, native_res): @pytest.mark.parametrize( - "index, join_count", + "index", [ - pytest.param(None, 0, id="default_index"), - pytest.param( - ["one", "two", "three", "four", "five", "six"], 6, id="flat_index" - ), + pytest.param(None, id="default_index"), + pytest.param(["one", "two", "three", "four", "five", "six"], id="flat_index"), pytest.param( [ np.array(["bar", "bar", "baz", "baz", "foo", "foo"]), np.array(["one", "two", "one", "two", "one", "two"]), ], - 12, id="2D_index", ), ], @@ -154,10 +151,8 @@ def timestamp_describe_comparator(snow_res, native_res): ], ids=["ints", "floats", "objects"], ) -def test_describe_multiindex(data, index, join_count): - if isinstance(data[0], str) and index is not None: - join_count = 8 if len(index) == 2 else 4 - with SqlCounter(query_count=1, union_count=5, join_count=join_count): - eval_snowpark_pandas_result( - *create_test_series(data, index=index), lambda ser: ser.describe() - ) +@sql_count_checker(query_count=1, union_count=5) +def test_describe_multiindex(data, index): + eval_snowpark_pandas_result( + *create_test_series(data, index=index), lambda ser: ser.describe() + ) diff --git a/tests/integ/modin/series/test_empty.py b/tests/integ/modin/series/test_empty.py index d53cd6e3d5..a30a69116c 100644 --- a/tests/integ/modin/series/test_empty.py +++ b/tests/integ/modin/series/test_empty.py @@ -9,7 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -34,17 +34,14 @@ "empty series with only index", ], ) +@sql_count_checker(query_count=1) def test_series_empty(args, kwargs): - with SqlCounter( - query_count=1, - join_count=1 if (args == [] and kwargs.get("index", None) == []) else 0, - ): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.empty, - comparator=lambda x, y: x == y, - ) + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.empty, + comparator=lambda x, y: x == y, + ) @sql_count_checker(query_count=5, join_count=2) diff --git a/tests/integ/modin/series/test_iat.py b/tests/integ/modin/series/test_iat.py index 7b9a4d4c06..b3e2255403 100644 --- a/tests/integ/modin/series/test_iat.py +++ b/tests/integ/modin/series/test_iat.py @@ -103,7 +103,7 @@ def iat_set_helper(series): (0,), ], ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=2) def test_iat_get_time_index( key, time_index_snowpark_pandas_series, @@ -122,7 +122,7 @@ def test_iat_get_time_index( (0,), ], ) -@sql_count_checker(query_count=1, join_count=4) +@sql_count_checker(query_count=1, join_count=2) def test_iat_set_time_index( key, time_index_snowpark_pandas_series, diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index eea764af40..7b6369934d 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -89,7 +89,7 @@ def operation(ser): # Based on snowflake type results, the result becomes 'str' type so we normalize to float for comparison. return ser.astype("float") - expected_join_count = 5 if isinstance(val, list) else 4 + expected_join_count = 3 if isinstance(val, list) else 2 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( default_index_native_int_snowpark_pandas_series, @@ -777,25 +777,25 @@ def perform_iloc(df): @pytest.mark.parametrize( - "row_key, row_key_index, add_joins", + "row_key, row_key_index", [ - [1, None, 0], - [[3, 0], None, 0], - [[1, 2], [("A",), ("B",)], 1], - [[2, 1], [("A", 1), ("B", 2)], 2], + [1, None], + [[3, 0], None], + [[1, 2], [("A",), ("B",)]], + [[2, 1], [("A", 1), ("B", 2)]], ], ) @pytest.mark.parametrize( "item_values, item_index, expected_join_count", [ - [999, None, 6], - [TEST_ITEMS_DATA_2X1, None, 7], - [TEST_ITEMS_DATA_2X1, [("r",), ("s",)], 8], - [TEST_ITEMS_DATA_2X1, [("r", 20), ("s", 25)], 9], + [999, None, 2], + [TEST_ITEMS_DATA_2X1, None, 3], + [TEST_ITEMS_DATA_2X1, [("r",), ("s",)], 4], + [TEST_ITEMS_DATA_2X1, [("r", 20), ("s", 25)], 5], ], ) def test_df_iloc_set_with_multiindex( - row_key, row_key_index, item_values, item_index, expected_join_count, add_joins + row_key, row_key_index, item_values, item_index, expected_join_count ): ser_data = [10, 11, 12, 13, 14] row_index = pd.MultiIndex.from_tuples( @@ -835,7 +835,7 @@ def helper_iloc(ser): else: ser.iloc[snow_row_key] = snow_items - with SqlCounter(query_count=1, join_count=expected_join_count + add_joins): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result(snow_ser, native_ser, helper_iloc, inplace=True) @@ -851,7 +851,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with default index - with SqlCounter(query_count=1, join_count=4): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( default_index_int_series, default_index_native_int_series, @@ -859,7 +859,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with non default index - with SqlCounter(query_count=1, join_count=4): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( int_series_with_non_default_index, native_int_series_with_non_default_index, @@ -867,7 +867,7 @@ def iloc_helper(series: Union[pd.Series, native_pd.Series]) -> None: ) # test ser with MultiIndex - with SqlCounter(query_count=1, join_count=4): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( int_series_with_multiindex, multiindex_native_int_series, diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index da13247cd7..2603eaa61c 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -224,6 +224,7 @@ def apply_loc(df): [random.choice([True, False]) for _ in range(5)], ], ) +@sql_count_checker(query_count=1, join_count=1) def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_index): # aligned indices means both row_pos and index are exactly match if use_default_index: @@ -233,14 +234,13 @@ def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_in index = native_pd.Index(["a", "a", None, "b", "b"], name="index") native_series = native_pd.Series([1, 2, 3, 4, 5], index=index) snow_series = pd.Series(native_series) - with SqlCounter(query_count=1, join_count=1 if use_default_index else 2): - eval_snowpark_pandas_result( - snow_series, - native_series, - lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=index, dtype="bool")], - ) + eval_snowpark_pandas_result( + snow_series, + native_series, + lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=index, dtype="bool")], + ) @pytest.mark.parametrize( @@ -861,7 +861,7 @@ def loc_set_helper(s): ["a", "a", "c", "d"], ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_series_loc_set_scalar_row_key_enlargement(row_key, item_values, ser_index): data = [1, 2, 3, 4] @@ -1407,7 +1407,7 @@ def test_series_loc_set_slice_item_negative(key, default_index_native_series): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_loc_set_boolean_key(key, index): # series.loc[True/False key] = scalar item # ---------------------------------------- @@ -1596,7 +1596,7 @@ def test_series_loc_set_with_scalar_key_and_list_like_item( assert_series_equal(snowpark_ser, native_ser) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) @pytest.mark.parametrize("key", SCALAR_LIKE_VALUES) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) def test_series_loc_set_with_scalar_key_and_scalar_item( @@ -1776,7 +1776,7 @@ def test_series_partial_string_indexing_behavior_diff(): assert len(series_minute["2022"]) == 0 -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_loc_set_none(): # Note that pandas does not support df.loc[None,:] like the series does here. native_s = native_pd.Series([1, 2, 3]) diff --git a/tests/integ/modin/series/test_mask.py b/tests/integ/modin/series/test_mask.py index 0d3680cff4..baeaa37751 100644 --- a/tests/integ/modin/series/test_mask.py +++ b/tests/integ/modin/series/test_mask.py @@ -76,8 +76,8 @@ def test_series_mask_duplicate_labels(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda ser: ser.mask(ser > 3)) -@sql_count_checker(query_count=1, join_count=1) -def test_series_mask_multi_index(): +@sql_count_checker(query_count=1) +def test_series_mask_multiindex(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -233,7 +233,7 @@ def test_series_mask_with_scalar_cond(cond): ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_series_mask_series_cond_unmatched_index(): data = [1, 2, 3, 4] index1 = [0, 1, 2, 3] @@ -258,10 +258,9 @@ def perform_mask(series): ) -@pytest.mark.parametrize( - "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] -) -def test_series_mask_short_series_cond(index, join_count): +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) +def test_series_mask_short_series_cond(index): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9] @@ -280,18 +279,16 @@ def perform_mask(series): else: return series.mask(native_cond, -1) - with SqlCounter(query_count=1, join_count=join_count): - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_mask, - ) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_mask, + ) -@pytest.mark.parametrize( - "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] -) -def test_series_mask_long_series_cond(index, join_count): +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) +def test_series_mask_long_series_cond(index): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9, 10, 11] @@ -310,9 +307,8 @@ def perform_mask(series): else: return series.mask(native_cond, -1) - with SqlCounter(query_count=1, join_count=join_count): - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_mask, - ) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_mask, + ) diff --git a/tests/integ/modin/series/test_nlargest_nsmallest.py b/tests/integ/modin/series/test_nlargest_nsmallest.py index 253230156b..a15cc5dfb2 100644 --- a/tests/integ/modin/series/test_nlargest_nsmallest.py +++ b/tests/integ/modin/series/test_nlargest_nsmallest.py @@ -88,7 +88,7 @@ def test_nlargest_nsmallest_non_numeric_types(method, data): assert_series_equal(getattr(snow_s, method)(n), expected_s) -@sql_count_checker(query_count=3, join_count=2) +@sql_count_checker(query_count=3) def test_nlargest_nsmallest_no_columns(method): snow_s = pd.Series(query_compiler=pd.DataFrame(index=[1, 2])._query_compiler) snow_s = snow_s diff --git a/tests/integ/modin/series/test_nunique.py b/tests/integ/modin/series/test_nunique.py index f2aba15ada..bb20e9e4a5 100644 --- a/tests/integ/modin/series/test_nunique.py +++ b/tests/integ/modin/series/test_nunique.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_values_equal, create_test_series, @@ -63,11 +63,11 @@ def test_series_nunique_deviating_nan_behavior(input_data, expected): ), ], ) +@sql_count_checker(query_count=1) def test_dataframe_nunique_multiindex(index): data = [0.1, 0.2, 0.1, 0] - with SqlCounter(query_count=1, join_count=0 if index is None else 2): - eval_snowpark_pandas_result( - *create_test_series(data, index=index), - lambda ser: ser.nunique(), - comparator=assert_values_equal, - ) + eval_snowpark_pandas_result( + *create_test_series(data, index=index), + lambda ser: ser.nunique(), + comparator=assert_values_equal, + ) diff --git a/tests/integ/modin/series/test_rank.py b/tests/integ/modin/series/test_rank.py index 24801b581f..2544f12e43 100644 --- a/tests/integ/modin/series/test_rank.py +++ b/tests/integ/modin/series/test_rank.py @@ -7,7 +7,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64, eval_snowpark_pandas_result, @@ -83,6 +83,7 @@ def test_series_rank_numeric_only(method, ascending, na_option): ) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("data, index", TEST_RANK_DATA) @pytest.mark.parametrize( "method", @@ -98,15 +99,10 @@ def test_series_rank_numeric_only(method, ascending, na_option): ) # test Series percentile rank def test_df_rank_pct(data, index, method, ascending, na_option): - with SqlCounter( - query_count=1, join_count=2 if isinstance(index, native_pd.MultiIndex) else 0 - ): - snow_df = pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - native_df = native_pd.DataFrame(data, index=index).rank( - method=method, ascending=ascending, na_option=na_option, pct=True - ) - assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64( - snow_df, native_df - ) + snow_df = pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + native_df = native_pd.DataFrame(data, index=index).rank( + method=method, ascending=ascending, na_option=na_option, pct=True + ) + assert_snowpark_pandas_equals_to_pandas_with_coerce_to_float64(snow_df, native_df) diff --git a/tests/integ/modin/series/test_rename.py b/tests/integ/modin/series/test_rename.py index 53873e0b2f..4ccf29706f 100644 --- a/tests/integ/modin/series/test_rename.py +++ b/tests/integ/modin/series/test_rename.py @@ -45,7 +45,7 @@ def renamer(x): # values in the variant column will be quoted assert_index_equal(renamed.index, renamed2.index.str.replace('"', "")) - @sql_count_checker(query_count=1, join_count=2) + @sql_count_checker(query_count=1, join_count=1) def test_rename_partial_dict(self): # partial dict ser = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") @@ -63,7 +63,7 @@ def test_rename_retain_index_name(self): renamed = renamer.rename({}) assert renamed.index.name == renamer.index.name - @sql_count_checker(query_count=2, join_count=2) + @sql_count_checker(query_count=2, join_count=1) def test_rename_by_series(self): ser = Series(range(5), name="foo") renamer = Series({1: 10, 2: 20}) @@ -80,7 +80,7 @@ def test_rename_set_name(self): tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - @sql_count_checker(query_count=5, join_count=5) + @sql_count_checker(query_count=5) def test_rename_set_name_inplace(self): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 929226bc89..50405643bc 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -175,7 +175,7 @@ (None, 35), # None scalar ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_setitem_scalar_key_and_scalar_item( key, item, default_index_native_int_series ): @@ -276,7 +276,7 @@ def test_series_setitem_none_key_and_scalar_item_mixed_type_series( (3.14, "a"), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_setitem_scalar_key_and_scalar_item_mixed_type_series_type_coercion( key, item, mixed_type_index_native_series_mixed_type_index ): @@ -341,7 +341,7 @@ def test_series_setitem_scalar_key_and_scalar_item_mixed_type_series_type_coerci # TODO: SNOW-986548 fix where key is False, row is missed in this case @pytest.mark.parametrize("key", [True, False]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_setitem_boolean_key_and_scalar_item_label_updated(key, item): # series[scalar boolean key] = scalar item # ---------------------------------------- @@ -493,14 +493,14 @@ def test_series_setitem_boolean_key_and_scalar_item_case2_numeric_index(key, ite expected_ser = native_pd.Series(data=data, index=index) - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): # verify that the result is correct assert_series_equal(snowpark_ser, expected_ser) @pytest.mark.parametrize("key", [True, False]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_series_setitem_boolean_key_and_scalar_item_case2_non_numeric_index(key, item): # series[scalar boolean key] = scalar item # ---------------------------------------- @@ -559,7 +559,7 @@ def test_series_setitem_boolean_key_and_scalar_item_case2_non_numeric_index(key, @pytest.mark.parametrize("key", [0, 1]) @pytest.mark.parametrize("item", SCALAR_LIKE_VALUES) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_series_setitem_boolean_key_and_scalar_item_case3( key, item, native_series_with_duplicate_boolean_index ): @@ -1835,7 +1835,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_number_scalar_it assert_series_equal(snowpark_ser, native_ser, check_dtype=False) else: # All other cases match native pandas behavior - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): assert_series_equal(snowpark_ser, native_ser, check_dtype=False) @@ -1886,7 +1886,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_boolean_scalar_i # b True # c True # dtype: bool - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): err_msg = "Series are different" with pytest.raises(AssertionError, match=err_msg): assert_series_equal(snowpark_ser, native_ser, check_dtype=False) @@ -1997,7 +1997,7 @@ def test_series_setitem_check_type_behavior_with_string_key_and_string_scalar_it expected_data = [str(val) for val in native_ser] expected_ser = native_pd.Series(data=expected_data, index=index) - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): assert_series_equal(snowpark_ser, expected_ser, check_dtype=False) @@ -2093,7 +2093,7 @@ def set_loc_helper(ser): [2, "x"], ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_df_setitem_boolean_key(key, index): item = 99 @@ -2435,7 +2435,7 @@ def test_behavior_table_is_up_to_date(): prev_err_msg = expected_err_msg -@sql_count_checker(query_count=2, join_count=6) +@sql_count_checker(query_count=2, join_count=2) def test_series_setitem_int_key(): # pandas series setitem with int key is similar to loc set in most cases: # E.g., set index with label 3 to 100 diff --git a/tests/integ/modin/series/test_shape.py b/tests/integ/modin/series/test_shape.py index ba62dfde67..7bbc1270a0 100644 --- a/tests/integ/modin/series/test_shape.py +++ b/tests/integ/modin/series/test_shape.py @@ -9,7 +9,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -34,13 +34,11 @@ "empty series with only index", ], ) +@sql_count_checker(query_count=1) def test_series_shape(args, kwargs): - with SqlCounter( - query_count=1, join_count=1 if kwargs.get("index", None) == [] else 0 - ): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.shape, - comparator=lambda x, y: x == y, - ) + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.shape, + comparator=lambda x, y: x == y, + ) diff --git a/tests/integ/modin/series/test_size.py b/tests/integ/modin/series/test_size.py index 65730da0fd..4543525b2d 100644 --- a/tests/integ/modin/series/test_size.py +++ b/tests/integ/modin/series/test_size.py @@ -8,7 +8,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 -from tests.integ.modin.sql_counter import SqlCounter +from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import eval_snowpark_pandas_result @@ -36,16 +36,11 @@ "multi index", ], ) +@sql_count_checker(query_count=1) def test_series_size(args, kwargs): - with SqlCounter( - query_count=1, - join_count=2 - if isinstance(kwargs.get("index", None), native_pd.MultiIndex) - else 0, - ): - eval_snowpark_pandas_result( - pd.Series(*args, **kwargs), - native_pd.Series(*args, **kwargs), - lambda df: df.size, - comparator=lambda x, y: x == y, - ) + eval_snowpark_pandas_result( + pd.Series(*args, **kwargs), + native_pd.Series(*args, **kwargs), + lambda df: df.size, + comparator=lambda x, y: x == y, + ) diff --git a/tests/integ/modin/series/test_take.py b/tests/integ/modin/series/test_take.py index 2ba09be1b8..9eed1559a8 100644 --- a/tests/integ/modin/series/test_take.py +++ b/tests/integ/modin/series/test_take.py @@ -16,23 +16,23 @@ def test_series_take(): actual = ser.take([1, 3, 4]) expected = pd.Series([5, 2, 4], index=[1, 3, 4]) - with SqlCounter(query_count=2, join_count=3): + with SqlCounter(query_count=2, join_count=2): assert_series_equal(actual, expected) actual = ser.take([-1, 3, 4]) expected = pd.Series([4, 2, 4], index=[4, 3, 4]) - with SqlCounter(query_count=2, join_count=3): + with SqlCounter(query_count=2, join_count=2): assert_series_equal(actual, expected) # Out-of-bounds testing - valid because .iloc is used in backend. actual = ser.take([1, 10]) expected = pd.Series([5], index=[1]) - with SqlCounter(query_count=2, join_count=3): + with SqlCounter(query_count=2, join_count=2): assert_series_equal(actual, expected) actual = ser.take([2, 5]) expected = pd.Series([6], index=[2]) - with SqlCounter(query_count=2, join_count=3): + with SqlCounter(query_count=2, join_count=2): assert_series_equal(actual, expected) diff --git a/tests/integ/modin/series/test_to_snowflake.py b/tests/integ/modin/series/test_to_snowflake.py index f542edfa17..92b428f70e 100644 --- a/tests/integ/modin/series/test_to_snowflake.py +++ b/tests/integ/modin/series/test_to_snowflake.py @@ -68,7 +68,7 @@ def test_to_snowflake_index_label_none_raises(test_table_name): snow_series.to_snowflake(test_table_name, if_exists="replace", index=True) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_to_snowflake_multiindex(test_table_name, snow_series): index = native_pd.MultiIndex.from_arrays( [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], names=("number", "color") diff --git a/tests/integ/modin/series/test_transpose.py b/tests/integ/modin/series/test_transpose.py index ae2a076171..53ef87bb15 100644 --- a/tests/integ/modin/series/test_transpose.py +++ b/tests/integ/modin/series/test_transpose.py @@ -51,8 +51,8 @@ def test_series_transpose_empty(): ) -@sql_count_checker(query_count=1, join_count=1) -def test_series_transpose_multi_index(): +@sql_count_checker(query_count=1) +def test_series_transpose_multiindex(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -66,7 +66,7 @@ def test_series_transpose_multi_index(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_series_transpose_index_no_names(): data = [1, 2, 3, 4, 5] index = [None, None, None, None, None] diff --git a/tests/integ/modin/series/test_where.py b/tests/integ/modin/series/test_where.py index cff58d4a82..8c5f8a27f0 100644 --- a/tests/integ/modin/series/test_where.py +++ b/tests/integ/modin/series/test_where.py @@ -76,8 +76,8 @@ def test_series_where_duplicate_labels(): eval_snowpark_pandas_result(snow_ser, native_ser, lambda ser: ser.where(ser > 3)) -@sql_count_checker(query_count=1, join_count=1) -def test_series_where_multi_index(): +@sql_count_checker(query_count=1) +def test_series_where_multiindex(): data = [1, 2, 3, 4, 5] index = [("a", "x"), ("b", "y"), ("c", "z"), ("d", "u"), ("e", "v")] @@ -234,7 +234,7 @@ def test_series_where_with_scalar_cond(cond): ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=1, join_count=1) def test_series_where_series_cond_unmatched_index(): data = [1, 2, 3, 4] index1 = [0, 1, 2, 3] @@ -259,10 +259,9 @@ def perform_where(series): ) -@pytest.mark.parametrize( - "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] -) -def test_series_where_short_series_cond(index, join_count): +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) +def test_series_where_short_series_cond(index): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9] @@ -281,18 +280,16 @@ def perform_where(series): else: return series.where(native_cond, -1) - with SqlCounter(query_count=1, join_count=join_count): - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_where, - ) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_where, + ) -@pytest.mark.parametrize( - "index, join_count", [("matched_index", 1), ("unmatched_index", 2)] -) -def test_series_where_long_series_cond(index, join_count): +@sql_count_checker(query_count=1, join_count=1) +@pytest.mark.parametrize("index", ["matched_index", "unmatched_index"]) +def test_series_where_long_series_cond(index): data = [1, 2, 3, 4] if index != "matched_index": index = [7, 8, 9, 10, 11] @@ -311,9 +308,8 @@ def perform_where(series): else: return series.where(native_cond, -1) - with SqlCounter(query_count=1, join_count=join_count): - eval_snowpark_pandas_result( - snow_ser, - native_ser, - perform_where, - ) + eval_snowpark_pandas_result( + snow_ser, + native_ser, + perform_where, + ) diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 5c236731a0..19693ad381 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -1058,7 +1058,7 @@ def test_concat_sorted_frames(): ), # duplicate in frame2 ], ) -@sql_count_checker(query_count=2, union_count=1, join_count=1) +@sql_count_checker(query_count=2, union_count=1) def test_concat_duplicate_columns(columns1, columns2, expected_rows, expected_cols): df1 = pd.DataFrame([[1, 2, 3]], columns=columns1) df2 = pd.DataFrame([[4, 5, 6]], columns=columns2) @@ -1123,7 +1123,7 @@ def test_concat_from_file(resources_path): ) -@sql_count_checker(query_count=1, join_count=5) +@sql_count_checker(query_count=1, join_count=2) def test_concat_keys(): native_data = { "one": native_pd.Series([1, 2, 3], index=["a", "b", "c"]), diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py index 43b9ef263f..cafbd08f36 100644 --- a/tests/integ/modin/test_numpy.py +++ b/tests/integ/modin/test_numpy.py @@ -113,7 +113,7 @@ def test_np_where_notimplemented(): ) -@sql_count_checker(query_count=5, join_count=7) +@sql_count_checker(query_count=5, join_count=4) def test_scalar(): pdf_scalar = native_pd.DataFrame([[99, 99], [99, 99]]) sdf_scalar = pd.DataFrame([[99, 99], [99, 99]]) @@ -172,7 +172,7 @@ def test_different_inputs(cond, x, y): assert_array_equal(sp_result, np_orig_result) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=2) def test_broadcast_scalar_x_df(): input_df = native_pd.DataFrame([[False, True], [False, True]]) input_df2 = native_pd.DataFrame([[1, 0], [0, 1]]) @@ -183,7 +183,7 @@ def test_broadcast_scalar_x_df(): assert_array_equal(snow_result, np_result) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=2, join_count=2) def test_broadcast_scalar_x_ser(): input_ser = native_pd.Series([False, True]) input_ser2 = native_pd.Series([1, 0]) diff --git a/tests/integ/modin/types/test_timedelta_indexing.py b/tests/integ/modin/types/test_timedelta_indexing.py index 62f98107b9..0d5cf838c6 100644 --- a/tests/integ/modin/types/test_timedelta_indexing.py +++ b/tests/integ/modin/types/test_timedelta_indexing.py @@ -264,7 +264,7 @@ def loc_set(key, item, df): df.loc[key] = item return df - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): # single value key = (1, "a") run_test(key, item, api=loc_set) @@ -304,9 +304,9 @@ def iloc_set(key, item, df): df.iloc[key] = item return df - def run_test(key, item, natvie_df=td, api=iloc_set): + def run_test(key, item, native_df=td, api=iloc_set): eval_snowpark_pandas_result( - snow_td.copy(), natvie_df.copy(), functools.partial(api, key, item) + snow_td.copy(), native_df.copy(), functools.partial(api, key, item) ) item = "string" @@ -325,7 +325,7 @@ def run_test(key, item, natvie_df=td, api=iloc_set): td_int = td.copy() td_int["b"] = td_int["b"].astype("int64") # timedelta type is not preserved in this case - run_test(key, item, natvie_df=td_int) + run_test(key, item, native_df=td_int) def df_set(key, item, df): df[key] = item @@ -346,13 +346,13 @@ def loc_set(key, item, df): run_test(key, item, api=loc_set) item = 1000 - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): # single value key = (1, "b") td_int = td.copy() td_int["b"] = td_int["b"].astype("int64") # timedelta type is not preserved in this case - run_test(key, item, natvie_df=td_int, api=loc_set) + run_test(key, item, native_df=td_int, api=loc_set) @pytest.mark.parametrize("item", [None, pd.Timedelta("1 hour")]) @@ -383,7 +383,7 @@ def setitem_enlargement(key, item, df): ) key = 10 - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( snow_td["a"].copy(), td["a"].copy(), @@ -402,7 +402,7 @@ def loc_enlargement(key, item, df): ) key = 10 - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( snow_td["a"].copy(), td["a"].copy(), @@ -412,7 +412,7 @@ def loc_enlargement(key, item, df): # single row key = (10, slice(None, None, None)) - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=1, join_count=1): if pd.isna(item): eval_snowpark_pandas_result( snow_td.copy(), td.copy(), functools.partial(loc_enlargement, key, item) @@ -566,7 +566,7 @@ def setitem_enlargement(key, item, df): ) key = native_pd.Timedelta("2 days 45 minutes") - with SqlCounter(query_count=1, join_count=3): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( snow_df["a"].copy(), native_df["a"].copy(), @@ -587,7 +587,7 @@ def loc_enlargement(key, item, df): ) key = native_pd.Timedelta("2 days 25 minutes") - with SqlCounter(query_count=1, join_count=3): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( snow_df["a"].copy(), native_df["a"].copy(), @@ -597,7 +597,7 @@ def loc_enlargement(key, item, df): # single row key = (native_pd.Timedelta("2 days 45 minutes"), slice(None, None, None)) - with SqlCounter(query_count=1, join_count=3): + with SqlCounter(query_count=1, join_count=2): eval_snowpark_pandas_result( snow_df.copy(), native_df.copy(), From 2274d1e25b9970e2d940120b3e8c767ee4fd50d1 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 11 Sep 2024 14:39:29 -0700 Subject: [PATCH 31/42] remove print statements and unnecessary comments --- src/snowflake/snowpark/modin/pandas/dataframe.py | 2 ++ .../snowpark/modin/plugin/extensions/series_overrides.py | 2 ++ tests/integ/modin/frame/test_cache_result.py | 2 -- tests/integ/modin/frame/test_loc.py | 4 ++-- tests/integ/modin/frame/test_setitem.py | 1 - tests/integ/modin/frame/test_where.py | 3 +-- tests/integ/modin/groupby/test_groupby_basic_agg.py | 2 +- tests/integ/modin/series/test_iloc.py | 1 + 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index e4474a53b3..003f1d56f2 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -308,6 +308,8 @@ def __init__( ) )._query_compiler + # The index is already set if the data is a non-Snowpark pandas object. If either the data or the index is + # a Snowpark pandas object, set the index here. if index is not None and ( isinstance(index, (Index, Series)) or isinstance(data, (Index, Series, type(self))) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index d9fbe613cf..c5435f139f 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -425,6 +425,8 @@ def __init__( ) )._query_compiler + # The index is already set if the data is a non-Snowpark pandas object. If either the data or the index is + # a Snowpark pandas object, set the index here. if index is not None and ( isinstance(index, (Index, type(self))) or isinstance(data, (Index, type(self))) ): diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py index c26b28e4ab..c78cefaa3a 100644 --- a/tests/integ/modin/frame/test_cache_result.py +++ b/tests/integ/modin/frame/test_cache_result.py @@ -81,8 +81,6 @@ def perform_chained_operations(df, module): @pytest.mark.parametrize("inplace", [True, False]) def test_cache_result_empty_dataframe(init_kwargs, inplace): snow_df, native_df = create_test_dfs(**init_kwargs) - print(snow_df) - print(native_df) snow_df_copy = snow_df.copy(deep=True) with SqlCounter(query_count=1): cached_snow_df = cache_and_return_df(snow_df, inplace) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index 33c1fb98e5..73f06df4b1 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -3936,7 +3936,7 @@ def test_raise_set_cell_with_list_like_value_error(): marks=pytest.mark.xfail( reason="SNOW-1652608 result series name incorrectly set" ), - ), + ), # 1 join from df creation, 1 join from squeeze, 2 joins from to_pandas during eval pytest.param( native_pd.to_timedelta("1 day"), 2, @@ -3944,7 +3944,7 @@ def test_raise_set_cell_with_list_like_value_error(): marks=pytest.mark.xfail( reason="SNOW-1652608 result series name incorrectly set" ), - ), + ), # 1 join fron df creation, 1 join from squeeze, 2 joins from to_pandas during eval (["1 day", "3 days"], 1, 2), ([True, False, False], 1, 2), (slice(None, "4 days"), 1, 1), diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 6152089f39..3d51277b2c 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -444,7 +444,6 @@ def setitem_helper(df): [["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True], ], ) -# 2 extra queries to convert to native pandas when creating the two snowpark pandas dataframes @sql_count_checker(query_count=1, join_count=3) def test_df_setitem_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index 006b7e76fb..bd7a5b5808 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -628,7 +628,6 @@ def test_dataframe_where_with_dataframe_cond_single_index_different_names(): ) -# one extra query to convert index to native pandas when creating the snowpark pandas dataframe @sql_count_checker(query_count=1, join_count=3) def test_dataframe_where_with_dataframe_cond_single_index_different_names_2(): data = [1, 2, 3] @@ -702,7 +701,7 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): ) -# 3 extra join queries to create the 3 snowpark pandas dataframe with non-Snowpark pandas data +# 3 extra joins to create the 3 snowpark pandas dataframe with non-Snowpark pandas data # and a Snowpark pandas Index. @sql_count_checker(query_count=1, join_count=5) def test_dataframe_where_with_duplicated_index_unaligned(): diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py index d4211f2a41..09acd49bb2 100644 --- a/tests/integ/modin/groupby/test_groupby_basic_agg.py +++ b/tests/integ/modin/groupby/test_groupby_basic_agg.py @@ -952,7 +952,7 @@ def test_groupby_with_level(df_multi, level): @sql_count_checker(query_count=1) -def test_groupby_with_higher_columns(): +def test_groupby_with_hier_columns(): tuples = list( zip( *[ diff --git a/tests/integ/modin/series/test_iloc.py b/tests/integ/modin/series/test_iloc.py index 7b6369934d..b5a0df3164 100644 --- a/tests/integ/modin/series/test_iloc.py +++ b/tests/integ/modin/series/test_iloc.py @@ -823,6 +823,7 @@ def test_df_iloc_set_with_multiindex( native_items.index = pd.MultiIndex.from_tuples(item_index) if row_key_index: + # Using native pandas index since row_key[2] is a MultiIndex object. snow_row_key = pd.Series(row_key, index=native_pd.Index(row_key_index)) native_row_key = native_pd.Series(row_key, index=native_pd.Index(row_key_index)) else: From 9eef8d77449cff6a98a1d539740f38d4ce3ea7e4 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 11 Sep 2024 15:36:49 -0700 Subject: [PATCH 32/42] fix tests --- tests/integ/modin/groupby/test_groupby_series.py | 2 +- tests/integ/modin/test_from_pandas_to_pandas.py | 4 ++-- tests/integ/modin/test_internal_frame.py | 2 +- tests/integ/modin/test_telemetry.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py index 7756f8b620..ae8ae0926d 100644 --- a/tests/integ/modin/groupby/test_groupby_series.py +++ b/tests/integ/modin/groupby/test_groupby_series.py @@ -153,7 +153,7 @@ def test_groupby_agg_series_named_agg(aggs, sort): @pytest.mark.parametrize("numeric_only", [False, None]) -@sql_count_checker(query_count=2) +@sql_count_checker(query_count=2, join_count=2) def test_groupby_series_numeric_only(series_str, numeric_only): native_series = series_str.to_pandas() eval_snowpark_pandas_result( diff --git a/tests/integ/modin/test_from_pandas_to_pandas.py b/tests/integ/modin/test_from_pandas_to_pandas.py index 28a6c54950..ceef588410 100644 --- a/tests/integ/modin/test_from_pandas_to_pandas.py +++ b/tests/integ/modin/test_from_pandas_to_pandas.py @@ -525,7 +525,7 @@ def test_from_pandas_series_with_tuple_name(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_series_to_pandas(): array = ["a", "b", "c"] pandas_series = native_pd.Series(data=array, index=array) @@ -585,7 +585,7 @@ def test_snowpark_pandas_statement_params(): assert "efg" == mock_to_pandas.call_args.kwargs["statement_params"]["abc"] -@sql_count_checker(query_count=1, join_count=5) +@sql_count_checker(query_count=1, join_count=2) def test_create_df_from_series(): native_data = { "one": native_pd.Series([1, 2, 3], index=["a", "b", "c"]), diff --git a/tests/integ/modin/test_internal_frame.py b/tests/integ/modin/test_internal_frame.py index c7a95fa601..da38322b9a 100644 --- a/tests/integ/modin/test_internal_frame.py +++ b/tests/integ/modin/test_internal_frame.py @@ -38,7 +38,7 @@ def test_strip_duplicates(input, expected): assert_frame_equal(result, pd.DataFrame(expected)) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2, join_count=1) def test_strip_duplicates_after_sort(): df = pd.DataFrame({"A": [0, 1, 0, 1, 2], "B": [1, 2, 3, 4, 5]}) df = df.sort_values(by="B", ascending=False) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index faa012e144..ce9e1caf32 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -342,7 +342,7 @@ def test_telemetry_with_update_inplace(): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_telemetry_with_resample(): # verify api_calls have been collected correctly for Resample APIs index = pandas.date_range("1/1/2000", periods=9, freq="min") From cc09403f06ecb20c88611fc9d377922b8155829a Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 11 Sep 2024 17:01:44 -0700 Subject: [PATCH 33/42] increase coverage --- .../snowpark/modin/pandas/dataframe.py | 6 +++-- .../plugin/extensions/series_overrides.py | 8 +++--- .../test_df_series_creation_with_index.py | 26 +++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index 003f1d56f2..f619732f16 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -257,7 +257,7 @@ def __init__( if index is not None: if isinstance(index, Index): # pragma: no cover index = index.to_series()._query_compiler - elif isinstance(index, Series): + elif isinstance(index, Series): # pragma: no cover index = index._query_compiler new_qc = new_qc.reindex(axis=0, labels=index) if columns is not None: @@ -354,7 +354,9 @@ def __init__( ] ] else: - index_qc_list = [Series(index)._query_compiler] + index_qc_list = [ + Series(index)._query_compiler + ] # pragma: no cover query_compiler = query_compiler.set_index(index_qc_list) if isinstance(data, DataFrame): diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index c5435f139f..724b58838c 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -438,9 +438,9 @@ def __init__( labels = index if isinstance(labels, Index): labels = labels.to_series()._query_compiler - elif isinstance(labels, Series): - labels = labels._query_compiler # pragma: no cover - else: + elif isinstance(labels, Series): # pragma: no cover + labels = labels._query_compiler + else: # pragma: no cover labels = Index(labels).to_series()._query_compiler query_compiler = query_compiler.reindex(axis=0, labels=labels) @@ -470,7 +470,7 @@ def __init__( ] ] else: - index_qc_list = [Series(index)._query_compiler] + index_qc_list = [Series(index)._query_compiler] # pragma: no cover query_compiler = query_compiler.set_index(index_qc_list) # Set the query compiler and name fields. diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 1fd5701fda..920523ff3d 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -507,6 +507,32 @@ def test_create_series_with_list_of_lists_index(): assert_series_equal(snow_series, native_series) +@sql_count_checker(query_count=1, join_count=2) +def test_create_series_with_index_data_and_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], + ["two", "one", "two", "one", "two", "one", "two", "one"], + ] + data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) + native_series = native_pd.Series(data, index=arrays) + snow_series = pd.Series(pd.Index(data), index=arrays) + assert_series_equal(snow_series, native_series) + + +@sql_count_checker(query_count=1, join_count=2) +def test_create_df_with_index_data_and_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], + ["two", "one", "two", "one", "two", "one", "two", "one"], + ] + data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) + native_df = native_pd.DataFrame(data, index=arrays) + snow_df = pd.DataFrame(pd.Index(data), index=arrays) + assert_frame_equal(snow_df, native_df) + + @sql_count_checker(query_count=1) def test_create_series_with_none_data_and_non_empty_index(): # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. From 10c395445575116af8a4de0c70430360c87b9c34 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 13 Sep 2024 17:45:01 -0700 Subject: [PATCH 34/42] try to move out common logic, add more tests --- .../snowpark/modin/pandas/dataframe.py | 284 ++++++------- .../snowpark/modin/plugin/_internal/utils.py | 66 +++ .../plugin/extensions/series_overrides.py | 106 ++--- .../test_df_series_creation_with_index.py | 381 +++++++++++++++++- 4 files changed, 637 insertions(+), 200 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py index f619732f16..28122944e1 100644 --- a/src/snowflake/snowpark/modin/pandas/dataframe.py +++ b/src/snowflake/snowpark/modin/pandas/dataframe.py @@ -89,8 +89,13 @@ raise_if_native_pandas_objects, replace_external_data_keys_with_empty_pandas_series, replace_external_data_keys_with_query_compiler, + try_convert_index_to_native, +) +from snowflake.snowpark.modin.plugin._internal.utils import ( + convert_index_to_list_of_qcs, + convert_index_to_qc, + is_repr_truncated, ) -from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -151,28 +156,40 @@ def __init__( # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. - from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin.extensions.index import Index self._siblings = [] - if isinstance(index, DataFrame): # pandas raises the same error - raise ValueError("Index data must be 1-dimensional") - + # 0. Setting the query compiler + # ----------------------------- if query_compiler is not None: - # CASE 1: query_compiler - # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. + # CASE I: query_compiler + # If a query_compiler is passed in only use the query_compiler field to create a new DataFrame. + assert ( + data is None + ), "Invalid DataFrame construction! Cannot pass both data and query_compiler." + assert ( + index is None + ), "Invalid DataFrame construction! Cannot pass both index and query_compiler." + assert ( + columns is None + ), "Invalid DataFrame construction! Cannot pass both columns and query_compiler." self._query_compiler = query_compiler return + if isinstance(index, DataFrame): # pandas raises the same error + raise ValueError("Index data must be 1-dimensional") + # The logic followed here is: # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. # 2. If an index is provided, set the index through set_index or reindex. # 3. If the data is a DataFrame, perform loc to select the required index and columns from the DataFrame. # 4. The resultant query_compiler is then set as the query_compiler for the DataFrame. + # 1. Setting the data (and columns) + # --------------------------------- if isinstance(data, Index): - # CASE 2: data is a Snowpark pandas Index + # CASE II: data is a Snowpark pandas Index # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the # correct format: the values are a data column, not an index column. if data.name is None: @@ -182,26 +199,23 @@ def __init__( query_compiler = data.to_frame(index=False, name=new_name)._query_compiler elif isinstance(data, Series): - # CASE 3: data is a Snowpark pandas Series + # CASE III: data is a Snowpark pandas Series query_compiler = data._query_compiler.copy() # We set the column name if it is not in the provided Series `data`. if data.name is None: - query_compiler = query_compiler.set_columns( - [0] if columns is None else columns - ) + query_compiler = query_compiler.set_columns(columns or [0]) if columns is not None and data.name not in columns: # If the columns provided are not in the named Series, pandas clears # the DataFrame and sets columns to the columns provided. query_compiler = from_pandas( - self.__constructor__(columns=columns) - )._query_compiler # pragma: no cover + pandas.DataFrame(columns=columns) + )._query_compiler elif isinstance(data, DataFrame): - # CASE 5: data is a Snowpark pandas DataFrame + # CASE IV: data is a Snowpark pandas DataFrame query_compiler = data._query_compiler.copy() - if columns is None and index is None: - # If the new DataFrame has the same columns and index as the original DataFrame, + # Special case IV.a: if the new DataFrame has the same columns and index as the original DataFrame, # the query compiler is shared and kept track of as a sibling. self._query_compiler = query_compiler data._add_sibling(self) @@ -218,147 +232,139 @@ def __init__( ) else: - # CASE 5: Non-Snowpark pandas data - dummy_index = None # used in a special dict case - if isinstance(data, pandas.Index): - # CASE 5.B: data is a pandas Index - pass - - elif is_list_like(data) and not is_dict_like(data): - # CASE 5.C: data is list-like - old_dtype = getattr(data, "dtype", None) - values = [ - obj._to_pandas() if isinstance(obj, Series) else obj for obj in data - ] - if isinstance(data, np.ndarray): - data = np.array(values, dtype=old_dtype) - else: - try: - data = type(data)(values, dtype=old_dtype) - except TypeError: - data = values - - elif is_dict_like(data) and not isinstance( - data, (pandas.Series, pandas.DataFrame) - ): - # CASE 5.D: data is dict-like - if columns is not None: - data = {key: value for key, value in data.items() if key in columns} - - if len(data) and all(isinstance(v, Series) for v in data.values()): - # Special case: data is a dictionary where all the values are Snowpark pandas Series - from .general import concat - - new_qc = concat( - data.values(), axis=1, keys=data.keys() - )._query_compiler - if dtype is not None: - new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) - if index is not None: - if isinstance(index, Index): # pragma: no cover - index = index.to_series()._query_compiler - elif isinstance(index, Series): # pragma: no cover - index = index._query_compiler - new_qc = new_qc.reindex(axis=0, labels=index) + # CASE V: Non-Snowpark pandas data + if not isinstance( + data, (pandas.Series, pandas.DataFrame, pandas.Index) + ) and is_list_like(data): + from .general import concat + + if is_dict_like(data): + # Setting up keys and values for processing if all the values are Snowpark pandas objects. if columns is not None: - new_qc = new_qc.reindex( - axis=1, labels=try_convert_index_to_native(columns) - ) - self._query_compiler = new_qc - return - - data = { - k: v._to_pandas() if isinstance(v, Series) else v - for k, v in data.items() - } - - if ( - all(not is_scalar(v) and len(v) == 1 for v in data.values()) - and index is not None - ): - # Special case: the values in the dictionary are all non-scalar objects of length 1 - # >>> DataFrame({"A": [1], "V": [2]}, native_pd.Index(["A", "B", "C"]), name="cake") - # A V - # cake - # A 1 2 - # B 1 2 <--- the first row is copied into the rest of the rows. - # C 1 2 - # Recreate a 2-d array with the first row copied into the rest of the rows. - self._query_compiler = DataFrame( - data=[[v[0] for v in data.values()]] * len(index), - index=index, - columns=list(data.keys()), - )._query_compiler - return - - if all(is_scalar(k) and is_scalar(v) for k, v in data.items()): - # Special case: All keys and values in the dict are all scalars, an index needs to be provided. - # pd.DataFrame({'a': 1, 'b': 2}, index=[0]) - dummy_index = index - - if not isinstance(index, (Index, type(self))): - dummy_index = index + # Reduce the dictionary to only the relevant columns as the keys. + data = { + key: value for key, value in data.items() if key in columns + } + + if len(data) and all( + isinstance(v, (Index, BasePandasDataset)) for v in data.values() + ): + # Special case V.a: data is a list/dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series where the index is the index to be set + # since these values are always present in the final DataFrame. + values = [ + Series(v, index=index) if isinstance(v, Index) else v + for v in data.values() + ] + new_qc = concat( + values, axis=1, keys=data.keys() + )._query_compiler + if dtype is not None: + new_qc = new_qc.astype( + {col: dtype for col in new_qc.columns} + ) + if index is not None: + new_qc = new_qc.reindex( + axis=0, labels=convert_index_to_qc(index) + ) + if columns is not None: + new_qc = new_qc.reindex( + axis=1, labels=try_convert_index_to_native(columns) + ) + self._query_compiler = new_qc + return + + # If only some data is a Snowpark pandas object, convert it to pandas objects. + res = {} + index = try_convert_index_to_native(index) + for k, v in data.items(): + if isinstance(v, (Index)): + res[k] = v.to_pandas() + elif isinstance(v, BasePandasDataset): + # Need to perform reindex on the Series or DataFrame objects since only the data + # whose index matches the given index is kept. + res[k] = v.to_pandas().reindex(index=index) + else: + res[k] = v + data = res + + else: # list-like but not dict-like data. + if len(data) and all( + isinstance(v, (Index, BasePandasDataset)) for v in data + ): + # Special case V.c: data is a list/dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series. + values = [ + Series(v) if isinstance(v, Index) else v for v in data + ] + new_qc = concat(values, axis=1).T._query_compiler + if dtype is not None: + new_qc = new_qc.astype( + {col: dtype for col in new_qc.columns} + ) + if index is not None: + new_qc = new_qc.set_index([convert_index_to_qc(index)]) + if columns is not None: + if all(isinstance(v, Index) for v in data): + # Special case: if all the values are Index objects, they are always present in the + # final result with the provided column names. Therefore, rename the columns. + new_qc = new_qc.set_columns( + try_convert_index_to_native(columns) + ) + else: + new_qc = new_qc.reindex(axis=1, labels=columns) + self._query_compiler = new_qc + return + + # If only some data is a Snowpark pandas object, convert it to pandas objects. + res = [] + for v in data: + if isinstance(v, (Index)): + res.append(v.to_pandas()) + elif isinstance(v, BasePandasDataset): + res.append(v.to_pandas()) + else: + # Need to convert this is a native pandas object since native pandas incorrectly + # tries to perform `get_indexer` on it. + res.append(pandas.Index(v if is_list_like(v) else [v])) + data = res + query_compiler = from_pandas( pandas.DataFrame( data=data, - index=dummy_index, + # Handle setting the index, if it is a lazy index, outside this block. + index=None if isinstance(index, (Index, Series)) else index, columns=try_convert_index_to_native(columns), dtype=dtype, copy=copy, ) )._query_compiler - # The index is already set if the data is a non-Snowpark pandas object. If either the data or the index is - # a Snowpark pandas object, set the index here. + # 2. Setting the index + # -------------------- + # The index is already set if the data is a non-Snowpark pandas object. + # If either the data or the index is a Snowpark pandas object, set the index here. if index is not None and ( isinstance(index, (Index, Series)) - or isinstance(data, (Index, Series, type(self))) + or isinstance(data, (Index, BasePandasDataset)) ): if isinstance(data, (type(self), Series, type(None))): # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. # If data is None and an index is provided, set the index. - labels = index - if isinstance(labels, Index): - labels = labels.to_series()._query_compiler - elif isinstance(labels, Series): - labels = labels._query_compiler # pragma: no cover - else: - labels = Index(labels).to_series()._query_compiler - query_compiler = query_compiler.reindex(axis=0, labels=labels) - + query_compiler = query_compiler.reindex( + axis=0, labels=convert_index_to_qc(index) + ) else: # Performing set index to directly set the index column (joining on row-position instead of index). - if isinstance(index, Series): - index_qc_list = [index._query_compiler] - elif isinstance(index, Index): - index_qc_list = [index.to_series()._query_compiler] - else: - if ( - not isinstance(index, pandas.MultiIndex) - and is_list_like(index) - and len(index) > 0 - and all( - (not isinstance(i, tuple) and is_list_like(i)) - for i in index - ) - ): - # If given a list of lists, convert it to a MultiIndex. - index = pandas.MultiIndex.from_arrays(index) - if isinstance(index, pandas.MultiIndex): - index_qc_list = [ - s._query_compiler - for s in [ - pd.Series(index.get_level_values(level)) - for level in range(index.nlevels) - ] - ] - else: - index_qc_list = [ - Series(index)._query_compiler - ] # pragma: no cover - query_compiler = query_compiler.set_index(index_qc_list) + query_compiler = query_compiler.set_index( + convert_index_to_list_of_qcs(index) + ) + # 3. If data is a DataFrame, filter result + # ---------------------------------------- if isinstance(data, DataFrame): # To select the required index and columns for the resultant DataFrame, # perform .loc[] on the created query compiler. @@ -370,6 +376,8 @@ def __init__( ._query_compiler ) + # 4. Setting the query compiler + # ----------------------------- self._query_compiler = query_compiler def __repr__(self): diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index 9f01954ab2..fdfc9b8da3 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -13,6 +13,7 @@ import pandas as native_pd from pandas._typing import Scalar from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar +from pandas.core.dtypes.inference import is_list_like import snowflake.snowpark.modin.pandas as pd import snowflake.snowpark.modin.plugin._internal.statement_params_constants as STATEMENT_PARAMS @@ -1995,3 +1996,68 @@ def create_frame_with_data_columns( def rindex(lst: list, value: int) -> int: """Find the last index in the list of item value.""" return len(lst) - lst[::-1].index(value) - 1 + + +def convert_index_to_qc(index: Any) -> Any: + """ + Method to convert an object representing an index into a query compiler for set_index or reindex. + + Parameters + ---------- + index: Any + The object to convert to a query compiler. + + Returns + ------- + SnowflakeQueryCompiler + The converted query compiler. + """ + from modin.pandas import Series + + from snowflake.snowpark.modin.plugin.extensions.index import Index + + if isinstance(index, Index): + idx_qc = index.to_series()._query_compiler + elif isinstance(index, Series): + idx_qc = index._query_compiler + else: + idx_qc = Series(index)._query_compiler + return idx_qc + + +def convert_index_to_list_of_qcs(index: Any) -> list: + """ + Method to convert an object representing an index into a list of query compilers for set_index. + + Parameters + ---------- + index: Any + The object to convert to a list of query compilers. + + Returns + ------- + list + The list of query compilers. + """ + from modin.pandas import Series + + from snowflake.snowpark.modin.plugin.extensions.index import Index + + if ( + not isinstance(index, (native_pd.MultiIndex, Series, Index)) + and is_list_like(index) + and len(index) > 0 + and all((is_list_like(i) and not isinstance(i, tuple)) for i in index) + ): + # If given a list of lists, convert it to a MultiIndex. + index = native_pd.MultiIndex.from_arrays(index) + if isinstance(index, native_pd.MultiIndex): + index_qc_list = [ + s._query_compiler + for s in [ + Series(index.get_level_values(level)) for level in range(index.nlevels) + ] + ] + else: + index_qc_list = [convert_index_to_qc(index)] + return index_qc_list diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 724b58838c..7707ed82f9 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -51,6 +51,10 @@ from snowflake.snowpark.modin import pandas as spd # noqa: F401 from snowflake.snowpark.modin.pandas.api.extensions import register_series_accessor from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar +from snowflake.snowpark.modin.plugin._internal.utils import ( + convert_index_to_list_of_qcs, + convert_index_to_qc, +) from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -374,50 +378,80 @@ def __init__( from snowflake.snowpark.modin.plugin.extensions.index import Index - if query_compiler: - # CASE 1: query_compiler + # 0. Setting the query compiler + # ----------------------------- + if query_compiler is not None: + # CASE I: query_compiler # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. + assert ( + data is None + ), "Invalid Series construction! Cannot pass both data and query_compiler." + assert ( + index is None + ), "Invalid Series construction! Cannot pass both index and query_compiler." self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name return + if isinstance(index, spd.DataFrame): # pandas raises the same error + raise ValueError("Index data must be 1-dimensional") + + if isinstance(data, spd.DataFrame): + # pandas raises an ambiguous error: + # ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). + raise ValueError("Data cannot be a DataFrame") + # The logic followed here is: # 1. Create a query_compiler from the provided data. # 2. If an index is provided, set the index. This is either through set_index or reindex. # 3. The resultant query_compiler is columnarized and set as the query_compiler for the Series. # 4. If a name is provided, set the name. + # 1. Setting the data + # ------------------- if isinstance(data, Index): - # CASE 2: Index + # CASE II: Index # If the data is an Index object, convert it to a Series, and get the query_compiler. query_compiler = ( data.to_series(index=None, name=name).reset_index(drop=True)._query_compiler ) elif isinstance(data, type(self)): - # CASE 3: Series + # CASE III: Series # If the data is a Series object, copy the query_compiler. query_compiler = data._query_compiler.copy() else: - # CASE 4: Non-Snowpark pandas data + # CASE IV: Non-Snowpark pandas data # If the data is not a Snowpark pandas object, convert it to a query compiler. - name = MODIN_UNNAMED_SERIES_LABEL if name is None else name - dummy_index = None - if not isinstance(index, (Index, type(self))): - dummy_index = index + name = name or MODIN_UNNAMED_SERIES_LABEL if ( isinstance(data, (native_pd.Series, native_pd.Index)) and data.name is not None ): name = data.name + # If any of the values are Snowpark pandas objects, convert them to native pandas objects. + if not isinstance( + data, (native_pd.DataFrame, native_pd.Series, native_pd.Index) + ) and is_list_like(data): + if is_dict_like(data): + data = { + k: v.to_list() if isinstance(v, (Index, BasePandasDataset)) else v + for k, v in data.items() + } + else: + data = [ + v.to_list() if isinstance(v, (Index, BasePandasDataset)) else v + for v in data + ] query_compiler = from_pandas( native_pd.DataFrame( native_pd.Series( data=data, dtype=dtype, - index=dummy_index, + # Handle setting the index, if it is a lazy index, outside this block. + index=None if isinstance(index, (Index, Series)) else index, name=name, copy=copy, fastpath=fastpath, @@ -425,8 +459,10 @@ def __init__( ) )._query_compiler - # The index is already set if the data is a non-Snowpark pandas object. If either the data or the index is - # a Snowpark pandas object, set the index here. + # 2. Setting the index + # -------------------- + # The index is already set if the data is a non-Snowpark pandas object. + # If either the data or the index is a Snowpark pandas object, set the index here. if index is not None and ( isinstance(index, (Index, type(self))) or isinstance(data, (Index, type(self))) ): @@ -435,45 +471,17 @@ def __init__( # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. # If data is None and an index is provided, all the values in the Series will be NaN and the index # will be the provided index. - labels = index - if isinstance(labels, Index): - labels = labels.to_series()._query_compiler - elif isinstance(labels, Series): # pragma: no cover - labels = labels._query_compiler - else: # pragma: no cover - labels = Index(labels).to_series()._query_compiler - query_compiler = query_compiler.reindex(axis=0, labels=labels) - + query_compiler = query_compiler.reindex( + axis=0, labels=convert_index_to_qc(index) + ) else: # Performing set index to directly set the index column (joining on row-position instead of index). - if isinstance(index, Series): - index_qc_list = [index._query_compiler] - elif isinstance(index, Index): - index_qc_list = [index.to_series()._query_compiler] - else: - if ( - not isinstance(index, native_pd.MultiIndex) - and is_list_like(index) - and len(index) > 0 - and all( - (not isinstance(i, tuple) and is_list_like(i)) for i in index - ) - ): - # If given a list of lists, convert it to a MultiIndex. - index = native_pd.MultiIndex.from_arrays(index) - if isinstance(index, native_pd.MultiIndex): - index_qc_list = [ - s._query_compiler - for s in [ - pd.Series(index.get_level_values(level)) - for level in range(index.nlevels) - ] - ] - else: - index_qc_list = [Series(index)._query_compiler] # pragma: no cover - query_compiler = query_compiler.set_index(index_qc_list) - - # Set the query compiler and name fields. + query_compiler = query_compiler.set_index( + convert_index_to_list_of_qcs(index) + ) + + # 3 and 4. Setting the query compiler and name + # -------------------------------------------- self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 920523ff3d..e337e53d4b 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -220,6 +220,7 @@ def test_create_with_series_as_data_and_index_as_index( assert_equal_func( snow_obj(data=snow_series, index=snow_index), native_obj(data=native_series, index=native_index), + check_dtype=False, ) @@ -440,8 +441,9 @@ def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( ) snow_index = pd.Index(native_index) qc = 1 if column_type == "list" else 2 + qc += 1 if (isinstance(native_df, dict)) else 0 qc += 1 if (isinstance(native_df, dict) and column_type == "index") else 0 - jc = 2 if isinstance(native_df, native_pd.DataFrame) else 1 + jc = 2 if isinstance(native_df, native_pd.DataFrame) else 0 with SqlCounter(query_count=qc, join_count=jc): assert_frame_equal( pd.DataFrame(snow_df, index=snow_index, columns=native_columns), @@ -464,18 +466,7 @@ def test_create_df_with_new_columns(): ) -@sql_count_checker(query_count=0) -def test_create_df_with_df_index_negative(): - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - pd.DataFrame([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) - with pytest.raises( - ValueError, - match=re.escape("Shape of passed values is (3, 1), indices imply (2, 1)"), - ): - native_pd.DataFrame([1, 2, 3], index=[[1, 2], [3, 4], [5, 6]]) - - -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2) def test_create_df_with_dict_as_data_and_index_as_index(): """ Special case when creating: @@ -540,3 +531,367 @@ def test_create_series_with_none_data_and_non_empty_index(): native_series = native_pd.Series(None, index=index, dtype=object) snow_series = pd.Series(None, index=index, dtype=object) assert_series_equal(snow_series, native_series) + + +@pytest.mark.parametrize( + "data1, data2", [("series", "series"), ("series", "index"), ("index", "index")] +) +def test_create_df_with_series_index_dict_data(data1, data2): + # Create the dict data. + native_data1 = ( + native_pd.Series([1, 2, 3]) if data1 == "series" else native_pd.Index([1, 2, 3]) + ) + native_data2 = ( + native_pd.Series([4, 5, 6]) if data2 == "series" else native_pd.Index([4, 5, 6]) + ) + snow_data1 = pd.Series([1, 2, 3]) if data1 == "series" else pd.Index([1, 2, 3]) + snow_data2 = pd.Series([4, 5, 6]) if data2 == "series" else pd.Index([4, 5, 6]) + native_data = {"A": native_data1, "B": native_data2} + snow_data = {"A": snow_data1, "B": snow_data2} + + # Create DataFrame only with dict data. + native_df = native_pd.DataFrame(native_data) + snow_df = pd.DataFrame(snow_data) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Series index. + native_ser_index = native_pd.Series([9, 2, 999]) + snow_ser_index = pd.Series([9, 2, 999]) + native_df = native_pd.DataFrame(native_data, index=native_ser_index) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Index index. + native_index = native_pd.Index([9, 2, 999]) + snow_index = pd.Index([9, 2, 999]) + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Series index, and columns. + columns = ["A", "B", "C"] + native_df = native_pd.DataFrame( + native_data, index=native_ser_index, columns=columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Index index, and Index columns. + native_columns = native_pd.Index(columns) + snow_columns = pd.Index(columns) + native_df = native_pd.DataFrame( + native_data, index=native_index, columns=native_columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + +@pytest.mark.parametrize( + "data1, data2", [("series", "series"), ("series", "index"), ("index", "index")] +) +def test_create_df_with_series_index_list_data(data1, data2): + # Create the list data. + native_data1 = ( + native_pd.Series([11, 22, 33]) + if data1 == "series" + else native_pd.Index([11, 22, 33]) + ) + native_data2 = ( + native_pd.Series([44, 55, 66]) + if data2 == "series" + else native_pd.Index([44, 55, 66]) + ) + snow_data1 = ( + pd.Series([11, 22, 33]) if data1 == "series" else pd.Index([11, 22, 33]) + ) + snow_data2 = ( + pd.Series([44, 55, 66]) if data2 == "series" else pd.Index([44, 55, 66]) + ) + native_data = [native_data1, native_data2] + snow_data = [snow_data1, snow_data2] + + # Create DataFrame only with list data. + native_df = native_pd.DataFrame(native_data) + snow_df = pd.DataFrame(snow_data) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with list data and Series index. + native_ser_index = native_pd.Series([2, 11]) + snow_ser_index = pd.Series([2, 11]) + native_df = native_pd.DataFrame(native_data, index=native_ser_index) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data and Index index. + native_index = native_pd.Index([22, 11]) + snow_index = pd.Index([22, 11]) + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data, Series index, and columns. + columns = ["A", "B", "C"] + native_df = native_pd.DataFrame( + native_data, index=native_ser_index, columns=columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data, Index index, and Index columns. + native_columns = native_pd.Index(columns) + snow_columns = pd.Index(columns) + native_df = native_pd.DataFrame( + native_data, index=native_index, columns=native_columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + +@pytest.mark.parametrize( + "data1, data2", [("series", "series"), ("series", "index"), ("index", "index")] +) +def test_create_series_with_series_index_list_data(data1, data2): + # Create the list data. + native_data1 = ( + native_pd.Series([11, 22, 33]) + if data1 == "series" + else native_pd.Index([11, 22, 33]) + ) + native_data2 = ( + native_pd.Series([44, 55, 66]) + if data2 == "series" + else native_pd.Index([44, 55, 66]) + ) + snow_data1 = ( + pd.Series([11, 22, 33]) if data1 == "series" else pd.Index([11, 22, 33]) + ) + snow_data2 = ( + pd.Series([44, 55, 66]) if data2 == "series" else pd.Index([44, 55, 66]) + ) + native_data = [native_data1, native_data2] + snow_data = [snow_data1, snow_data2] + + # Create Series only with list data. + native_df = native_pd.Series(native_data) + snow_df = pd.Series(snow_data) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df) + + # Create Series with list data and Series index. + native_ser_index = native_pd.Series([2, 11]) + snow_ser_index = pd.Series([2, 11]) + native_df = native_pd.Series(native_data, index=native_ser_index) + snow_df = pd.Series(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df, check_dtype=False) + + # Create Series with list data and Index index. + native_index = native_pd.Index([22, 11]) + snow_index = pd.Index([22, 11]) + native_df = native_pd.Series(native_data, index=native_index) + snow_df = pd.Series(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df, check_dtype=False) + + +@pytest.mark.parametrize( + "data1, data2", [("series", "series"), ("series", "index"), ("index", "index")] +) +def test_create_series_with_series_index_dict_data(data1, data2): + # Create the dict data. + native_data1 = ( + native_pd.Series([1, 2, 3]) if data1 == "series" else native_pd.Index([1, 2, 3]) + ) + native_data2 = ( + native_pd.Series([4, 5, 6]) if data2 == "series" else native_pd.Index([4, 5, 6]) + ) + snow_data1 = pd.Series([1, 2, 3]) if data1 == "series" else pd.Index([1, 2, 3]) + snow_data2 = pd.Series([4, 5, 6]) if data2 == "series" else pd.Index([4, 5, 6]) + native_data = {11: native_data1, 22: native_data2} + snow_data = {11: snow_data1, 22: snow_data2} + + # Create DataFrame only with dict data. + native_df = native_pd.Series(native_data) + snow_df = pd.Series(snow_data) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df) + + # Create DataFrame with dict data and Series index. + native_ser_index = native_pd.Series([9, 2, 999]) + snow_ser_index = pd.Series([9, 2, 999]) + native_df = native_pd.Series(native_data, index=native_ser_index) + snow_df = pd.Series(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df) + + # Create DataFrame with dict data and Index index. + native_index = native_pd.Index([9, 2, 999]) + snow_index = pd.Index([9, 2, 999]) + native_df = native_pd.Series(native_data, index=native_index) + snow_df = pd.Series(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_series_equal(snow_df, native_df) + + +def test_create_df_with_mixed_series_index_dict_data(): + # Create the dict data. + native_data1 = native_pd.Series([1, 2, 3]) + native_data2 = native_pd.Index([4, 5, 6]) + data3 = [7, 8, 9] + snow_data1 = pd.Series([1, 2, 3]) + snow_data2 = pd.Index([4, 5, 6]) + native_data = {"A": native_data1, "B": native_data2, "C": data3} + snow_data = {"A": snow_data1, "B": snow_data2, "C": data3} + + # Create DataFrame only with dict data. + native_df = native_pd.DataFrame(native_data) + snow_df = pd.DataFrame(snow_data) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Series index. + native_ser_index = native_pd.Series([9, 2, 999]) + snow_ser_index = pd.Series([9, 2, 999]) + native_df = native_pd.DataFrame(native_data, index=native_ser_index) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Index index. + native_index = native_pd.Index([9, 2, 999]) + snow_index = pd.Index([9, 2, 999]) + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Series index, and columns. + columns = ["A", "B", "C"] + native_df = native_pd.DataFrame( + native_data, index=native_ser_index, columns=columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Index index, and Index columns. + native_columns = native_pd.Index(columns) + snow_columns = pd.Index(columns) + native_df = native_pd.DataFrame( + native_data, index=native_index, columns=native_columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + +def test_create_df_with_mixed_series_index_list_data(): + # Create the list data. + native_data1 = native_pd.Series([1, 2, 3]) + native_data2 = native_pd.Index([4, 5, 6]) + data3 = [7, 8, 9] + snow_data1 = pd.Series([1, 2, 3]) + snow_data2 = pd.Index([4, 5, 6]) + # Need to convert data3 to an Index since native pandas tries to perform `get_indexer` on it. + native_data = [native_data1, native_data2, native_pd.Index(data3)] + snow_data = [snow_data1, snow_data2, data3] + + # Create DataFrame only with list data. + native_df = native_pd.DataFrame(native_data) + snow_df = pd.DataFrame(snow_data) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with list data and Series index. + native_ser_index = native_pd.Series([2, 11, 0]) + snow_ser_index = pd.Series([2, 11, 0]) + native_df = native_pd.DataFrame(native_data, index=native_ser_index) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data and Index index. + native_index = native_pd.Index([22, 11, 0]) + snow_index = pd.Index([22, 11, 0]) + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data, Series index, and columns. + columns = ["A", "B", "C"] + native_df = native_pd.DataFrame( + native_data, index=native_ser_index, columns=columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + # Create DataFrame with list data, Index index, and Index columns. + native_columns = native_pd.Index(columns) + snow_columns = pd.Index(columns) + native_df = native_pd.DataFrame( + native_data, index=native_index, columns=native_columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df, check_dtype=False) + + +@pytest.mark.xfail( + reason="SNOW-1638397 DataFrane creation fails: reindex does not work with string index" +) +def test_create_df_with_series_data_and_series_index(): + # Create the data and index. + native_data = native_pd.Series([1, 2, 3]) + native_index = native_pd.Series(["A", 0, "C"]) + snow_data = pd.Series(native_data) + snow_index = pd.Series(native_index) + + # Create DataFrame with Series data and Series index. + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=0) +def test_create_df_with_df_index_negative(): + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + native_pd.DataFrame( + [1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]]) + ) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.DataFrame([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=0) +def test_create_series_with_df_index_negative(): + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + native_pd.Series([1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.Series([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=0) +def test_create_series_with_df_data_negative(): + with pytest.raises( + ValueError, + match=re.escape( + "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool()" + ", a.item(), a.any() or a.all()." + ), + ): + native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) From da56734f3a7b14cb8d6d26c1f20882d69a21e147 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 13 Sep 2024 18:09:11 -0700 Subject: [PATCH 35/42] update df init --- .../plugin/extensions/dataframe_overrides.py | 295 ++++++++++++------ 1 file changed, 206 insertions(+), 89 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 62c9cab4dc..a37b252e20 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -74,17 +74,21 @@ ) from snowflake.snowpark.modin.pandas.utils import ( create_empty_native_pandas_frame, - from_non_pandas, from_pandas, is_scalar, raise_if_native_pandas_objects, replace_external_data_keys_with_empty_pandas_series, replace_external_data_keys_with_query_compiler, + try_convert_index_to_native, ) from snowflake.snowpark.modin.plugin._internal.aggregation_utils import ( is_snowflake_agg_func, ) -from snowflake.snowpark.modin.plugin._internal.utils import is_repr_truncated +from snowflake.snowpark.modin.plugin._internal.utils import ( + convert_index_to_list_of_qcs, + convert_index_to_qc, + is_repr_truncated, +) from snowflake.snowpark.modin.plugin._typing import ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, @@ -459,104 +463,217 @@ def __init__( # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. - from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native + from snowflake.snowpark.modin.plugin.extensions.index import Index self._siblings = [] - # Engine.subscribe(_update_engine) - if isinstance(data, (DataFrame, Series)): - self._query_compiler = data._query_compiler.copy() - if index is not None and any(i not in data.index for i in index): - ErrorMessage.not_implemented( - "Passing non-existant columns or index values to constructor not" - + " yet implemented." - ) # pragma: no cover - if isinstance(data, Series): - # We set the column name if it is not in the provided Series - if data.name is None: - self.columns = [0] if columns is None else columns + # 0. Setting the query compiler + # ----------------------------- + if query_compiler is not None: + # CASE I: query_compiler + # If a query_compiler is passed in only use the query_compiler field to create a new DataFrame. + assert ( + data is None + ), "Invalid DataFrame construction! Cannot pass both data and query_compiler." + assert ( + index is None + ), "Invalid DataFrame construction! Cannot pass both index and query_compiler." + assert ( + columns is None + ), "Invalid DataFrame construction! Cannot pass both columns and query_compiler." + self._query_compiler = query_compiler + return + + if isinstance(index, DataFrame): # pandas raises the same error + raise ValueError("Index data must be 1-dimensional") + + # The logic followed here is: + # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. + # 2. If an index is provided, set the index through set_index or reindex. + # 3. If the data is a DataFrame, perform loc to select the required index and columns from the DataFrame. + # 4. The resultant query_compiler is then set as the query_compiler for the DataFrame. + + # 1. Setting the data (and columns) + # --------------------------------- + if isinstance(data, Index): + # CASE II: data is a Snowpark pandas Index + # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the + # correct format: the values are a data column, not an index column. + if data.name is None: + new_name = 0 if columns is None else columns[0] + else: + new_name = data.name + query_compiler = data.to_frame(index=False, name=new_name)._query_compiler + + elif isinstance(data, Series): + # CASE III: data is a Snowpark pandas Series + query_compiler = data._query_compiler.copy() + # We set the column name if it is not in the provided Series `data`. + if data.name is None: + query_compiler = query_compiler.set_columns(columns or [0]) + if columns is not None and data.name not in columns: # If the columns provided are not in the named Series, pandas clears # the DataFrame and sets columns to the columns provided. - elif columns is not None and data.name not in columns: - self._query_compiler = from_pandas( - self.__constructor__(columns=columns) - )._query_compiler - if index is not None: - self._query_compiler = data.loc[index]._query_compiler - elif columns is None and index is None: + query_compiler = from_pandas( + native_pd.DataFrame(columns=columns) + )._query_compiler + + elif isinstance(data, DataFrame): + # CASE IV: data is a Snowpark pandas DataFrame + query_compiler = data._query_compiler.copy() + if columns is None and index is None: + # Special case IV.a: if the new DataFrame has the same columns and index as the original DataFrame, + # the query compiler is shared and kept track of as a sibling. + self._query_compiler = query_compiler data._add_sibling(self) - else: - if columns is not None and any(i not in data.columns for i in columns): - ErrorMessage.not_implemented( - "Passing non-existant columns or index values to constructor not" - + " yet implemented." - ) # pragma: no cover - if index is None: - index = slice(None) - if columns is None: - columns = slice(None) - self._query_compiler = data.loc[index, columns]._query_compiler - - # Check type of data and use appropriate constructor - elif query_compiler is None: - distributed_frame = from_non_pandas(data, index, columns, dtype) - if distributed_frame is not None: - self._query_compiler = distributed_frame._query_compiler return + # The `columns` parameter is used to select the columns from `data` that will be in the resultant + # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a + # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. + if data.columns is not None and columns is not None: + extra_columns = [col for col in columns if col not in data.columns] + else: + extra_columns = [] + query_compiler = data._query_compiler.create_qc_with_extra_columns( + extra_columns + ) - if isinstance(data, native_pd.Index): - pass - elif is_list_like(data) and not is_dict_like(data): - old_dtype = getattr(data, "dtype", None) - values = [ - obj._to_pandas() if isinstance(obj, Series) else obj for obj in data - ] - if isinstance(data, np.ndarray): - data = np.array(values, dtype=old_dtype) - else: - try: - data = type(data)(values, dtype=old_dtype) - except TypeError: - data = values - elif is_dict_like(data) and not isinstance( - data, (native_pd.Series, Series, native_pd.DataFrame, DataFrame) - ): - if columns is not None: - data = {key: value for key, value in data.items() if key in columns} - - if len(data) and all(isinstance(v, Series) for v in data.values()): - from modin.pandas import concat - - new_qc = concat(data.values(), axis=1, keys=data.keys())._query_compiler - - if dtype is not None: - new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) - if index is not None: - new_qc = new_qc.reindex( - axis=0, labels=try_convert_index_to_native(index) - ) + else: + # CASE V: Non-Snowpark pandas data + if not isinstance( + data, (native_pd.Series, native_pd.DataFrame, native_pd.Index) + ) and is_list_like(data): + from snowflake.snowpark.modin.pandas import concat + + if is_dict_like(data): + # Setting up keys and values for processing if all the values are Snowpark pandas objects. if columns is not None: - new_qc = new_qc.reindex( - axis=1, labels=try_convert_index_to_native(columns) - ) + # Reduce the dictionary to only the relevant columns as the keys. + data = {key: value for key, value in data.items() if key in columns} + + if len(data) and all( + isinstance(v, (Index, BasePandasDataset)) for v in data.values() + ): + # Special case V.a: data is a list/dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series where the index is the index to be set + # since these values are always present in the final DataFrame. + values = [ + Series(v, index=index) if isinstance(v, Index) else v + for v in data.values() + ] + new_qc = concat(values, axis=1, keys=data.keys())._query_compiler + if dtype is not None: + new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) + if index is not None: + new_qc = new_qc.reindex( + axis=0, labels=convert_index_to_qc(index) + ) + if columns is not None: + new_qc = new_qc.reindex( + axis=1, labels=try_convert_index_to_native(columns) + ) + self._query_compiler = new_qc + return + + # If only some data is a Snowpark pandas object, convert it to pandas objects. + res = {} + index = try_convert_index_to_native(index) + for k, v in data.items(): + if isinstance(v, (Index)): + res[k] = v.to_pandas() + elif isinstance(v, BasePandasDataset): + # Need to perform reindex on the Series or DataFrame objects since only the data + # whose index matches the given index is kept. + res[k] = v.to_pandas().reindex(index=index) + else: + res[k] = v + data = res + + else: # list-like but not dict-like data. + if len(data) and all( + isinstance(v, (Index, BasePandasDataset)) for v in data + ): + # Special case V.c: data is a list/dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series. + values = [Series(v) if isinstance(v, Index) else v for v in data] + new_qc = concat(values, axis=1).T._query_compiler + if dtype is not None: + new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) + if index is not None: + new_qc = new_qc.set_index([convert_index_to_qc(index)]) + if columns is not None: + if all(isinstance(v, Index) for v in data): + # Special case: if all the values are Index objects, they are always present in the + # final result with the provided column names. Therefore, rename the columns. + new_qc = new_qc.set_columns( + try_convert_index_to_native(columns) + ) + else: + new_qc = new_qc.reindex(axis=1, labels=columns) + self._query_compiler = new_qc + return + + # If only some data is a Snowpark pandas object, convert it to pandas objects. + res = [] + for v in data: + if isinstance(v, (Index)): + res.append(v.to_pandas()) + elif isinstance(v, BasePandasDataset): + res.append(v.to_pandas()) + else: + # Need to convert this is a native pandas object since native pandas incorrectly + # tries to perform `get_indexer` on it. + res.append(native_pd.Index(v if is_list_like(v) else [v])) + data = res + + query_compiler = from_pandas( + native_pd.DataFrame( + data=data, + # Handle setting the index, if it is a lazy index, outside this block. + index=None if isinstance(index, (Index, Series)) else index, + columns=try_convert_index_to_native(columns), + dtype=dtype, + copy=copy, + ) + )._query_compiler + + # 2. Setting the index + # -------------------- + # The index is already set if the data is a non-Snowpark pandas object. + # If either the data or the index is a Snowpark pandas object, set the index here. + if index is not None and ( + isinstance(index, (Index, Series)) + or isinstance(data, (Index, BasePandasDataset)) + ): + if isinstance(data, (type(self), Series, type(None))): + # The `index` parameter is used to select the rows from `data` that will be in the resultant DataFrame. + # If a value in `index` is not present in `data`'s index, it will be filled with a NaN value. + # If data is None and an index is provided, set the index. + query_compiler = query_compiler.reindex( + axis=0, labels=convert_index_to_qc(index) + ) + else: + # Performing set index to directly set the index column (joining on row-position instead of index). + query_compiler = query_compiler.set_index( + convert_index_to_list_of_qcs(index) + ) - self._query_compiler = new_qc - return - - data = { - k: v._to_pandas() if isinstance(v, Series) else v - for k, v in data.items() - } - pandas_df = native_pd.DataFrame( - data=try_convert_index_to_native(data), - index=try_convert_index_to_native(index), - columns=try_convert_index_to_native(columns), - dtype=dtype, - copy=copy, + # 3. If data is a DataFrame, filter result + # ---------------------------------------- + if isinstance(data, DataFrame): + # To select the required index and columns for the resultant DataFrame, + # perform .loc[] on the created query compiler. + index = slice(None) if index is None else index + columns = slice(None) if columns is None else columns + query_compiler = ( + DataFrame(query_compiler=query_compiler).loc[index, columns]._query_compiler ) - self._query_compiler = from_pandas(pandas_df)._query_compiler - else: - self._query_compiler = query_compiler + + # 4. Setting the query compiler + # ----------------------------- + self._query_compiler = query_compiler @register_dataframe_accessor("__dataframe__") From 8b47e174092d2f623e5ade75e774fb4bb02055fd Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 16 Sep 2024 15:46:29 -0700 Subject: [PATCH 36/42] moved common logic out, fixed some tests --- .../snowpark/modin/plugin/_internal/utils.py | 23 +++++ .../plugin/extensions/dataframe_overrides.py | 40 ++++---- .../plugin/extensions/series_overrides.py | 17 ++-- tests/integ/modin/frame/test_dtypes.py | 11 ++- tests/integ/modin/frame/test_idxmax_idxmin.py | 2 +- tests/integ/modin/frame/test_insert.py | 6 +- tests/integ/modin/frame/test_loc.py | 16 ++-- tests/integ/modin/frame/test_mask.py | 7 +- tests/integ/modin/frame/test_merge.py | 96 +++++++++---------- tests/integ/modin/frame/test_reindex.py | 6 +- tests/integ/modin/frame/test_rename.py | 2 +- tests/integ/modin/frame/test_setitem.py | 2 +- tests/integ/modin/frame/test_to_snowflake.py | 6 +- tests/integ/modin/frame/test_where.py | 7 +- .../test_df_series_creation_with_index.py | 67 ++++--------- .../modin/pivot/test_pivot_table_single.py | 2 +- tests/integ/modin/resample/test_resample.py | 26 ++--- .../modin/resample/test_resample_asfreq.py | 4 +- .../modin/resample/test_resample_fillna.py | 12 +-- tests/integ/modin/series/test_loc.py | 6 +- tests/integ/modin/series/test_sort_values.py | 2 +- tests/integ/modin/test_concat.py | 2 +- 22 files changed, 175 insertions(+), 187 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index 178111b0f3..1e53ee34f9 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -12,6 +12,7 @@ import numpy as np import pandas as native_pd from pandas._typing import Scalar +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar from pandas.core.dtypes.inference import is_list_like @@ -1998,6 +1999,28 @@ def rindex(lst: list, value: int) -> int: return len(lst) - lst[::-1].index(value) - 1 +def error_checking_for_init( + index: Any, dtype: Union[str, np.dtype, ExtensionDtype] +) -> None: + """ + Common error messages for the Series and DataFrame constructors. + + Parameters + ---------- + index: Any + The index to check. + dtype: str, numpy.dtype, or ExtensionDtype + The dtype to check. + """ + from modin.pandas import DataFrame + + if isinstance(index, DataFrame): # pandas raises the same error + raise ValueError("Index data must be 1-dimensional") + + if dtype == "category": + raise NotImplementedError("pandas type category is not implemented") + + def convert_index_to_qc(index: Any) -> Any: """ Method to convert an object representing an index into a query compiler for set_index or reindex. diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index a37b252e20..890c486bb0 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -87,6 +87,7 @@ from snowflake.snowpark.modin.plugin._internal.utils import ( convert_index_to_list_of_qcs, convert_index_to_qc, + error_checking_for_init, is_repr_truncated, ) from snowflake.snowpark.modin.plugin._typing import ListLike @@ -484,8 +485,7 @@ def __init__( self._query_compiler = query_compiler return - if isinstance(index, DataFrame): # pandas raises the same error - raise ValueError("Index data must be 1-dimensional") + error_checking_for_init(index, dtype) # The logic followed here is: # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. @@ -500,6 +500,7 @@ def __init__( # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the # correct format: the values are a data column, not an index column. if data.name is None: + # If no name is provided, the default name is 0. new_name = 0 if columns is None else columns[0] else: new_name = data.name @@ -510,6 +511,7 @@ def __init__( query_compiler = data._query_compiler.copy() # We set the column name if it is not in the provided Series `data`. if data.name is None: + # If no name is provided, the default name is 0. query_compiler = query_compiler.set_columns(columns or [0]) if columns is not None and data.name not in columns: # If the columns provided are not in the named Series, pandas clears @@ -607,9 +609,7 @@ def __init__( if all(isinstance(v, Index) for v in data): # Special case: if all the values are Index objects, they are always present in the # final result with the provided column names. Therefore, rename the columns. - new_qc = new_qc.set_columns( - try_convert_index_to_native(columns) - ) + new_qc = new_qc.set_columns(columns) else: new_qc = new_qc.reindex(axis=1, labels=columns) self._query_compiler = new_qc @@ -618,14 +618,16 @@ def __init__( # If only some data is a Snowpark pandas object, convert it to pandas objects. res = [] for v in data: - if isinstance(v, (Index)): - res.append(v.to_pandas()) - elif isinstance(v, BasePandasDataset): + if isinstance(v, (Index, BasePandasDataset)): res.append(v.to_pandas()) + # elif is_dict_like(v) or isinstance(v, (native_pd.Series, native_pd.DataFrame, native_pd.Index)): + # res.append(v) else: - # Need to convert this is a native pandas object since native pandas incorrectly - # tries to perform `get_indexer` on it. - res.append(native_pd.Index(v if is_list_like(v) else [v])) + # # Need to convert this is a native pandas object since native pandas incorrectly + # # tries to perform `get_indexer` on it. Specify dtype=object so that pandas does not + # # cast the data provided. In some cases, None turns to NaN, which is not desired. + # res.append(native_pd.Index(v, dtype=object) if is_list_like(v) else v) + res.append(v) data = res query_compiler = from_pandas( @@ -662,13 +664,14 @@ def __init__( # 3. If data is a DataFrame, filter result # ---------------------------------------- - if isinstance(data, DataFrame): - # To select the required index and columns for the resultant DataFrame, - # perform .loc[] on the created query compiler. - index = slice(None) if index is None else index - columns = slice(None) if columns is None else columns + if isinstance(data, DataFrame) and columns is not None: + # To select the columns for the resultant DataFrame, perform .loc[] on the created query compiler. + # This step is performed to ensure that the right columns are picked from the InternalFrame since we + # never explicitly drop the unwanted columns. query_compiler = ( - DataFrame(query_compiler=query_compiler).loc[index, columns]._query_compiler + DataFrame(query_compiler=query_compiler) + .loc[slice(None), columns] + ._query_compiler ) # 4. Setting the query compiler @@ -1181,6 +1184,9 @@ def insert( # Dictionary keys are treated as index column and this should be joined with # index of target dataframe. This behavior is similar to 'value' being DataFrame # or Series, so we simply create Series from dict data here. + if isinstance(value, set): + raise TypeError(f"'{type(value).__name__}' type is unordered") + if isinstance(value, dict): value = Series(value, name=column) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 1719a9d167..bb3bb612b5 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -45,6 +45,7 @@ from snowflake.snowpark.modin.plugin._internal.utils import ( convert_index_to_list_of_qcs, convert_index_to_qc, + error_checking_for_init, ) from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike from snowflake.snowpark.modin.plugin.utils.error_message import ( @@ -367,8 +368,7 @@ def __init__( self.name = name return - if isinstance(index, spd.DataFrame): # pandas raises the same error - raise ValueError("Index data must be 1-dimensional") + error_checking_for_init(index, dtype) if isinstance(data, spd.DataFrame): # pandas raises an ambiguous error: @@ -398,11 +398,12 @@ def __init__( else: # CASE IV: Non-Snowpark pandas data # If the data is not a Snowpark pandas object, convert it to a query compiler. + # The query compiler uses the '__reduced__' name internally as a column name to represent pandas + # Series objects that are not explicitly assigned a name. + # This helps to distinguish between an N-element Series and 1xN DataFrame. name = name or MODIN_UNNAMED_SERIES_LABEL - if ( - isinstance(data, (native_pd.Series, native_pd.Index)) - and data.name is not None - ): + if hasattr(data, "name") and data.name is not None: + # If data is an object that has a name field, use that as the name of the new Series. name = data.name # If any of the values are Snowpark pandas objects, convert them to native pandas objects. if not isinstance( @@ -422,9 +423,9 @@ def __init__( native_pd.DataFrame( native_pd.Series( data=data, - dtype=dtype, - # Handle setting the index, if it is a lazy index, outside this block. + # If the index is a lazy index, handle setting it outside this block. index=None if isinstance(index, (Index, Series)) else index, + dtype=dtype, name=name, copy=copy, fastpath=fastpath, diff --git a/tests/integ/modin/frame/test_dtypes.py b/tests/integ/modin/frame/test_dtypes.py index b078b31f6c..d4cff60164 100644 --- a/tests/integ/modin/frame/test_dtypes.py +++ b/tests/integ/modin/frame/test_dtypes.py @@ -473,22 +473,23 @@ def test_empty_index(index, expected_index_dtype): @pytest.mark.parametrize( - "input_data, type_msg", + "input_data, dtype, type_msg", [ - (native_pd.Categorical([1, 2, 3, 1, 2, 3]), "category"), - (native_pd.Categorical(["a", "b", "c", "a", "b", "c"]), "category"), + (native_pd.Categorical([1, 2, 3, 1, 2, 3]), "category", "category"), + (native_pd.Categorical(["a", "b", "c", "a", "b", "c"]), "category", "category"), ( native_pd.period_range("2015-02-03 11:22:33.4567", periods=5, freq="s"), + None, r"period\[s\]", ), ], ) @sql_count_checker(query_count=0) -def test_unsupported_dtype_raises(input_data, type_msg) -> None: +def test_unsupported_dtype_raises(input_data, dtype, type_msg) -> None: with pytest.raises( NotImplementedError, match=f"pandas type {type_msg} is not implemented" ): - pd.Series(input_data) + pd.Series(input_data, dtype=dtype) @pytest.mark.parametrize( diff --git a/tests/integ/modin/frame/test_idxmax_idxmin.py b/tests/integ/modin/frame/test_idxmax_idxmin.py index 1059abf9d8..87041060bd 100644 --- a/tests/integ/modin/frame/test_idxmax_idxmin.py +++ b/tests/integ/modin/frame/test_idxmax_idxmin.py @@ -194,7 +194,7 @@ def test_idxmax_idxmin_with_dates(func, axis): ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize( "axis", diff --git a/tests/integ/modin/frame/test_insert.py b/tests/integ/modin/frame/test_insert.py index 414889d337..5f995e24a7 100644 --- a/tests/integ/modin/frame/test_insert.py +++ b/tests/integ/modin/frame/test_insert.py @@ -212,7 +212,7 @@ def test_insert_dataframe_shape_negative(native_df): (np.ones((1, 1)), 1), ([1, 2], 1), # len < number of rows ((6, 7, 8, 9), 1), # len > number of rows - ({"a", "b", "c"}, 1), # python set + ({"a", "b", "c"}, 0), # python set ], ) def test_insert_value_negative(native_df, value, expected_query_count): @@ -725,12 +725,10 @@ def test_insert_multiindex_column_negative(snow_df, columns, insert_label): [["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True], ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_insert_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch ): - # Two of the three joins come from creating the DataFrame with non-Snowpark pandas data - # and a Snowpark pandas Index. The third join is from the insert operation. data = list(range(5)) data1 = {"foo": data} data2 = {"bar": [val * 10 for val in data]} diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index b1f89365eb..81afb9cfd8 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -3945,12 +3945,12 @@ def test_raise_set_cell_with_list_like_value_error(): reason="SNOW-1652608 result series name incorrectly set" ), ), # 1 join fron df creation, 1 join from squeeze, 2 joins from to_pandas during eval - (["1 day", "3 days"], 1, 2), - ([True, False, False], 1, 2), - (slice(None, "4 days"), 1, 1), - (slice(None, "4 days", 2), 1, 1), - (slice("1 day", "2 days"), 1, 1), - (slice("1 day 1 hour", "2 days 2 hours", -1), 1, 1), + (["1 day", "3 days"], 1, 1), + ([True, False, False], 1, 1), + (slice(None, "4 days"), 1, 0), + (slice(None, "4 days", 2), 1, 0), + (slice("1 day", "2 days"), 1, 0), + (slice("1 day 1 hour", "2 days 2 hours", -1), 1, 0), ], ) def test_df_loc_get_with_timedelta(key, query_count, join_count): @@ -4017,7 +4017,7 @@ def test_df_loc_get_with_timedelta(key, query_count, join_count): ), ], ) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=2) def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result): # In these test cases, native pandas raises a KeyError but Snowpark pandas works correctly. data = { @@ -4037,7 +4037,7 @@ def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result): assert_frame_equal(actual_result, expected_result) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=3, join_count=1) def test_df_loc_get_with_timedeltaindex_key(): data = { "A": [1, 2, 3], diff --git a/tests/integ/modin/frame/test_mask.py b/tests/integ/modin/frame/test_mask.py index 53afbd7bf8..c6b1290902 100644 --- a/tests/integ/modin/frame/test_mask.py +++ b/tests/integ/modin/frame/test_mask.py @@ -683,7 +683,7 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other): native_other = other snow_other = other - expected_join_count = 2 if isinstance(other, int) else 3 + expected_join_count = 1 if isinstance(other, int) else 2 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( snow_df, @@ -694,9 +694,8 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other): ) -# Three extra joins when creating the 3 snowpark pandas dataframes with non-Snowpark pandas -# data and Snowpark pandas Index. -@sql_count_checker(query_count=1, join_count=5) +# Three extra queries to convert to native index for dataframe constructor when creating the 3 snowpark pandas dataframes +@sql_count_checker(query_count=4, join_count=2) def test_dataframe_mask_with_duplicated_index_unaligned(): data = [3, 4, 5, 2] df_index = pd.Index([2, 1, 2, 3], name="index") diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 8b9b5472e3..04f0ca42fd 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -276,13 +276,13 @@ def _verify_merge( @pytest.mark.parametrize("on", ["A", "B", ["A", "B"], ("A", "B")]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on(left_df, right_df, on, how, sort): _verify_merge(left_df, right_df, how, on=on, sort=sort) @pytest.mark.parametrize("on", ["left_i", "right_i"]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_index_columns(left_df, right_df, how, on, sort): # Change left_df to: columns=["right_i", "B", "left_c", "left_d"] index=["left_i"] left_df = left_df.rename(columns={"A": "right_i"}) @@ -361,7 +361,7 @@ def test_join_type_mismatch_diff_with_native_pandas(index1, index2, expected_res @pytest.mark.parametrize("on", ["A", "B", "C"]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_index_columns_with_multiindex(left_df, right_df, how, on, sort): # Change left_df to: columns = ["C", "left_d"] index = ["A", "B"] left_df = left_df.rename(columns={"left_c": "C"}).set_index(["A", "B"]) @@ -370,7 +370,7 @@ def test_merge_on_index_columns_with_multiindex(left_df, right_df, how, on, sort _verify_merge(left_df, right_df, how, on=on, sort=sort) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_multiindex_with_non_multiindex(left_df, right_df, how, sort): # Change left_df to: columns = ["A", "B"] index = ["left_c", "left_d"] left_df = left_df.set_index(["left_c", "left_d"]) @@ -392,29 +392,29 @@ def test_merge_on_multiindex_with_non_multiindex(left_df, right_df, how, sort): (["A", "left_i"], ["B", "right_i"]), # Mix of index and data join keys ], ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_left_on_right_on(left_df, right_df, how, left_on, right_on, sort): _verify_merge(left_df, right_df, how, left_on=left_on, right_on=right_on, sort=sort) @pytest.mark.parametrize("left_on", ["left_i", "A", "B"]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_left_on_right_index(left_df, right_df, how, left_on, sort): _verify_merge(left_df, right_df, how, left_on=left_on, right_index=True, sort=sort) @pytest.mark.parametrize("right_on", ["right_i", "A", "B"]) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_left_index_right_on(left_df, right_df, how, right_on, sort): _verify_merge(left_df, right_df, how, left_index=True, right_on=right_on, sort=sort) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_index_single_index(left_df, right_df, how, sort): _verify_merge(left_df, right_df, how, left_index=True, right_index=True, sort=sort) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_index_multiindex_common_labels(left_df, right_df, how, sort): left_df = left_df.set_index("A", append=True) # index columns ['left_i', 'A'] right_df = right_df.set_index("A", append=True) # index columns ['right_i', 'A'] @@ -444,7 +444,7 @@ def test_merge_on_index_multiindex_common_labels_with_none( ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_on_index_multiindex_equal_labels(left_df, right_df, how, sort): # index columns ['A', 'B] left_df = left_df.set_index(["A", "B"]) @@ -463,7 +463,7 @@ def test_merge_left_index_right_index_single_to_multi(left_df, right_df, how, so if how == "inner" and sort is False: pytest.skip("pandas bug: https://github.com/pandas-dev/pandas/issues/55774") else: - with SqlCounter(query_count=3, join_count=5): + with SqlCounter(query_count=3, join_count=1): _verify_merge( left_df, right_df, @@ -489,7 +489,7 @@ def test_merge_left_index_right_index_single_to_multi(left_df, right_df, how, so .merge(right_df.to_pandas(), how=how, on="left_i", sort=sort) .reset_index(drop=True) ) - with SqlCounter(query_count=1, join_count=3): + with SqlCounter(query_count=1, join_count=1): assert_snowpark_pandas_equal_to_pandas( snow_res.reset_index(drop=True), native_res ) @@ -500,7 +500,7 @@ def test_merge_left_index_right_index_multi_to_single(left_df, right_df, how, so "right_i", append=True ) # index columns ['left_i', 'right_i'] if how in ("left", "inner"): - with SqlCounter(query_count=3, join_count=5): + with SqlCounter(query_count=3, join_count=1): _verify_merge( left_df, right_df, how=how, left_index=True, right_index=True, sort=sort ) @@ -519,13 +519,13 @@ def test_merge_left_index_right_index_multi_to_single(left_df, right_df, how, so .merge(right_df.to_pandas(), how=how, on="right_i", sort=sort) .reset_index(drop=True) ) - with SqlCounter(query_count=1, join_count=3): + with SqlCounter(query_count=1, join_count=1): assert_snowpark_pandas_equal_to_pandas( snow_res.reset_index(drop=True), native_res ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_left_index_right_index_no_common_names_negative(left_df, right_df): left_df = left_df.set_index("B", append=True) # index columns ['left_i', 'B'] right_df = right_df.set_index("A", append=True) # index columns ['right_i', 'A'] @@ -543,7 +543,7 @@ def test_merge_left_index_right_index_no_common_names_negative(left_df, right_df ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_left_index_right_index_none_as_common_label_negative(left_df, right_df): # index columns [None, 'B'] left_df = left_df.reset_index(drop=True).set_index("B", append=True) @@ -563,7 +563,7 @@ def test_merge_left_index_right_index_none_as_common_label_negative(left_df, rig ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_cross(left_df, right_df, sort): eval_snowpark_pandas_result( left_df, @@ -587,7 +587,7 @@ def test_merge_cross(left_df, right_df, sort): {"left_index": True, "right_on": "A"}, ], ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=3) def test_merge_non_empty_with_empty(left_df, empty_df, how, kwargs, sort): _verify_merge(left_df, empty_df, how, sort=sort, **kwargs) @@ -601,7 +601,7 @@ def test_merge_non_empty_with_empty(left_df, empty_df, how, kwargs, sort): {"left_index": True, "right_on": "A"}, ], ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=3) def test_merge_empty_with_non_empty(empty_df, right_df, how, kwargs, sort): # Native pandas returns incorrect column order when left frame is empty. # https://github.com/pandas-dev/pandas/issues/51929 @@ -637,7 +637,7 @@ def test_merge_empty_with_non_empty(empty_df, right_df, how, kwargs, sort): (None, None, ["A", "B"], True, False), # left.num_index_levels != len(right_on) ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_mis_specified_negative( left_df, right_df, on, left_on, right_on, left_index, right_index ): @@ -666,7 +666,7 @@ def test_merge_mis_specified_negative( (None, None, None, False, True), # right_index is set to True ], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_cross_mis_specified_negative( left_df, right_df, on, left_on, right_on, left_index, right_index ): @@ -704,7 +704,7 @@ def test_merge_cross_mis_specified_negative( (0.0, 0.0, {"suffixes": ("_x", None)}), ], ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_suffix(left_df, right_df, left_col, right_col, kwargs): left_df = left_df.rename(columns={"A": left_col}) right_df = right_df.rename(columns={"A": right_col}) @@ -720,7 +720,7 @@ def test_merge_suffix(left_df, right_df, left_col, right_col, kwargs): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_duplicate_suffix(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -734,7 +734,7 @@ def test_merge_duplicate_suffix(left_df, right_df): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_label_conflict_with_suffix(left_df, right_df): # Test the behavior when adding suffix crates a conflict with another label. # Note: This raises a warning in pandas 2.0 and will raise an error in future @@ -758,7 +758,7 @@ def test_merge_label_conflict_with_suffix(left_df, right_df): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_non_str_suffix(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -776,7 +776,7 @@ def test_merge_non_str_suffix(left_df, right_df): "suffixes", [(None, None), ("", None), (None, ""), ("", "")], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_empty_suffix_negative(left_df, right_df, suffixes): eval_snowpark_pandas_result( left_df, @@ -794,7 +794,7 @@ def test_merge_empty_suffix_negative(left_df, right_df, suffixes): "suffixes", [("a", "b", "c"), tuple("a")], ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_suffix_length_error_negative(left_df, right_df, suffixes): eval_snowpark_pandas_result( left_df, @@ -808,7 +808,7 @@ def test_merge_suffix_length_error_negative(left_df, right_df, suffixes): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_duplicate_labels(left_df, right_df): # Change left_df columns to ["A", "B", "left_c", "left_c"] # 'left_c' is a duplicate label. @@ -824,7 +824,7 @@ def test_merge_duplicate_labels(left_df, right_df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_duplicate_join_keys_negative(left_df, right_df): # Change left_df columns to ["A", "B", "left_c", "left_c"] # 'left_c' is a duplicate label. This can not be used as join key. @@ -860,14 +860,14 @@ def test_merge_with_self(): @pytest.mark.parametrize("on", ["A", "B"]) -@sql_count_checker(query_count=4, join_count=4) +@sql_count_checker(query_count=4, join_count=1) def test_merge_with_series(left_df, right_df, how, on, sort): native_series = right_df.to_pandas()[on] snow_series = pd.Series(native_series) _verify_merge(left_df, snow_series, how=how, on=on, sort=sort) -@sql_count_checker(query_count=1, join_count=1) +@sql_count_checker(query_count=1) def test_merge_with_unnamed_series_negative(left_df): native_series = native_pd.Series([1, 2, 3]) snow_series = pd.Series(native_series) @@ -922,7 +922,7 @@ def test_merge_outer_with_nan(dtype): _verify_merge(right, left, "outer", on="key") -@sql_count_checker(query_count=5, join_count=5) +@sql_count_checker(query_count=5, join_count=1) def test_merge_different_index_names(): left = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="c")) right = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="d")) @@ -937,13 +937,13 @@ def test_merge_different_index_names(): ) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_no_join_keys(left_df, right_df, how, sort): _verify_merge(left_df, right_df, how, sort=sort) @pytest.mark.parametrize("left_name, right_name", [("left_a", "right_a"), (1, "1")]) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_no_join_keys_negative(left_name, right_name, left_df, right_df): left_df = left_df.rename(columns={"A": left_name, "B": "left_b"}) right_df = right_df.rename(columns={"A": right_name, "B": "right_b"}) @@ -978,7 +978,7 @@ def test_merge_no_join_keys_common_index_negative(left_df, right_df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_no_join_keys_common_index_with_data_negative(left_df, right_df): left_df = left_df.rename(columns={"A": "left_a", "B": "left_b"}) right_df = right_df.rename(columns={"A": "right_a", "B": "left_i"}) @@ -1002,16 +1002,16 @@ def test_merge_no_join_keys_common_index_with_data_negative(left_df, right_df): @pytest.mark.parametrize( "left_on, right_on, expected_query_count, expected_join_count", [ - (np.array(["a", "b", "c", "x", "y"]), "right_d", 5, 7), - ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 5, 7), - ("left_d", np.array(["a", "b", "c", "x", "y"]), 5, 7), - (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 5, 7), - (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 5, 7), # tuple + (np.array(["a", "b", "c", "x", "y"]), "right_d", 5, 2), + ([np.array(["a", "b", "c", "x", "y"]), "A"], ["right_d", "A"], 5, 2), + ("left_d", np.array(["a", "b", "c", "x", "y"]), 5, 2), + (["left_d", "A"], [np.array(["a", "b", "c", "x", "y"]), "A"], 5, 2), + (["left_d", "A"], (np.array(["a", "b", "c", "x", "y"]), "A"), 5, 2), # tuple ( np.array(["a", "b", "c", "x", "y"]), np.array(["x", "y", "c", "a", "b"]), 7, - 9, + 3, ), ], ) @@ -1022,7 +1022,7 @@ def test_merge_on_array_like_keys( _verify_merge(left_df, right_df, how=how, left_on=left_on, right_on=right_on) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_on_array_like_keys_conflict_negative(left_df, right_df): left_on = np.array(["a", "b", "c", "x", "y"]) right_on = np.array(["x", "y", "c", "a", "b"]) @@ -1049,7 +1049,7 @@ def test_merge_on_array_like_keys_conflict_negative(left_df, right_df): np.array(["a", "b", "c", "a", "b", "c"]), # too long ], ) -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=2) def test_merge_on_array_like_keys_length_mismatch_negative(left_df, right_df, left_on): # Native pandas raises # ValueError: The truth value of an array with more than one element is ambiguous @@ -1061,22 +1061,22 @@ def test_merge_on_array_like_keys_length_mismatch_negative(left_df, right_df, le left_df.merge(right_df, left_on=left_on, right_on="right_d") -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_with_indicator(left_df, right_df, how): _verify_merge(left_df, right_df, how, on="A", indicator=True) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_with_indicator_cross_join(left_df, right_df): _verify_merge(left_df, right_df, how="cross", indicator=True) -@sql_count_checker(query_count=3, join_count=5) +@sql_count_checker(query_count=3, join_count=1) def test_merge_with_indicator_explicit_name(left_df, right_df): _verify_merge(left_df, right_df, "outer", on="A", indicator="indicator_col") -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_with_invalid_indicator_type_negative(left_df, right_df): eval_snowpark_pandas_result( left_df, @@ -1092,7 +1092,7 @@ def test_merge_with_invalid_indicator_type_negative(left_df, right_df): ) -@sql_count_checker(query_count=2, join_count=2) +@sql_count_checker(query_count=2) def test_merge_with_indicator_explicit_name_negative(left_df, right_df): left_df = left_df.rename(columns={"left_c": "_merge"}) eval_snowpark_pandas_result( diff --git a/tests/integ/modin/frame/test_reindex.py b/tests/integ/modin/frame/test_reindex.py index 98d0a41e7a..de1aacd786 100644 --- a/tests/integ/modin/frame/test_reindex.py +++ b/tests/integ/modin/frame/test_reindex.py @@ -209,7 +209,7 @@ def perform_reindex(df): perform_reindex, ) - @sql_count_checker(query_count=1, join_count=2) + @sql_count_checker(query_count=2, join_count=1) @pytest.mark.parametrize("limit", [None, 1, 2, 100]) @pytest.mark.parametrize("method", ["bfill", "backfill", "pad", "ffill"]) def test_reindex_index_datetime_with_fill(self, limit, method): @@ -248,7 +248,7 @@ def test_reindex_index_non_overlapping_index(self): snow_df, native_df, lambda df: df.reindex(axis=0, labels=list("EFG")) ) - @sql_count_checker(query_count=1, join_count=2) + @sql_count_checker(query_count=2, join_count=1) def test_reindex_index_non_overlapping_datetime_index(self): date_index = native_pd.date_range("1/1/2010", periods=6, freq="D") native_df = native_pd.DataFrame( @@ -273,7 +273,7 @@ def perform_reindex(df): snow_df, native_df, perform_reindex, check_freq=False ) - @sql_count_checker(query_count=0) + @sql_count_checker(query_count=1) def test_reindex_index_non_overlapping_different_types_index_negative(self): date_index = pd.date_range("1/1/2010", periods=6, freq="D") snow_df = pd.DataFrame( diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index a5595ec716..1be8956a9d 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -104,7 +104,7 @@ def test_rename(self, snow_float_frame): assert_index_equal(renamed.index, native_pd.Index(["A", "B", "foo", "bar"])) # index with name - with SqlCounter(query_count=1, join_count=2): + with SqlCounter(query_count=2, join_count=1): index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index 3d51277b2c..bc4a1393b2 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -444,7 +444,7 @@ def setitem_helper(df): [["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True], ], ) -@sql_count_checker(query_count=1, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_df_setitem_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch ): diff --git a/tests/integ/modin/frame/test_to_snowflake.py b/tests/integ/modin/frame/test_to_snowflake.py index 253fa542c9..6ddcdaf0db 100644 --- a/tests/integ/modin/frame/test_to_snowflake.py +++ b/tests/integ/modin/frame/test_to_snowflake.py @@ -16,7 +16,7 @@ @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("index_labels", [None, ["my_index"]]) # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=2, join_count=1) +@sql_count_checker(query_count=3) def test_to_snowflake_index(test_table_name, index, index_labels): df = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index") @@ -180,7 +180,7 @@ def test_to_snowflake_column_with_quotes(session, test_table_name): # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=0) +@sql_count_checker(query_count=1) def test_to_snowflake_index_label_none_raises(test_table_name): df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -198,7 +198,7 @@ def test_to_snowflake_index_label_none_raises(test_table_name): # one extra query to convert index to native pandas when creating the snowpark pandas dataframe -@sql_count_checker(query_count=0) +@sql_count_checker(query_count=1) def test_to_snowflake_data_label_none_raises(test_table_name): df = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, index=pd.Index([2, 3, 4], name="index") diff --git a/tests/integ/modin/frame/test_where.py b/tests/integ/modin/frame/test_where.py index bd7a5b5808..59565b4fcb 100644 --- a/tests/integ/modin/frame/test_where.py +++ b/tests/integ/modin/frame/test_where.py @@ -690,7 +690,7 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): native_other = other snow_other = other - expected_join_count = 2 if isinstance(other, int) else 3 + expected_join_count = 1 if isinstance(other, int) else 2 with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( snow_df, @@ -701,9 +701,8 @@ def test_dataframe_where_with_duplicated_index_aligned(cond_frame, other): ) -# 3 extra joins to create the 3 snowpark pandas dataframe with non-Snowpark pandas data -# and a Snowpark pandas Index. -@sql_count_checker(query_count=1, join_count=5) +# 3 extra queries to convert index to native pandas when creating the 3 snowpark pandas dataframe +@sql_count_checker(query_count=4, join_count=2) def test_dataframe_where_with_duplicated_index_unaligned(): data = [3, 4, 5, 2] df_index = pd.Index([2, 1, 2, 3], name="index") diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index e337e53d4b..ccf1753f18 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -298,7 +298,7 @@ def test_create_with_series_as_data_and_index_as_index( ), # no index values match ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_create_df_with_df_as_data_and_index_as_index(native_df, native_index): """ Creating a DataFrame where the data is a DataFrame and the index is an Index. @@ -329,7 +329,7 @@ def test_create_df_with_df_as_data_and_index_as_index(native_df, native_index): ({}, native_pd.Index([10, 0, 1], name="non-empty index")), ], ) -@sql_count_checker(query_count=1, join_count=2) +@sql_count_checker(query_count=1, join_count=1) def test_create_df_with_empty_df_as_data_and_index_as_index(native_df, native_index): """ Creating a DataFrame where the data is an empty DataFrame and the index is an Index. @@ -443,7 +443,7 @@ def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( qc = 1 if column_type == "list" else 2 qc += 1 if (isinstance(native_df, dict)) else 0 qc += 1 if (isinstance(native_df, dict) and column_type == "index") else 0 - jc = 2 if isinstance(native_df, native_pd.DataFrame) else 0 + jc = 1 if isinstance(native_df, native_pd.DataFrame) else 0 with SqlCounter(query_count=qc, join_count=jc): assert_frame_equal( pd.DataFrame(snow_df, index=snow_index, columns=native_columns), @@ -795,57 +795,22 @@ def test_create_df_with_mixed_series_index_dict_data(): assert_frame_equal(snow_df, native_df) -def test_create_df_with_mixed_series_index_list_data(): +@sql_count_checker(query_count=2) +def test_create_df_with_mixed_series_index_list_data_negative(): + """ + Since Snowpark pandas relies on native pandas for initialization a DataFrame with mixed data types, + they both raise the same error. + """ # Create the list data. - native_data1 = native_pd.Series([1, 2, 3]) - native_data2 = native_pd.Index([4, 5, 6]) + data1 = native_pd.Series([1, 2, 3]) + data2 = native_pd.Index([4, 5, 6]) data3 = [7, 8, 9] - snow_data1 = pd.Series([1, 2, 3]) - snow_data2 = pd.Index([4, 5, 6]) # Need to convert data3 to an Index since native pandas tries to perform `get_indexer` on it. - native_data = [native_data1, native_data2, native_pd.Index(data3)] - snow_data = [snow_data1, snow_data2, data3] - - # Create DataFrame only with list data. - native_df = native_pd.DataFrame(native_data) - snow_df = pd.DataFrame(snow_data) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) - - # Create DataFrame with list data and Series index. - native_ser_index = native_pd.Series([2, 11, 0]) - snow_ser_index = pd.Series([2, 11, 0]) - native_df = native_pd.DataFrame(native_data, index=native_ser_index) - snow_df = pd.DataFrame(snow_data, index=snow_ser_index) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df, check_dtype=False) - - # Create DataFrame with list data and Index index. - native_index = native_pd.Index([22, 11, 0]) - snow_index = pd.Index([22, 11, 0]) - native_df = native_pd.DataFrame(native_data, index=native_index) - snow_df = pd.DataFrame(snow_data, index=snow_index) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df, check_dtype=False) - - # Create DataFrame with list data, Series index, and columns. - columns = ["A", "B", "C"] - native_df = native_pd.DataFrame( - native_data, index=native_ser_index, columns=columns - ) - snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df, check_dtype=False) - - # Create DataFrame with list data, Index index, and Index columns. - native_columns = native_pd.Index(columns) - snow_columns = pd.Index(columns) - native_df = native_pd.DataFrame( - native_data, index=native_index, columns=native_columns - ) - snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df, check_dtype=False) + err_msg = "'builtin_function_or_method' object has no attribute 'get_indexer'" + with pytest.raises(AttributeError, match=err_msg): + native_pd.DataFrame([data1, data2, data3]) + with pytest.raises(AttributeError, match=err_msg): + pd.DataFrame([pd.Series(data1), pd.Index(data2), data3]) @pytest.mark.xfail( diff --git a/tests/integ/modin/pivot/test_pivot_table_single.py b/tests/integ/modin/pivot/test_pivot_table_single.py index e53b553090..9feab0c09f 100644 --- a/tests/integ/modin/pivot/test_pivot_table_single.py +++ b/tests/integ/modin/pivot/test_pivot_table_single.py @@ -226,7 +226,7 @@ def test_pivot_table_with_sum_and_count_null_and_empty_values_matching_behavior_ # One extra query to convert to native pandas in dataframe constructor when creating snow_df -@sql_count_checker(query_count=5, join_count=2) +@sql_count_checker(query_count=6, join_count=1) def test_pivot_on_inline_data_using_temp_table(): # Create a large dataframe of inlined data that will spill to a temporary table. snow_df = pd.DataFrame( diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index af99185294..5ade497c4d 100644 --- a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -32,7 +32,7 @@ def randomword(length): @freq @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -45,7 +45,7 @@ def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_date_before_snowflake_alignment_date(): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 date_data = native_pd.to_datetime( @@ -66,7 +66,7 @@ def test_resample_date_before_snowflake_alignment_date(): @interval -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_date_wraparound_snowflake_alignment_date(interval): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 date_data = native_pd.to_datetime( @@ -89,7 +89,7 @@ def test_resample_date_wraparound_snowflake_alignment_date(interval): @agg_func @freq -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_missing_data_upsample(agg_func, freq): # this tests to make sure that missing resample bins will be filled in. date_data = native_pd.date_range("2020-01-01", periods=13, freq=f"1{freq}").delete( @@ -103,7 +103,7 @@ def test_resample_missing_data_upsample(agg_func, freq): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_duplicated_timestamps_downsample(): date_data = native_pd.to_datetime( [ @@ -122,7 +122,7 @@ def test_resample_duplicated_timestamps_downsample(): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_duplicated_timestamps(): date_data = native_pd.to_datetime( [ @@ -161,7 +161,7 @@ def test_resample_series(freq, interval, agg_func): @pytest.mark.parametrize( "agg_func", ["max", "min", "mean", "median", "sum", "std", "var"] ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_numeric_only(agg_func): eval_snowpark_pandas_result( *create_test_dfs( @@ -174,7 +174,7 @@ def test_resample_numeric_only(agg_func): @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_df_with_nan(agg_func): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( @@ -202,7 +202,7 @@ def test_resample_ser_with_nan(agg_func): @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_single_resample_bin(agg_func): eval_snowpark_pandas_result( *create_test_dfs( @@ -215,7 +215,7 @@ def test_resample_single_resample_bin(agg_func): @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_index_with_nan(agg_func): datecol = native_pd.to_datetime( ["2020-01-01", "2020-01-03", "2020-01-05", np.nan, "2020-01-09", np.nan] @@ -230,7 +230,7 @@ def test_resample_index_with_nan(agg_func): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_df_getitem(): eval_snowpark_pandas_result( *create_test_dfs( @@ -253,7 +253,7 @@ def test_resample_ser_getitem(): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_date_trunc_day(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( @@ -266,7 +266,7 @@ def test_resample_date_trunc_day(): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_date_trunc_hour(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN eval_snowpark_pandas_result( diff --git a/tests/integ/modin/resample/test_resample_asfreq.py b/tests/integ/modin/resample/test_resample_asfreq.py index fc60f62621..50e9646a4c 100644 --- a/tests/integ/modin/resample/test_resample_asfreq.py +++ b/tests/integ/modin/resample/test_resample_asfreq.py @@ -19,7 +19,7 @@ @freq @interval -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_asfreq_no_method(freq, interval): rule = f"{interval}{freq}" eval_snowpark_pandas_result( @@ -32,7 +32,7 @@ def test_asfreq_no_method(freq, interval): ) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_asfreq_ffill(): eval_snowpark_pandas_result( *create_test_dfs( diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index 53352fd4ef..c15aef3fa9 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -17,7 +17,7 @@ @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_fill(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -69,7 +69,7 @@ def test_resample_fill_ser(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_ffill_one_gap(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -113,7 +113,7 @@ def resample_ffill_ser_one_gap(agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_ffill_missing_in_middle(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -163,7 +163,7 @@ def test_resample_ffill_ser_missing_in_middle(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_ffill_ffilled_with_none(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -187,7 +187,7 @@ def test_resample_ffill_ffilled_with_none(interval, agg_func): @interval @agg_func -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_ffill_large_gaps(interval, agg_func): datecol = native_pd.to_datetime( [ @@ -209,7 +209,7 @@ def test_resample_ffill_large_gaps(interval, agg_func): @interval @pytest.mark.parametrize("method", ["ffill", "pad", "backfill", "bfill"]) -@sql_count_checker(query_count=2, join_count=3) +@sql_count_checker(query_count=3, join_count=1) def test_resample_fillna(interval, method): datecol = native_pd.to_datetime( [ diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 2603eaa61c..8d74fd856a 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -4,7 +4,6 @@ import functools import numbers import random -import re import modin.pandas as pd import numpy as np @@ -1451,10 +1450,7 @@ def test_series_loc_set_df_key_negative(item, default_index_native_series): native_ser.loc[df_key] = item # Snowpark pandas error verification. - err_msg = re.escape( - "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), " - "a.any() or a.all()." - ) + err_msg = "Data cannot be a DataFrame" with pytest.raises(ValueError, match=err_msg): snowpark_ser.loc[pd.DataFrame(df_key)] = item assert_series_equal(snowpark_ser, native_ser) diff --git a/tests/integ/modin/series/test_sort_values.py b/tests/integ/modin/series/test_sort_values.py index b147377f75..e966409dfc 100644 --- a/tests/integ/modin/series/test_sort_values.py +++ b/tests/integ/modin/series/test_sort_values.py @@ -33,7 +33,7 @@ def snow_series(snow_df): @pytest.mark.parametrize("by", ["A", "B", "a", "b"]) @pytest.mark.parametrize("ascending", [True, False]) -@sql_count_checker(query_count=3, join_count=3) +@sql_count_checker(query_count=3) def test_sort_values(snow_df, by, ascending): snow_series = snow_df[by] native_series = snow_series.to_pandas() diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 19693ad381..3170241be4 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -1063,7 +1063,7 @@ def test_concat_duplicate_columns(columns1, columns2, expected_rows, expected_co df1 = pd.DataFrame([[1, 2, 3]], columns=columns1) df2 = pd.DataFrame([[4, 5, 6]], columns=columns2) expected_df = pd.DataFrame(expected_rows, columns=expected_cols, index=[0, 0]) - assert_frame_equal(pd.concat([df1, df2]), expected_df) + assert_frame_equal(pd.concat([df1, df2]), expected_df, check_dtype=False) @pytest.mark.parametrize("value1", [4, 1.5, True, "c", (1, 2), {"a": 1}]) From fa4eb09836460eb6a3bb9bb64c7e4a3622367087 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 16 Sep 2024 16:08:41 -0700 Subject: [PATCH 37/42] remove unnecessary diffs --- tests/integ/modin/frame/test_merge.py | 1 + tests/integ/modin/frame/test_rename.py | 1 + tests/integ/modin/frame/test_setitem.py | 1 + tests/integ/modin/resample/test_resample.py | 13 +++++++++++++ tests/integ/modin/resample/test_resample_fillna.py | 1 + 5 files changed, 17 insertions(+) diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py index 04f0ca42fd..d006706418 100644 --- a/tests/integ/modin/frame/test_merge.py +++ b/tests/integ/modin/frame/test_merge.py @@ -922,6 +922,7 @@ def test_merge_outer_with_nan(dtype): _verify_merge(right, left, "outer", on="key") +# Two extra queries to convert to native index for dataframe constructor when creating left and right @sql_count_checker(query_count=5, join_count=1) def test_merge_different_index_names(): left = pd.DataFrame({"a": [1]}, index=pd.Index([1], name="c")) diff --git a/tests/integ/modin/frame/test_rename.py b/tests/integ/modin/frame/test_rename.py index 1be8956a9d..289fb6e159 100644 --- a/tests/integ/modin/frame/test_rename.py +++ b/tests/integ/modin/frame/test_rename.py @@ -104,6 +104,7 @@ def test_rename(self, snow_float_frame): assert_index_equal(renamed.index, native_pd.Index(["A", "B", "foo", "bar"])) # index with name + # Two extra queries, one for converting to native pandas in renamer Dataframe constructor, one to get the name with SqlCounter(query_count=2, join_count=1): index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) diff --git a/tests/integ/modin/frame/test_setitem.py b/tests/integ/modin/frame/test_setitem.py index bc4a1393b2..cc5698b684 100644 --- a/tests/integ/modin/frame/test_setitem.py +++ b/tests/integ/modin/frame/test_setitem.py @@ -444,6 +444,7 @@ def setitem_helper(df): [["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True], ], ) +# 2 extra queries to convert to native pandas when creating the two snowpark pandas dataframes @sql_count_checker(query_count=3, join_count=1) def test_df_setitem_with_unique_and_duplicate_index_values( index_values, other_index_values, expect_mismatch diff --git a/tests/integ/modin/resample/test_resample.py b/tests/integ/modin/resample/test_resample.py index 5ade497c4d..63c72452c1 100644 --- a/tests/integ/modin/resample/test_resample.py +++ b/tests/integ/modin/resample/test_resample.py @@ -32,6 +32,7 @@ def randomword(length): @freq @interval @agg_func +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): rule = f"{interval}{freq}" @@ -45,6 +46,7 @@ def test_resample_with_varying_freq_and_interval(freq, interval, agg_func): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_date_before_snowflake_alignment_date(): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 @@ -66,6 +68,7 @@ def test_resample_date_before_snowflake_alignment_date(): @interval +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_date_wraparound_snowflake_alignment_date(interval): # Snowflake TIMESLICE alignment date is 1970-01-01 00:00:00 @@ -89,6 +92,7 @@ def test_resample_date_wraparound_snowflake_alignment_date(interval): @agg_func @freq +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_missing_data_upsample(agg_func, freq): # this tests to make sure that missing resample bins will be filled in. @@ -103,6 +107,7 @@ def test_resample_missing_data_upsample(agg_func, freq): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_duplicated_timestamps_downsample(): date_data = native_pd.to_datetime( @@ -122,6 +127,7 @@ def test_resample_duplicated_timestamps_downsample(): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_duplicated_timestamps(): date_data = native_pd.to_datetime( @@ -158,6 +164,7 @@ def test_resample_series(freq, interval, agg_func): ) +# One extra query to convert index to native pandas for dataframe constructor @pytest.mark.parametrize( "agg_func", ["max", "min", "mean", "median", "sum", "std", "var"] ) @@ -173,6 +180,7 @@ def test_resample_numeric_only(agg_func): ) +# One extra query to convert index to native pandas for dataframe constructor @agg_func @sql_count_checker(query_count=3, join_count=1) def test_resample_df_with_nan(agg_func): @@ -202,6 +210,7 @@ def test_resample_ser_with_nan(agg_func): @agg_func +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_single_resample_bin(agg_func): eval_snowpark_pandas_result( @@ -215,6 +224,7 @@ def test_resample_single_resample_bin(agg_func): @agg_func +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_index_with_nan(agg_func): datecol = native_pd.to_datetime( @@ -230,6 +240,7 @@ def test_resample_index_with_nan(agg_func): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_df_getitem(): eval_snowpark_pandas_result( @@ -253,6 +264,7 @@ def test_resample_ser_getitem(): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_date_trunc_day(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN @@ -266,6 +278,7 @@ def test_resample_date_trunc_day(): ) +# One extra query to convert index to native pandas for dataframe constructor @sql_count_checker(query_count=3, join_count=1) def test_resample_date_trunc_hour(): # resample bins of 'A' each have a NaN. 1 resample bin of 'B' is all NaN diff --git a/tests/integ/modin/resample/test_resample_fillna.py b/tests/integ/modin/resample/test_resample_fillna.py index c15aef3fa9..3aad42dbc4 100644 --- a/tests/integ/modin/resample/test_resample_fillna.py +++ b/tests/integ/modin/resample/test_resample_fillna.py @@ -15,6 +15,7 @@ agg_func = pytest.mark.parametrize("agg_func", ["ffill", "bfill"]) +# One extra query to convert index to native pandas for dataframe constructor @interval @agg_func @sql_count_checker(query_count=3, join_count=1) From db2863067685e5504a40744ce802e011c6a8a9c2 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Mon, 16 Sep 2024 17:43:03 -0700 Subject: [PATCH 38/42] fix doctest and couple of tests --- .../plugin/extensions/dataframe_overrides.py | 31 ++++++++++--------- tests/integ/modin/frame/test_name.py | 2 +- .../test_df_series_creation_with_index.py | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 890c486bb0..a095d63232 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -615,20 +615,23 @@ def __init__( self._query_compiler = new_qc return - # If only some data is a Snowpark pandas object, convert it to pandas objects. - res = [] - for v in data: - if isinstance(v, (Index, BasePandasDataset)): - res.append(v.to_pandas()) - # elif is_dict_like(v) or isinstance(v, (native_pd.Series, native_pd.DataFrame, native_pd.Index)): - # res.append(v) - else: - # # Need to convert this is a native pandas object since native pandas incorrectly - # # tries to perform `get_indexer` on it. Specify dtype=object so that pandas does not - # # cast the data provided. In some cases, None turns to NaN, which is not desired. - # res.append(native_pd.Index(v, dtype=object) if is_list_like(v) else v) - res.append(v) - data = res + if not isinstance(data, np.ndarray): + # Sometimes the ndarray representation of a list is different from a regular list. + # For instance, [(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")] + # is different from np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]). + # The list has the shape (3, 3) while the ndarray has the shape (3,). + # If only some data is a Snowpark pandas object, convert it to pandas objects. + res = [] + for v in data: + if isinstance(v, (Index, BasePandasDataset)): + res.append(v.to_pandas()) + else: + # # Need to convert this is a native pandas object since native pandas incorrectly + # # tries to perform `get_indexer` on it. Specify dtype=object so that pandas does not + # # cast the data provided. In some cases, None turns to NaN, which is not desired. + # res.append(native_pd.Index(v, dtype=object) if is_list_like(v) else v) + res.append(v) + data = res query_compiler = from_pandas( native_pd.DataFrame( diff --git a/tests/integ/modin/frame/test_name.py b/tests/integ/modin/frame/test_name.py index b23a3b26f0..aa5d04ed03 100644 --- a/tests/integ/modin/frame/test_name.py +++ b/tests/integ/modin/frame/test_name.py @@ -39,7 +39,7 @@ def test_create_dataframe_from_object_with_name(sample): ) -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=1, join_count=2, union_count=1) def test_create_dataframe_from_snowpark_pandas_series(): df = pd.DataFrame([[2, 3, 4], [5, 6, 7]], columns=["X", "Y", "Z"]) df = pd.DataFrame([df.X, df.iloc[:, 2]]) diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index ccf1753f18..354bb2f11e 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -858,5 +858,5 @@ def test_create_series_with_df_data_negative(): ), ): native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + with pytest.raises(ValueError, match="Data cannot be a DataFrame"): pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) From 17be4c30a705416cac656230235cc1f18331ef74 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 18 Sep 2024 13:53:46 -0700 Subject: [PATCH 39/42] apply feedback to simplify logic --- .../snowpark/modin/plugin/_internal/utils.py | 58 ++++- .../compiler/snowflake_query_compiler.py | 50 ---- .../modin/plugin/docstrings/dataframe.py | 15 +- .../plugin/extensions/dataframe_overrides.py | 237 +++++++++--------- .../plugin/extensions/series_overrides.py | 47 ++-- .../integ/modin/groupby/test_groupby_apply.py | 8 +- .../test_df_series_creation_with_index.py | 96 ++++++- 7 files changed, 310 insertions(+), 201 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index 1e53ee34f9..8563997dfa 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -11,7 +11,7 @@ import numpy as np import pandas as native_pd -from pandas._typing import Scalar +from pandas._typing import AnyArrayLike, Scalar from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar from pandas.core.dtypes.inference import is_list_like @@ -2021,6 +2021,20 @@ def error_checking_for_init( raise NotImplementedError("pandas type category is not implemented") +def assert_fields_are_none( + class_name: str, data: Any, index: Any, columns: Any = None +) -> None: + assert ( + data is None + ), f"Invalid {class_name} construction! Cannot pass both data and query_compiler." + assert ( + index is None + ), f"Invalid {class_name} construction! Cannot pass both index and query_compiler." + assert ( + columns is None + ), f"Invalid {class_name} construction! Cannot pass both columns and query_compiler." + + def convert_index_to_qc(index: Any) -> Any: """ Method to convert an object representing an index into a query compiler for set_index or reindex. @@ -2084,3 +2098,45 @@ def convert_index_to_list_of_qcs(index: Any) -> list: else: index_qc_list = [convert_index_to_qc(index)] return index_qc_list + + +def add_extra_columns_and_select_required_columns( + query_compiler: Any, + columns: Union[AnyArrayLike, list], + data_columns: Union[AnyArrayLike, list], +) -> Any: + """ + Method to add extra columns to and select the required columns from the provided query compiler. + This is used in DataFrame construction in the following cases: + - general case when data is a DataFrame + - data is a named Series, and this name is in `columns` + + Parameters + ---------- + query_compiler: Any + The query compiler to select columns from, i.e., data's query compiler. + columns: AnyArrayLike or list + The columns to select from the query compiler. + data_columns: AnyArrayLike or list + The columns in the data. This is data.columns if data is a DataFrame or data.name if data is a Series. + + """ + from modin.pandas import DataFrame + + # The `columns` parameter is used to select the columns from `data` that will be in the resultant DataFrame. + # If a value in `columns` is not present in data's columns, it will be added as a new column filled with NaN values. + # These columns are tracked by the `extra_columns` variable. + if data_columns is not None and columns is not None: + extra_columns = [col for col in columns if col not in data_columns] + # To add these new columns to the DataFrame, perform `__getitem__` only with the extra columns + # and set them to None. + extra_columns_df = DataFrame(query_compiler=query_compiler) + extra_columns_df[extra_columns] = None + query_compiler = extra_columns_df._query_compiler + + # To select the columns for the resultant DataFrame, perform `.loc[]` on the created query compiler. + # This step is performed to ensure that the right columns are picked from the InternalFrame since we + # never explicitly drop the unwanted columns. `.loc[]` also ensures that the columns in the resultant + # DataFrame are in the same order as the columns in the `columns` parameter. + columns = slice(None) if columns is None else columns + return DataFrame(query_compiler=query_compiler).loc[:, columns]._query_compiler diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 256b0de6b8..6e706c133c 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -18292,53 +18292,3 @@ def timedelta_property( return SnowflakeQueryCompiler( self._modin_frame.apply_snowpark_function_to_columns(func, include_index) ) - - def create_qc_with_extra_columns( - self, extra_columns: List[Hashable] - ) -> "SnowflakeQueryCompiler": - """ - This is a helper function for creating a DataFrame where the data is a DataFrame object. Sometimes, columns - not present in the `data` DataFrame can be passed as arguments - these are added to the resultant DataFrame - as NaN columns. - - Parameters - ---------- - extra_columns : list of hashable, default None - New columns that are not a part of the original query compiler - - Returns - ------- - SnowflakeQueryCompiler - A new query compiler with the new columns. - """ - self_frame = self._modin_frame - - if not extra_columns or len(extra_columns) == 0: - return self.copy() - - # Append the new columns to the data's internal frame. - new_snowflake_quoted_identifiers = ( - self._modin_frame.ordered_dataframe.generate_snowflake_quoted_identifiers( - pandas_labels=extra_columns, - excluded=self_frame.data_column_snowflake_quoted_identifiers, - ) - ) - new_ordered_frame = append_columns( - self_frame.ordered_dataframe, - new_snowflake_quoted_identifiers, - [pandas_lit(np.nan)] * len(extra_columns), - ) - new_internal_frame = InternalFrame.create( - ordered_dataframe=new_ordered_frame, - data_column_pandas_labels=self_frame.data_column_pandas_labels - + extra_columns, - data_column_snowflake_quoted_identifiers=self_frame.data_column_snowflake_quoted_identifiers - + new_snowflake_quoted_identifiers, - data_column_pandas_index_names=self_frame.data_column_pandas_index_names, - index_column_pandas_labels=self_frame.index_column_pandas_labels, - index_column_snowflake_quoted_identifiers=self_frame.index_column_snowflake_quoted_identifiers, - data_column_types=None, - index_column_types=None, - ) - - return SnowflakeQueryCompiler(new_internal_frame) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index f7e93e6c2d..c7b11c8fbe 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -82,16 +82,13 @@ class DataFrame(BasePandasDataset): Notes ----- ``DataFrame`` can be created either from passed `data` or `query_compiler`. If both - parameters are provided, data source will be prioritized in the next order: + parameters are provided, an assertion error will be raised. `query_compiler` can only + be specified when the `data`, `index`, and `columns` are None. - 1) Modin ``DataFrame`` or ``Series`` passed with `data` parameter. - 2) Query compiler from the `query_compiler` parameter. - 3) Various pandas/NumPy/Python data structures passed with `data` parameter. - - The last option is less desirable since import of such data structures is very - inefficient, please use previously created Modin structures from the fist two - options or import data using highly efficient Modin IO tools (for example - ``pd.read_csv``). + Using pandas/NumPy/Python data structures as the `data` parameter is less desirable since + importing such data structures is very inefficient. + Please use previously created Modin structures or import data using highly efficient Modin IO + tools (for example ``pd.read_csv``). Examples -------- diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index a095d63232..a9a5ea511e 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -85,12 +85,15 @@ is_snowflake_agg_func, ) from snowflake.snowpark.modin.plugin._internal.utils import ( + add_extra_columns_and_select_required_columns, + assert_fields_are_none, convert_index_to_list_of_qcs, convert_index_to_qc, error_checking_for_init, is_repr_truncated, ) from snowflake.snowpark.modin.plugin._typing import ListLike +from snowflake.snowpark.modin.plugin.extensions.index import Index from snowflake.snowpark.modin.plugin.utils.error_message import ( ErrorMessage, dataframe_not_implemented, @@ -464,43 +467,41 @@ def __init__( # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. - from snowflake.snowpark.modin.plugin.extensions.index import Index - self._siblings = [] - # 0. Setting the query compiler - # ----------------------------- + # Setting the query compiler + # -------------------------- if query_compiler is not None: # CASE I: query_compiler # If a query_compiler is passed in only use the query_compiler field to create a new DataFrame. - assert ( - data is None - ), "Invalid DataFrame construction! Cannot pass both data and query_compiler." - assert ( - index is None - ), "Invalid DataFrame construction! Cannot pass both index and query_compiler." - assert ( - columns is None - ), "Invalid DataFrame construction! Cannot pass both columns and query_compiler." + # Verify that the data, index, and columns parameters are None. + assert_fields_are_none( + class_name="DataFrame", data=data, index=index, columns=columns + ) self._query_compiler = query_compiler return + # A DataFrame cannot be used as an index and Snowpark pandas does not support the Categorical type yet. + # Check that index is not a DataFrame and dtype is not "category". error_checking_for_init(index, dtype) + # Convert columns to a local object if it is a lazy Index. + columns = try_convert_index_to_native(columns) + # The logic followed here is: - # 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns. - # 2. If an index is provided, set the index through set_index or reindex. - # 3. If the data is a DataFrame, perform loc to select the required index and columns from the DataFrame. - # 4. The resultant query_compiler is then set as the query_compiler for the DataFrame. + # STEP 1: Create a query_compiler from the provided data. If columns are provided, add/select the columns. + # STEP 2: If an index is provided, set the index through set_index or reindex. + # STEP 3: The resultant query_compiler is then set as the query_compiler for the DataFrame. - # 1. Setting the data (and columns) - # --------------------------------- + # STEP 1: Setting the data (and columns) + # -------------------------------------- if isinstance(data, Index): # CASE II: data is a Snowpark pandas Index # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the - # correct format: the values are a data column, not an index column. + # correct format: the values should be a data column, not an index column. if data.name is None: - # If no name is provided, the default name is 0. + # If no name is provided, the default name is 0. Otherwise, only use the first value in `columns` to + # set the column name; this is because the resultant DataFrame will have only one column. new_name = 0 if columns is None else columns[0] else: new_name = data.name @@ -508,36 +509,38 @@ def __init__( elif isinstance(data, Series): # CASE III: data is a Snowpark pandas Series - query_compiler = data._query_compiler.copy() - # We set the column name if it is not in the provided Series `data`. - if data.name is None: - # If no name is provided, the default name is 0. - query_compiler = query_compiler.set_columns(columns or [0]) - if columns is not None and data.name not in columns: - # If the columns provided are not in the named Series, pandas clears - # the DataFrame and sets columns to the columns provided. + # If the Series `data` has no name, the default name is 0. + name = [data.name] if data.name is not None else [0] + if columns is None: + # If no columns are provided, the resultant DataFrame has only one column. + # The column name is the Series' name. + query_compiler = data._query_compiler.set_columns(name) + elif data.name in columns: + # Treat any columns that are not data.name as extra columns. They will be appended as NaN columns. + # After this, select the required columns in the order provided by `columns`. + query_compiler = add_extra_columns_and_select_required_columns( + data._query_compiler, columns, name + ) + else: + # If the columns provided are not in the named Series, pandas clears the DataFrame and sets columns. query_compiler = from_pandas( native_pd.DataFrame(columns=columns) )._query_compiler elif isinstance(data, DataFrame): # CASE IV: data is a Snowpark pandas DataFrame - query_compiler = data._query_compiler.copy() if columns is None and index is None: - # Special case IV.a: if the new DataFrame has the same columns and index as the original DataFrame, + # Special case: if the new DataFrame has the same columns and index as the original DataFrame, # the query compiler is shared and kept track of as a sibling. - self._query_compiler = query_compiler - data._add_sibling(self) + self._query_compiler = data._query_compiler + if not copy: + # When copy is False, the DataFrame is a shallow copy of the original DataFrame. + data._add_sibling(self) return - # The `columns` parameter is used to select the columns from `data` that will be in the resultant - # DataFrame. If a value in `columns` is not present in `data`'s columns, it will be added as a - # new column filled with NaN values. These columns are tracked by the `extra_columns` variable. - if data.columns is not None and columns is not None: - extra_columns = [col for col in columns if col not in data.columns] - else: - extra_columns = [] - query_compiler = data._query_compiler.create_qc_with_extra_columns( - extra_columns + # Treat any columns that are not in data.columns as extra columns. They will be appended as NaN columns. + # After this, select the required columns in the order provided by `columns`. + query_compiler = add_extra_columns_and_select_required_columns( + data._query_compiler, columns, data.columns ) else: @@ -545,8 +548,6 @@ def __init__( if not isinstance( data, (native_pd.Series, native_pd.DataFrame, native_pd.Index) ) and is_list_like(data): - from snowflake.snowpark.modin.pandas import concat - if is_dict_like(data): # Setting up keys and values for processing if all the values are Snowpark pandas objects. if columns is not None: @@ -556,33 +557,19 @@ def __init__( if len(data) and all( isinstance(v, (Index, BasePandasDataset)) for v in data.values() ): - # Special case V.a: data is a list/dict where all the values are Snowpark pandas objects. - # Concat can only be performed with BasePandasDataset objects. - # If a value is an Index, convert it to a Series where the index is the index to be set - # since these values are always present in the final DataFrame. - values = [ - Series(v, index=index) if isinstance(v, Index) else v - for v in data.values() - ] - new_qc = concat(values, axis=1, keys=data.keys())._query_compiler - if dtype is not None: - new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) - if index is not None: - new_qc = new_qc.reindex( - axis=0, labels=convert_index_to_qc(index) + # Special case: data is a dict where all the values are Snowpark pandas objects. + self._query_compiler = ( + _df_init_dict_data_with_snowpark_pandas_values( + data, index, columns, dtype ) - if columns is not None: - new_qc = new_qc.reindex( - axis=1, labels=try_convert_index_to_native(columns) - ) - self._query_compiler = new_qc + ) return # If only some data is a Snowpark pandas object, convert it to pandas objects. res = {} index = try_convert_index_to_native(index) for k, v in data.items(): - if isinstance(v, (Index)): + if isinstance(v, Index): res[k] = v.to_pandas() elif isinstance(v, BasePandasDataset): # Need to perform reindex on the Series or DataFrame objects since only the data @@ -596,41 +583,27 @@ def __init__( if len(data) and all( isinstance(v, (Index, BasePandasDataset)) for v in data ): - # Special case V.c: data is a list/dict where all the values are Snowpark pandas objects. - # Concat can only be performed with BasePandasDataset objects. - # If a value is an Index, convert it to a Series. - values = [Series(v) if isinstance(v, Index) else v for v in data] - new_qc = concat(values, axis=1).T._query_compiler - if dtype is not None: - new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) - if index is not None: - new_qc = new_qc.set_index([convert_index_to_qc(index)]) - if columns is not None: - if all(isinstance(v, Index) for v in data): - # Special case: if all the values are Index objects, they are always present in the - # final result with the provided column names. Therefore, rename the columns. - new_qc = new_qc.set_columns(columns) - else: - new_qc = new_qc.reindex(axis=1, labels=columns) - self._query_compiler = new_qc + # Special case: data is a list/dict where all the values are Snowpark pandas objects. + self._query_compiler = ( + _df_init_list_data_with_snowpark_pandas_values( + data, index, columns, dtype + ) + ) return + # Sometimes the ndarray representation of a list is different from a regular list. + # For instance, [(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")] + # is different from np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]). + # The list has the shape (3, 3) while the ndarray has the shape (3,). Therefore, do not modify + # the ndarray data. if not isinstance(data, np.ndarray): - # Sometimes the ndarray representation of a list is different from a regular list. - # For instance, [(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")] - # is different from np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]). - # The list has the shape (3, 3) while the ndarray has the shape (3,). # If only some data is a Snowpark pandas object, convert it to pandas objects. - res = [] - for v in data: - if isinstance(v, (Index, BasePandasDataset)): - res.append(v.to_pandas()) - else: - # # Need to convert this is a native pandas object since native pandas incorrectly - # # tries to perform `get_indexer` on it. Specify dtype=object so that pandas does not - # # cast the data provided. In some cases, None turns to NaN, which is not desired. - # res.append(native_pd.Index(v, dtype=object) if is_list_like(v) else v) - res.append(v) + res = [ + v.to_pandas() + if isinstance(v, (Index, BasePandasDataset)) + else v + for v in data + ] data = res query_compiler = from_pandas( @@ -638,14 +611,14 @@ def __init__( data=data, # Handle setting the index, if it is a lazy index, outside this block. index=None if isinstance(index, (Index, Series)) else index, - columns=try_convert_index_to_native(columns), + columns=columns, dtype=dtype, copy=copy, ) )._query_compiler - # 2. Setting the index - # -------------------- + # STEP 2: Setting the index + # ------------------------- # The index is already set if the data is a non-Snowpark pandas object. # If either the data or the index is a Snowpark pandas object, set the index here. if index is not None and ( @@ -665,23 +638,63 @@ def __init__( convert_index_to_list_of_qcs(index) ) - # 3. If data is a DataFrame, filter result - # ---------------------------------------- - if isinstance(data, DataFrame) and columns is not None: - # To select the columns for the resultant DataFrame, perform .loc[] on the created query compiler. - # This step is performed to ensure that the right columns are picked from the InternalFrame since we - # never explicitly drop the unwanted columns. - query_compiler = ( - DataFrame(query_compiler=query_compiler) - .loc[slice(None), columns] - ._query_compiler - ) - - # 4. Setting the query compiler - # ----------------------------- + # STEP 3: Setting the query compiler + # ---------------------------------- self._query_compiler = query_compiler +def _df_init_dict_data_with_snowpark_pandas_values( + data: AnyArrayLike | list, + index: list | AnyArrayLike | Series | Index, + columns: list | AnyArrayLike | Series | Index, + dtype: str | np.dtype | native_pd.ExtensionDtype | None, +): + # Special case: data is a dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series where the index is the index to be set since these values + # are always present in the final DataFrame. + from snowflake.snowpark.modin.pandas import concat + + values = [ + Series(v, index=index) if isinstance(v, Index) else v for v in data.values() + ] + new_qc = concat(values, axis=1, keys=data.keys())._query_compiler + if dtype is not None: + new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) + if index is not None: + new_qc = new_qc.reindex(axis=0, labels=convert_index_to_qc(index)) + if columns is not None: + new_qc = new_qc.reindex(axis=1, labels=columns) + return new_qc + + +def _df_init_list_data_with_snowpark_pandas_values( + data: AnyArrayLike | list, + index: list | AnyArrayLike | Series | Index, + columns: list | AnyArrayLike | Series | Index, + dtype: str | np.dtype | native_pd.ExtensionDtype | None, +): + # Special case: data is a list/dict where all the values are Snowpark pandas objects. + # Concat can only be performed with BasePandasDataset objects. + # If a value is an Index, convert it to a Series. + from snowflake.snowpark.modin.pandas import concat + + values = [Series(v) if isinstance(v, Index) else v for v in data] + new_qc = concat(values, axis=1).T._query_compiler + if dtype is not None: + new_qc = new_qc.astype({col: dtype for col in new_qc.columns}) + if index is not None: + new_qc = new_qc.set_index([convert_index_to_qc(index)]) + if columns is not None: + if all(isinstance(v, Index) for v in data): + # Special case: if all the values are Index objects, they are always present in the + # final result with the provided column names. Therefore, rename the columns. + new_qc = new_qc.set_columns(columns) + else: + new_qc = new_qc.reindex(axis=1, labels=columns) + return new_qc + + @register_dataframe_accessor("__dataframe__") def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): """ diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index bb3bb612b5..46cf45041a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -43,6 +43,7 @@ from snowflake.snowpark.modin.pandas.api.extensions import register_series_accessor from snowflake.snowpark.modin.pandas.utils import from_pandas, is_scalar from snowflake.snowpark.modin.plugin._internal.utils import ( + assert_fields_are_none, convert_index_to_list_of_qcs, convert_index_to_qc, error_checking_for_init, @@ -352,37 +353,36 @@ def __init__( from snowflake.snowpark.modin.plugin.extensions.index import Index - # 0. Setting the query compiler - # ----------------------------- + # Setting the query compiler + # -------------------------- if query_compiler is not None: # CASE I: query_compiler # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. - assert ( - data is None - ), "Invalid Series construction! Cannot pass both data and query_compiler." - assert ( - index is None - ), "Invalid Series construction! Cannot pass both index and query_compiler." + # Verify that the data and index parameters are None. + assert_fields_are_none(class_name="Series", data=data, index=index) self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name return + # A DataFrame cannot be used as an index and Snowpark pandas does not support the Categorical type yet. + # Check that index is not a DataFrame and dtype is not "category". error_checking_for_init(index, dtype) if isinstance(data, spd.DataFrame): + # data cannot be a DataFrame, raise a clear error message. # pandas raises an ambiguous error: # ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). raise ValueError("Data cannot be a DataFrame") # The logic followed here is: - # 1. Create a query_compiler from the provided data. - # 2. If an index is provided, set the index. This is either through set_index or reindex. - # 3. The resultant query_compiler is columnarized and set as the query_compiler for the Series. - # 4. If a name is provided, set the name. + # STEP 1: Create a query_compiler from the provided data. + # STEP 2: If an index is provided, set the index. This is either through set_index or reindex. + # STEP 3: The resultant query_compiler is columnarized and set as the query_compiler for the Series. + # STEP 4: If a name is provided, set the name. - # 1. Setting the data - # ------------------- + # STEP 1: Setting the data + # ------------------------ if isinstance(data, Index): # CASE II: Index # If the data is an Index object, convert it to a Series, and get the query_compiler. @@ -390,10 +390,15 @@ def __init__( data.to_series(index=None, name=name).reset_index(drop=True)._query_compiler ) - elif isinstance(data, type(self)): + elif isinstance(data, Series): # CASE III: Series - # If the data is a Series object, copy the query_compiler. - query_compiler = data._query_compiler.copy() + # If the data is a Series object, use its query_compiler. + query_compiler = data._query_compiler + if index is None and name is None and copy is False: + # When copy is False and no index and name are provided, the Series is a shallow copy of the original Series. + self._query_compiler = query_compiler + data._add_sibling(self) + return else: # CASE IV: Non-Snowpark pandas data @@ -433,8 +438,8 @@ def __init__( ) )._query_compiler - # 2. Setting the index - # -------------------- + # STEP 2: Setting the index + # ------------------------- # The index is already set if the data is a non-Snowpark pandas object. # If either the data or the index is a Snowpark pandas object, set the index here. if index is not None and ( @@ -454,8 +459,8 @@ def __init__( convert_index_to_list_of_qcs(index) ) - # 3 and 4. Setting the query compiler and name - # -------------------------------------------- + # STEP 3 and STEP 4: Setting the query compiler and name + # ------------------------------------------------------ self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index e83fcbe00b..04321efcc0 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -537,7 +537,7 @@ def operation(df: native_pd.DataFrame) -> native_pd.DataFrame: if group_keys else QUERY_COUNT_WITH_TRANSFORM_CHECK ), - join_count=2, + join_count=JOIN_COUNT + 1, udtf_count=UDTF_COUNT, ): snow_result = operation(mdf) @@ -719,7 +719,7 @@ def groupby_apply_without_sort(df): with SqlCounter( query_count=QUERY_COUNT_WITH_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=2, + join_count=JOIN_COUNT + 1, ): assert_snowpark_pandas_equal_to_pandas( groupby_apply_without_sort(snow_df).sort_values(), @@ -969,7 +969,7 @@ def test_args_and_kwargs(self, grouping_dfs_with_multiindexes): # One extra query to convert index to native pandas in dataframe constructor to create test dataframes query_count=QUERY_COUNT_WITHOUT_TRANSFORM_CHECK, udtf_count=UDTF_COUNT, - join_count=2, + join_count=JOIN_COUNT + 1, ) @pytest.mark.parametrize("index", [[2.0, np.nan, 2.0, 1.0], [np.nan] * 4]) def test_dropna(self, dropna, index): @@ -1089,7 +1089,7 @@ def test_dataframe_groupby_getitem(self, by, func, dropna, group_keys, sort): with SqlCounter( query_count=qc, udtf_count=UDTF_COUNT, - join_count=2, + join_count=JOIN_COUNT + 1, ): eval_snowpark_pandas_result( *create_test_dfs( diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 354bb2f11e..e15acd03e8 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -748,8 +748,8 @@ def test_create_df_with_mixed_series_index_dict_data(): native_data1 = native_pd.Series([1, 2, 3]) native_data2 = native_pd.Index([4, 5, 6]) data3 = [7, 8, 9] - snow_data1 = pd.Series([1, 2, 3]) - snow_data2 = pd.Index([4, 5, 6]) + snow_data1 = pd.Series(native_data1) + snow_data2 = pd.Index(native_data2) native_data = {"A": native_data1, "B": native_data2, "C": data3} snow_data = {"A": snow_data1, "B": snow_data2, "C": data3} @@ -761,7 +761,7 @@ def test_create_df_with_mixed_series_index_dict_data(): # Create DataFrame with dict data and Series index. native_ser_index = native_pd.Series([9, 2, 999]) - snow_ser_index = pd.Series([9, 2, 999]) + snow_ser_index = pd.Series(native_ser_index) native_df = native_pd.DataFrame(native_data, index=native_ser_index) snow_df = pd.DataFrame(snow_data, index=snow_ser_index) with SqlCounter(query_count=1): @@ -769,7 +769,7 @@ def test_create_df_with_mixed_series_index_dict_data(): # Create DataFrame with dict data and Index index. native_index = native_pd.Index([9, 2, 999]) - snow_index = pd.Index([9, 2, 999]) + snow_index = pd.Index(native_index) native_df = native_pd.DataFrame(native_data, index=native_index) snow_df = pd.DataFrame(snow_data, index=snow_index) with SqlCounter(query_count=1): @@ -860,3 +860,91 @@ def test_create_series_with_df_data_negative(): native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) with pytest.raises(ValueError, match="Data cannot be a DataFrame"): pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=1) +def test_create_df_with_name_in_columns(): + # Test DataFrame creation where the data is a named Series and its name is in the columns passed in. + # The column sharing the name with the Series takes on its values as the column values; the rest of the + # columns are filled with NaNs. + native_data = native_pd.Series([1, 2, 3], name="b") + snow_data = pd.Series(native_data) + columns = ["a", "b"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=1, join_count=1) +def test_create_df_with_name_not_in_columns_and_index(): + # Test DataFrame creation where the data is a named Series and its name is not in the columns passed in. + # The resultant DataFrame is filled with NaNs; the index and columns are set to the values provided. + native_data = native_pd.Series([1, 2, 3], name="b") + snow_data = pd.Series(native_data) + native_idx = native_pd.Index([1, 2, 3, 4, 5]) + snow_idx = pd.Index(native_idx) + columns = ["a", "c"] + native_df = native_pd.DataFrame(native_data, index=native_idx, columns=columns) + snow_df = pd.DataFrame(snow_data, index=snow_idx, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=1) +def test_create_df_with_df_and_subset_of_columns(): + # Test DataFrame creation where data is a DataFrame and only a subset of its columns are passed in. + # Only the columns passed in are used; the rest are ignored. In this case with end up with a single + # column DataFrame. + native_data = native_pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + snow_data = pd.DataFrame(native_data) + columns = ["a"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) + + +def test_create_df_with_copy(): + # When copy is True, the data is copied into the DataFrame, and the new DataFrame and data do not share references. + data = pd.DataFrame([[1, 2], [3, 4], [5, 6]]) + df_copy = pd.DataFrame(data, copy=True) + df_not_copy = pd.DataFrame(data, copy=False) + + with SqlCounter(query_count=3): + # Changing data should also change df_not_copy. It does not change df_copy. + data.iloc[0, 0] = 100 + assert data.iloc[0, 0] == df_not_copy.iloc[0, 0] == 100 + assert df_copy.iloc[0, 0] == 1 + + with SqlCounter(query_count=3): + # Similarly, changing df_not_copy should also change data. It does not change df_copy. + df_not_copy.iloc[0, 0] = 99 + assert data.iloc[0, 0] == df_not_copy.iloc[0, 0] == 99 + assert df_copy.iloc[0, 0] == 1 + + with SqlCounter(query_count=2): + # Changing df_copy should not change data or df_not_copy. + df_copy.iloc[0, 0] = 1000 + assert data.iloc[0, 0] == df_not_copy.iloc[0, 0] == 99 + + +def test_create_series_with_copy(): + # When copy is True, the data is copied into the Series, and the new Series and data do not share references. + data = pd.Series([1, 2, 3, 4, 5]) + series_copy = pd.Series(data, copy=True) + series_not_copy = pd.Series(data, copy=False) + + with SqlCounter(query_count=3): + # Changing data should also change series_not_copy. It does not change series_copy. + data.iloc[0] = 100 + assert data.iloc[0] == series_not_copy.iloc[0] == 100 + assert series_copy.iloc[0] == 1 + + with SqlCounter(query_count=3): + # Similarly, changing series_not_copy should also change data. It does not change series_copy. + series_not_copy.iloc[0] = 99 + assert data.iloc[0] == series_not_copy.iloc[0] == 99 + assert series_copy.iloc[0] == 1 + + with SqlCounter(query_count=2): + # Changing series_copy should not change data or series_not_copy. + series_copy.iloc[0] = 1000 + assert data.iloc[0] == series_not_copy.iloc[0] == 99 From 2eb14a7937a31bf75dd8f27f2afad35aed47a169 Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 18 Sep 2024 15:01:44 -0700 Subject: [PATCH 40/42] update query counts to use constants --- tests/integ/modin/groupby/test_groupby_apply.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index 04321efcc0..1ef80e33be 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -1074,9 +1074,9 @@ class TestSeriesGroupBy: def test_dataframe_groupby_getitem(self, by, func, dropna, group_keys, sort): """Test apply() on a SeriesGroupBy that we get by DataFrameGroupBy.__getitem__""" qc = ( - 6 + QUERY_COUNT_WITH_TRANSFORM_CHECK if group_keys is False and not func == get_scalar_from_numeric_series - else 5 + else QUERY_COUNT_WITHOUT_TRANSFORM_CHECK ) if ( func in (get_dataframe_from_numeric_series, get_series_from_numeric_series) From d9bbd9b0cbf06cafec17d43e799acc5c3d71575a Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Wed, 18 Sep 2024 15:10:27 -0700 Subject: [PATCH 41/42] remove docstring update, add docstrings for helper functions --- src/snowflake/snowpark/modin/plugin/docstrings/series.py | 2 +- .../modin/plugin/extensions/dataframe_overrides.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py index eb3f4e1135..9e4ebd4d25 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series.py @@ -78,7 +78,7 @@ class Series(BasePandasDataset): c 3 dtype: int64 - The keys of the dictionary match with the Index values, hence the dictionary + The keys of the dictionary match with the Index values, hence the Index values have no effect. >>> d = {'a': 1, 'b': 2, 'c': 3} diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index a9a5ea511e..d51a8d72db 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -649,6 +649,10 @@ def _df_init_dict_data_with_snowpark_pandas_values( columns: list | AnyArrayLike | Series | Index, dtype: str | np.dtype | native_pd.ExtensionDtype | None, ): + """ + Helper function for initializing a DataFrame with a dictionary where all the values + are Snowpark pandas objects. + """ # Special case: data is a dict where all the values are Snowpark pandas objects. # Concat can only be performed with BasePandasDataset objects. # If a value is an Index, convert it to a Series where the index is the index to be set since these values @@ -674,6 +678,10 @@ def _df_init_list_data_with_snowpark_pandas_values( columns: list | AnyArrayLike | Series | Index, dtype: str | np.dtype | native_pd.ExtensionDtype | None, ): + """ + Helper function for initializing a DataFrame with a list where all the values + are Snowpark pandas objects. + """ # Special case: data is a list/dict where all the values are Snowpark pandas objects. # Concat can only be performed with BasePandasDataset objects. # If a value is an Index, convert it to a Series. From f40c5b4a4a5f4d1230f13b4c2e15acc967909dbd Mon Sep 17 00:00:00 2001 From: Varnika Budati Date: Fri, 20 Sep 2024 11:28:42 -0700 Subject: [PATCH 42/42] try to break down df init into three steps: data, columns, and index --- .../snowpark/modin/plugin/_internal/utils.py | 34 +- .../plugin/extensions/dataframe_overrides.py | 118 +-- .../plugin/extensions/series_overrides.py | 7 +- .../test_df_series_creation_with_index.py | 892 ++++++++++++------ 4 files changed, 678 insertions(+), 373 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/utils.py b/src/snowflake/snowpark/modin/plugin/_internal/utils.py index 6d4e15fe09..a9a10a7ec2 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/utils.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/utils.py @@ -2062,6 +2062,9 @@ def convert_index_to_qc(index: Any) -> Any: if isinstance(index, Index): idx_qc = index.to_series()._query_compiler elif isinstance(index, Series): + # The name of the index comes from the Series' name, not the index name. `reindex` does not handle this, + # so we need to set the name of the index to the name of the Series. + index.index.name = index.name idx_qc = index._query_compiler else: idx_qc = Series(index)._query_compiler @@ -2109,7 +2112,6 @@ def convert_index_to_list_of_qcs(index: Any) -> list: def add_extra_columns_and_select_required_columns( query_compiler: Any, columns: Union[AnyArrayLike, list], - data_columns: Union[AnyArrayLike, list], ) -> Any: """ Method to add extra columns to and select the required columns from the provided query compiler. @@ -2123,26 +2125,32 @@ def add_extra_columns_and_select_required_columns( The query compiler to select columns from, i.e., data's query compiler. columns: AnyArrayLike or list The columns to select from the query compiler. - data_columns: AnyArrayLike or list - The columns in the data. This is data.columns if data is a DataFrame or data.name if data is a Series. - """ from modin.pandas import DataFrame + data_columns = query_compiler.get_columns().to_list() # The `columns` parameter is used to select the columns from `data` that will be in the resultant DataFrame. # If a value in `columns` is not present in data's columns, it will be added as a new column filled with NaN values. # These columns are tracked by the `extra_columns` variable. if data_columns is not None and columns is not None: extra_columns = [col for col in columns if col not in data_columns] - # To add these new columns to the DataFrame, perform `__getitem__` only with the extra columns - # and set them to None. - extra_columns_df = DataFrame(query_compiler=query_compiler) - extra_columns_df[extra_columns] = None - query_compiler = extra_columns_df._query_compiler + if extra_columns is not []: + # To add these new columns to the DataFrame, perform `__getitem__` only with the extra columns + # and set them to None. + extra_columns_df = DataFrame(query_compiler=query_compiler) + # In the case that the columns are MultiIndex but not all extra columns are tuples, we need to flatten the + # columns to ensure that the columns are a single-level index. If not, `__getitem__` will raise an error + # when trying to add new columns that are not in the expected tuple format. + if not all(isinstance(col, tuple) for col in extra_columns) and isinstance( + query_compiler.get_columns(), native_pd.MultiIndex + ): + flattened_columns = extra_columns_df.columns.to_flat_index() + extra_columns_df.columns = flattened_columns + extra_columns_df[extra_columns] = None + query_compiler = extra_columns_df._query_compiler - # To select the columns for the resultant DataFrame, perform `.loc[]` on the created query compiler. + # To select the columns for the resultant DataFrame, perform `__getitem__` on the created query compiler. # This step is performed to ensure that the right columns are picked from the InternalFrame since we - # never explicitly drop the unwanted columns. `.loc[]` also ensures that the columns in the resultant + # never explicitly drop the unwanted columns. `__getitem__` also ensures that the columns in the resultant # DataFrame are in the same order as the columns in the `columns` parameter. - columns = slice(None) if columns is None else columns - return DataFrame(query_compiler=query_compiler).loc[:, columns]._query_compiler + return DataFrame(query_compiler=query_compiler)[columns]._query_compiler diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index d51a8d72db..65152181d3 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -60,6 +60,7 @@ is_numeric_dtype, ) from pandas.core.dtypes.inference import is_hashable, is_integer +from pandas.core.indexes.base import ensure_index from pandas.core.indexes.frozen import FrozenList from pandas.io.formats.printing import pprint_thing from pandas.util._validators import validate_bool_kwarg @@ -472,7 +473,6 @@ def __init__( # Setting the query compiler # -------------------------- if query_compiler is not None: - # CASE I: query_compiler # If a query_compiler is passed in only use the query_compiler field to create a new DataFrame. # Verify that the data, index, and columns parameters are None. assert_fields_are_none( @@ -485,50 +485,38 @@ def __init__( # Check that index is not a DataFrame and dtype is not "category". error_checking_for_init(index, dtype) - # Convert columns to a local object if it is a lazy Index. - columns = try_convert_index_to_native(columns) + # Convert columns to a local object if it is lazy. + if columns is not None: + columns = ( + columns.to_pandas() + if isinstance(columns, (Index, BasePandasDataset)) + else columns + ) + columns = ensure_index(columns) # The logic followed here is: - # STEP 1: Create a query_compiler from the provided data. If columns are provided, add/select the columns. - # STEP 2: If an index is provided, set the index through set_index or reindex. - # STEP 3: The resultant query_compiler is then set as the query_compiler for the DataFrame. - - # STEP 1: Setting the data (and columns) - # -------------------------------------- + # STEP 1: Obtain the query_compiler from the provided data if the data is lazy. If data is local, the query + # compiler is None. + # STEP 2: If columns are provided, set the columns if data is lazy. + # STEP 3: If both the data and index are local (or index is None), create a query compiler from pandas. + # STEP 4: Otherwise, set the index through set_index or reindex. + # STEP 5: The resultant query_compiler is then set as the query_compiler for the DataFrame. + + # STEP 1: Setting the data + # ------------------------ if isinstance(data, Index): - # CASE II: data is a Snowpark pandas Index # If the data is an Index object, convert it to a DataFrame to make sure that the values are in the # correct format: the values should be a data column, not an index column. - if data.name is None: - # If no name is provided, the default name is 0. Otherwise, only use the first value in `columns` to - # set the column name; this is because the resultant DataFrame will have only one column. - new_name = 0 if columns is None else columns[0] - else: - new_name = data.name - query_compiler = data.to_frame(index=False, name=new_name)._query_compiler - + # Converting the Index object to its DataFrame version sets the resultant DataFrame's column name correctly - + # it should be 0 if the name is None. + query_compiler = data.to_frame(index=False)._query_compiler elif isinstance(data, Series): - # CASE III: data is a Snowpark pandas Series - # If the Series `data` has no name, the default name is 0. - name = [data.name] if data.name is not None else [0] - if columns is None: - # If no columns are provided, the resultant DataFrame has only one column. - # The column name is the Series' name. - query_compiler = data._query_compiler.set_columns(name) - elif data.name in columns: - # Treat any columns that are not data.name as extra columns. They will be appended as NaN columns. - # After this, select the required columns in the order provided by `columns`. - query_compiler = add_extra_columns_and_select_required_columns( - data._query_compiler, columns, name - ) - else: - # If the columns provided are not in the named Series, pandas clears the DataFrame and sets columns. - query_compiler = from_pandas( - native_pd.DataFrame(columns=columns) - )._query_compiler - + # Rename the Series object to 0 if its name is None and grab its query compiler. + query_compiler = data.rename( + 0 if data.name is None else data.name, inplace=False + )._query_compiler elif isinstance(data, DataFrame): - # CASE IV: data is a Snowpark pandas DataFrame + query_compiler = data._query_compiler if columns is None and index is None: # Special case: if the new DataFrame has the same columns and index as the original DataFrame, # the query compiler is shared and kept track of as a sibling. @@ -537,19 +525,38 @@ def __init__( # When copy is False, the DataFrame is a shallow copy of the original DataFrame. data._add_sibling(self) return - # Treat any columns that are not in data.columns as extra columns. They will be appended as NaN columns. - # After this, select the required columns in the order provided by `columns`. - query_compiler = add_extra_columns_and_select_required_columns( - data._query_compiler, columns, data.columns - ) - else: - # CASE V: Non-Snowpark pandas data + # STEP 2: Setting the columns if data is lazy + # ------------------------------------------- + # When data is lazy, the query compiler is not None. + if query_compiler is not None: + if columns is not None: + if ( + isinstance(data, (Index, Series)) + and query_compiler.get_columns()[0] not in columns + ): + # If the name of the Series/Index is not in the columns, clear the DataFrame and set the columns. + query_compiler = from_pandas( + native_pd.DataFrame(columns=columns) + )._query_compiler + else: + # Treat any columns not in data.columns (or data.name if data is a Series/Index) as extra columns. + # They will be appended as NaN columns. Then, select the required columns in the order provided by `columns`. + query_compiler = add_extra_columns_and_select_required_columns( + query_compiler, columns + ) + + # STEP 3: Creating a query compiler from pandas + # --------------------------------------------- + else: # When the data is local, the query compiler is None. + # If the data, columns, and index are local objects, the query compiler representation is created from pandas. + # However, when the data is a dict but the index is lazy, the index is converted to pandas and the query + # compiler is created from pandas. if not isinstance( data, (native_pd.Series, native_pd.DataFrame, native_pd.Index) ) and is_list_like(data): + # If data is a pandas object, directly handle it with the pandas constructor. if is_dict_like(data): - # Setting up keys and values for processing if all the values are Snowpark pandas objects. if columns is not None: # Reduce the dictionary to only the relevant columns as the keys. data = {key: value for key, value in data.items() if key in columns} @@ -565,18 +572,19 @@ def __init__( ) return - # If only some data is a Snowpark pandas object, convert it to pandas objects. + # If only some data is a Snowpark pandas object, convert the lazy data to pandas objects. res = {} - index = try_convert_index_to_native(index) for k, v in data.items(): if isinstance(v, Index): res[k] = v.to_pandas() elif isinstance(v, BasePandasDataset): # Need to perform reindex on the Series or DataFrame objects since only the data # whose index matches the given index is kept. - res[k] = v.to_pandas().reindex(index=index) + res[k] = v.reindex(index=index).to_pandas() else: res[k] = v + # If the index is lazy, convert it to a pandas object so that the pandas constructor can handle it. + index = try_convert_index_to_native(index) data = res else: # list-like but not dict-like data. @@ -594,8 +602,8 @@ def __init__( # Sometimes the ndarray representation of a list is different from a regular list. # For instance, [(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")] # is different from np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]). - # The list has the shape (3, 3) while the ndarray has the shape (3,). Therefore, do not modify - # the ndarray data. + # The list has the shape (3, 3) while the ndarray has the shape (3,). + # Therefore, do not modify the ndarray data. if not isinstance(data, np.ndarray): # If only some data is a Snowpark pandas object, convert it to pandas objects. res = [ @@ -609,7 +617,7 @@ def __init__( query_compiler = from_pandas( native_pd.DataFrame( data=data, - # Handle setting the index, if it is a lazy index, outside this block. + # Handle setting the index, if it is a lazy index, outside this block in STEP 4. index=None if isinstance(index, (Index, Series)) else index, columns=columns, dtype=dtype, @@ -617,9 +625,9 @@ def __init__( ) )._query_compiler - # STEP 2: Setting the index + # STEP 4: Setting the index # ------------------------- - # The index is already set if the data is a non-Snowpark pandas object. + # The index is already set if the data and index are non-Snowpark pandas objects. # If either the data or the index is a Snowpark pandas object, set the index here. if index is not None and ( isinstance(index, (Index, Series)) @@ -638,7 +646,7 @@ def __init__( convert_index_to_list_of_qcs(index) ) - # STEP 3: Setting the query compiler + # STEP 5: Setting the query compiler # ---------------------------------- self._query_compiler = query_compiler diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 34f9b9c690..e2ed2580cd 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -352,7 +352,6 @@ def __init__( # Setting the query compiler # -------------------------- if query_compiler is not None: - # CASE I: query_compiler # If a query_compiler is passed in, only use the query_compiler and name fields to create a new Series. # Verify that the data and index parameters are None. assert_fields_are_none(class_name="Series", data=data, index=index) @@ -380,14 +379,14 @@ def __init__( # STEP 1: Setting the data # ------------------------ if isinstance(data, Index): - # CASE II: Index + # CASE I: Index # If the data is an Index object, convert it to a Series, and get the query_compiler. query_compiler = ( data.to_series(index=None, name=name).reset_index(drop=True)._query_compiler ) elif isinstance(data, Series): - # CASE III: Series + # CASE II: Series # If the data is a Series object, use its query_compiler. query_compiler = data._query_compiler if index is None and name is None and copy is False: @@ -397,7 +396,7 @@ def __init__( return else: - # CASE IV: Non-Snowpark pandas data + # CASE III: Non-Snowpark pandas data # If the data is not a Snowpark pandas object, convert it to a query compiler. # The query compiler uses the '__reduced__' name internally as a column name to represent pandas # Series objects that are not explicitly assigned a name. diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index e15acd03e8..06ed621fc6 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -38,28 +38,187 @@ def obj_type_helper(obj_type: str) -> tuple: return assert_equal_func, snow_obj, native_obj, kwargs +# DATA TESTS +# ---------- @pytest.mark.parametrize( - "native_idx", + "native_data", [ native_pd.Index([1, 2, 3, 4], name="some name"), - native_pd.Index(list(range(250))), + native_pd.Index(list(range(200))), native_pd.Index(["A", None, 2.3, 1], name="AAAAA"), native_pd.Index([]), + native_pd.Series([1, 2, 3, 4], name="some name"), + native_pd.Series(list(range(100))), + native_pd.Series(["A", None, 2.3, 1], name="AAAAA"), + native_pd.Series([]), + [], + ["A", "B", "C"], + None, ], ) @pytest.mark.parametrize("obj_type", ["series", "df"]) -@sql_count_checker(query_count=1, join_count=0) -def test_create_with_index_as_data(native_idx, obj_type): +@sql_count_checker(query_count=1) +def test_create_with_data(native_data, obj_type): """ - Creating a Series where the data is an Index. + Creating a DataFrame/Series where the data is an Index, Series, or list. """ - snow_idx = pd.Index(native_idx) + if isinstance(native_data, native_pd.Series): + snow_data = pd.Series(native_data) + elif isinstance(native_data, native_pd.Index): + snow_data = pd.Index(native_data) + else: + snow_data = native_data + assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(snow_data), + native_obj(native_data), + check_dtype=False, + check_index_type=False, + **kwargs, + ) + + +# INDEX TESTS +# ----------- +@pytest.mark.parametrize( + "index", [[1, 2, 3, 4], list(range(100)), ["A", None, 2.3, 1], []] +) +@pytest.mark.parametrize("index_type", ["series", "index", "list"]) +@pytest.mark.parametrize("obj_type", ["series", "df"]) +def test_create_with_index(index, index_type, obj_type): + with SqlCounter(query_count=1, join_count=0 if index_type == "list" else 1): + # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. + if index_type == "series": + native_index, snow_index = native_pd.Series(index), pd.Series(index) + elif index_type == "index": + native_index, snow_index = native_pd.Index(index), pd.Index(index) + else: + native_index, snow_index = index, index + assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) + assert_equal_func( + snow_obj(index=snow_index, dtype=object), + native_obj(index=native_index, dtype=object), + check_index_type=False, + **kwargs, + ) + + +@pytest.mark.parametrize( + "index", [[1, 2, 3, 4], list(range(100)), ["A", None, 2.3, 1], []] +) +@pytest.mark.parametrize("index_type", ["series", "index"]) +@pytest.mark.parametrize("index_name", [None, "index name!", ("tuple", "name")]) +@pytest.mark.parametrize("obj_type", ["series", "df"]) +@sql_count_checker(query_count=1, join_count=1) +def test_create_with_named_index(index, index_type, index_name, obj_type): + # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. + if index_type == "series": + native_index, snow_index = native_pd.Series(index, name=index_name), pd.Series( + index, name=index_name + ) + else: + native_index, snow_index = native_pd.Index(index, name=index_name), pd.Index( + index, name=index_name + ) assert_equal_func, snow_obj, native_obj, kwargs = obj_type_helper(obj_type) assert_equal_func( - snow_obj(snow_idx), native_obj(native_idx), check_dtype=False, **kwargs + snow_obj(index=snow_index, dtype=object), + native_obj(index=native_index, dtype=object), + check_index_type=False, + **kwargs, ) +# COLUMN TESTS +# ------------ +@pytest.mark.parametrize( + "columns", + [ + ["A"], + ("A", "B"), + [("A", "B")], + ["A", "B", "C"], + ["A", ("B", "C")], + [("A", "B"), ("C", "D")], + native_pd.Index(["A", "B", "C"]), + np.array([("A", "B"), ("B", "C")]), + ], +) +@sql_count_checker(query_count=1) +def test_create_df_with_columns(columns): + # Test DataFrame creation with only columns passed in. + native_df = native_pd.DataFrame(columns=columns) + snow_df = pd.DataFrame(columns=columns) + assert_frame_equal(snow_df, native_df) + + +@pytest.mark.parametrize( + "columns", + [ + ["A"], + ("A", "B"), + ["A", "B", "C"], + ["A", ("B", "C")], + ], +) +@pytest.mark.parametrize("column_type", ["index", "series"]) +@pytest.mark.parametrize("column_name", [None, "index name!", ("tuple", "name")]) +def test_create_df_with_lazy_columns(columns, column_type, column_name): + # Test DataFrame creation with only lazy columns passed in. + with SqlCounter(query_count=2 if column_type == "index" else 2): + if column_type == "index": + native_cols = native_pd.Index(data=columns, name=column_name) + snow_cols = pd.Index(data=columns, name=column_name) + else: + native_cols = native_pd.Series(data=columns, name=column_name) + snow_cols = pd.Series(data=columns, name=column_name) + native_df = native_pd.DataFrame(columns=native_cols) + snow_df = pd.DataFrame(columns=snow_cols) + # If the column name has a tuple in it, convert it to a list and check whether this matches the Snowpark pandas + # result. This is because any tuple values stored in Snowflake are converted to lists. + # Here, the column name is derived from the values of an Index/Series object stored in Snowflake. + if any(isinstance(col, tuple) for col in columns): + native_columns = [ + list(col) if isinstance(col, tuple) else col + for col in native_df.columns + ] + assert native_columns == snow_df.columns.tolist() + # Set the Snowpark pandas DataFrame's columns to the native pandas DataFrame's columns to allow for + # easier comparison between the two objects + snow_df.columns = native_df.columns + assert_frame_equal(snow_df, native_df) + + +@pytest.mark.parametrize("columns", [[("A", "B")], [("A", "B"), ("C", "D")]]) +@pytest.mark.parametrize("column_type", ["index", "series"]) +@pytest.mark.parametrize("column_name", [None, ("tuple", "name")]) +@sql_count_checker(query_count=2) +def test_create_df_with_lazy_multiindex_columns(columns, column_type, column_name): + # Test DataFrame creation with only lazy columns passed in. + if column_type == "index": + native_cols = native_pd.Index(data=columns, name=column_name) + snow_cols = pd.Index(data=columns, name=column_name) + else: + native_cols = native_pd.Series(data=columns, name=column_name) + snow_cols = pd.Series(data=columns, name=column_name) + native_df = native_pd.DataFrame(columns=native_cols) + snow_df = pd.DataFrame(columns=snow_cols) + # If the column name has a tuple in it, convert it to a list and check whether this matches the Snowpark pandas + # result. This is because any tuple values stored in Snowflake are converted to lists. + # Here, the column name is derived from the values of an Index/Series object stored in Snowflake. + if column_type == "series" and any(isinstance(col, tuple) for col in columns): + native_columns = [ + list(col) if isinstance(col, tuple) else col for col in native_df.columns + ] + assert native_columns == snow_df.columns.tolist() + # Set the Snowpark pandas DataFrame's columns to the native pandas DataFrame's columns to allow for + # easier comparison between the two objects + snow_df.columns = native_df.columns + assert_frame_equal(snow_df, native_df) + + +# DATA AND INDEX TESTS +# -------------------- @pytest.mark.parametrize( "data, native_idx", [ @@ -345,127 +504,6 @@ def test_create_df_with_empty_df_as_data_and_index_as_index(native_df, native_in ) -@pytest.mark.parametrize( - "native_df, native_index, columns", - [ - # Single column DataFrames. - ( - native_pd.DataFrame(list(range(20))), - native_pd.Index(list(range(20))), - [1], - ), # all index values match - ( - native_pd.DataFrame(["A", "V", "D", "R"]), - native_pd.Index([10, 20, 30, 40], name="none"), - ["A"], - ), # no index values match, column missing - # Multi-column DataFrames. - ( - native_pd.DataFrame( - {"col1": ["A", "B", "C", "D"], "col2": ["B", "H", "T", "W"]}, - index=[1.1, 2.2, 3, 4], - ), - native_pd.Index([1, 2, 3, 4], name="some name"), - ["col1"], - ), # some index values are missing, subset of columns - ( - native_pd.DataFrame( - [[10, 20, 30, 40], [2, 4, 6, 7], [-1, -2, -3, -4], [90, 50, 30, 10]], - index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), - columns=["C", "L", "M", "W"], - ), - native_pd.Index(["B", 0, None, 3.14]), - [3, 1], - ), # rearranged index and column values - ( - native_pd.DataFrame( - [["A", "B", "C", "D", "E"], ["R", "S", "T", "U", "V"]], - columns=[1, 2, 3, 4, 5], - ), - native_pd.Index([3, 4], name="index"), - ["A", "V", "C"], - ), # subset of index values - ( - native_pd.DataFrame([list(range(20)), list(range(20))]), - native_pd.Index(list(range(20))), - [1], - ), # all index values match - ( - native_pd.DataFrame( - { - "A": ["A", "V", "D", "R"], - "V": ["V", "D", "R", "A"], - "D": ["D", "R", "A", "V"], - "R": ["R", "A", "V", "D"], - } - ), - native_pd.Index([10, 20, 30, 40], name="none"), - ["A", "X", "D", "R"], - ), # no index values match - ( - native_pd.DataFrame([]), - native_pd.Index([], name="empty index", dtype="int64"), - [], - ), # empty data, index, and columns - ( - native_pd.DataFrame([]), - native_pd.Index(["A", "V"], name="non-empty index"), - ["A", "V"], - ), # empty data, non-empty index and columns - ( - { - "A": [1, 2, 3], - "B": [4, 5, 6], - }, # dict data should behave similar to DataFrame data - native_pd.Index([10, 0, 1], name="non-empty index"), - ["A", "C"], - ), - ], -) -@pytest.mark.parametrize("column_type", ["list", "index"]) -def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( - native_df, native_index, columns, column_type -): - """ - Creating a DataFrame where the data is a DataFrame, the index is an Index, and non-existent columns. - """ - # Two joins are performed: one from joining the data and index parameters to have a query compiler whose - # index columns match the provided index, and one from performing .loc[] to filter the generated qc. - # One extra query is required to create the columns if it is an Index (column_type is "index"). - native_columns = columns if column_type == "list" else native_pd.Index(columns) - snow_columns = columns if column_type == "list" else pd.Index(columns) - snow_df = ( - pd.DataFrame(native_df) - if isinstance(native_df, native_pd.DataFrame) - else native_df - ) - snow_index = pd.Index(native_index) - qc = 1 if column_type == "list" else 2 - qc += 1 if (isinstance(native_df, dict)) else 0 - qc += 1 if (isinstance(native_df, dict) and column_type == "index") else 0 - jc = 1 if isinstance(native_df, native_pd.DataFrame) else 0 - with SqlCounter(query_count=qc, join_count=jc): - assert_frame_equal( - pd.DataFrame(snow_df, index=snow_index, columns=native_columns), - native_pd.DataFrame(native_df, index=native_index, columns=snow_columns), - check_dtype=False, - ) - - -@sql_count_checker(query_count=1) -def test_create_df_with_new_columns(): - """ - Creating a DataFrame with columns that don't exist in `data`. - """ - native_df = native_pd.DataFrame(list(range(100))) - snow_df = pd.DataFrame(native_df) - assert_frame_equal( - pd.DataFrame(snow_df, columns=["new column"]), - native_pd.DataFrame(native_df, columns=["new column"]), - check_dtype=False, - ) - - @sql_count_checker(query_count=2) def test_create_df_with_dict_as_data_and_index_as_index(): """ @@ -485,54 +523,6 @@ def test_create_df_with_dict_as_data_and_index_as_index(): assert_frame_equal(snow_df, native_df) -@sql_count_checker(query_count=1) -def test_create_series_with_list_of_lists_index(): - # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. - arrays = [ - np.array(["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"]), - np.array(["two", "one", "two", "one", "two", "one", "two", "one"]), - ] - data = [1, 2, 3, 4, 5, 6, 7, 8] - native_series = native_pd.Series(data, index=arrays) - snow_series = pd.Series(data, index=arrays) - assert_series_equal(snow_series, native_series) - - -@sql_count_checker(query_count=1, join_count=2) -def test_create_series_with_index_data_and_list_of_lists_index(): - # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. - arrays = [ - ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], - ["two", "one", "two", "one", "two", "one", "two", "one"], - ] - data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) - native_series = native_pd.Series(data, index=arrays) - snow_series = pd.Series(pd.Index(data), index=arrays) - assert_series_equal(snow_series, native_series) - - -@sql_count_checker(query_count=1, join_count=2) -def test_create_df_with_index_data_and_list_of_lists_index(): - # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. - arrays = [ - ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], - ["two", "one", "two", "one", "two", "one", "two", "one"], - ] - data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) - native_df = native_pd.DataFrame(data, index=arrays) - snow_df = pd.DataFrame(pd.Index(data), index=arrays) - assert_frame_equal(snow_df, native_df) - - -@sql_count_checker(query_count=1) -def test_create_series_with_none_data_and_non_empty_index(): - # When creating an empty Series with a non-empty index, the index should be used as the index of the Series. - index = ["A", "B", "C", "D"] - native_series = native_pd.Series(None, index=index, dtype=object) - snow_series = pd.Series(None, index=index, dtype=object) - assert_series_equal(snow_series, native_series) - - @pytest.mark.parametrize( "data1, data2", [("series", "series"), ("series", "index"), ("index", "index")] ) @@ -682,26 +672,26 @@ def test_create_series_with_series_index_list_data(data1, data2): snow_data = [snow_data1, snow_data2] # Create Series only with list data. - native_df = native_pd.Series(native_data) - snow_df = pd.Series(snow_data) + native_ser = native_pd.Series(native_data) + snow_ser = pd.Series(snow_data) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df) + assert_series_equal(snow_ser, native_ser) # Create Series with list data and Series index. native_ser_index = native_pd.Series([2, 11]) snow_ser_index = pd.Series([2, 11]) - native_df = native_pd.Series(native_data, index=native_ser_index) - snow_df = pd.Series(snow_data, index=snow_ser_index) + native_ser = native_pd.Series(native_data, index=native_ser_index) + snow_ser = pd.Series(snow_data, index=snow_ser_index) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df, check_dtype=False) + assert_series_equal(snow_ser, native_ser, check_dtype=False) # Create Series with list data and Index index. native_index = native_pd.Index([22, 11]) snow_index = pd.Index([22, 11]) - native_df = native_pd.Series(native_data, index=native_index) - snow_df = pd.Series(snow_data, index=snow_index) + native_ser = native_pd.Series(native_data, index=native_index) + snow_ser = pd.Series(snow_data, index=snow_index) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df, check_dtype=False) + assert_series_equal(snow_ser, native_ser, check_dtype=False) @pytest.mark.parametrize( @@ -721,100 +711,69 @@ def test_create_series_with_series_index_dict_data(data1, data2): snow_data = {11: snow_data1, 22: snow_data2} # Create DataFrame only with dict data. - native_df = native_pd.Series(native_data) - snow_df = pd.Series(snow_data) + native_ser = native_pd.Series(native_data) + snow_ser = pd.Series(snow_data) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df) + assert_series_equal(snow_ser, native_ser) # Create DataFrame with dict data and Series index. native_ser_index = native_pd.Series([9, 2, 999]) snow_ser_index = pd.Series([9, 2, 999]) - native_df = native_pd.Series(native_data, index=native_ser_index) - snow_df = pd.Series(snow_data, index=snow_ser_index) + native_ser = native_pd.Series(native_data, index=native_ser_index) + snow_ser = pd.Series(snow_data, index=snow_ser_index) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df) + assert_series_equal(snow_ser, native_ser) # Create DataFrame with dict data and Index index. native_index = native_pd.Index([9, 2, 999]) snow_index = pd.Index([9, 2, 999]) - native_df = native_pd.Series(native_data, index=native_index) - snow_df = pd.Series(snow_data, index=snow_index) + native_ser = native_pd.Series(native_data, index=native_index) + snow_ser = pd.Series(snow_data, index=snow_index) with SqlCounter(query_count=1): - assert_series_equal(snow_df, native_df) + assert_series_equal(snow_ser, native_ser) -def test_create_df_with_mixed_series_index_dict_data(): - # Create the dict data. - native_data1 = native_pd.Series([1, 2, 3]) - native_data2 = native_pd.Index([4, 5, 6]) - data3 = [7, 8, 9] - snow_data1 = pd.Series(native_data1) - snow_data2 = pd.Index(native_data2) - native_data = {"A": native_data1, "B": native_data2, "C": data3} - snow_data = {"A": snow_data1, "B": snow_data2, "C": data3} +@sql_count_checker(query_count=1) +def test_create_series_with_list_data_and_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + np.array(["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"]), + np.array(["two", "one", "two", "one", "two", "one", "two", "one"]), + ] + data = [1, 2, 3, 4, 5, 6, 7, 8] + native_series = native_pd.Series(data, index=arrays) + snow_series = pd.Series(data, index=arrays) + assert_series_equal(snow_series, native_series) - # Create DataFrame only with dict data. - native_df = native_pd.DataFrame(native_data) - snow_df = pd.DataFrame(snow_data) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) - # Create DataFrame with dict data and Series index. - native_ser_index = native_pd.Series([9, 2, 999]) - snow_ser_index = pd.Series(native_ser_index) - native_df = native_pd.DataFrame(native_data, index=native_ser_index) - snow_df = pd.DataFrame(snow_data, index=snow_ser_index) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) +@sql_count_checker(query_count=1, join_count=2) +def test_create_series_with_index_data_and_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], + ["two", "one", "two", "one", "two", "one", "two", "one"], + ] + data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) + native_series = native_pd.Series(data, index=arrays) + snow_series = pd.Series(pd.Index(data), index=arrays) + assert_series_equal(snow_series, native_series) - # Create DataFrame with dict data and Index index. - native_index = native_pd.Index([9, 2, 999]) - snow_index = pd.Index(native_index) - native_df = native_pd.DataFrame(native_data, index=native_index) - snow_df = pd.DataFrame(snow_data, index=snow_index) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) - - # Create DataFrame with dict data, Series index, and columns. - columns = ["A", "B", "C"] - native_df = native_pd.DataFrame( - native_data, index=native_ser_index, columns=columns - ) - snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) - # Create DataFrame with dict data, Index index, and Index columns. - native_columns = native_pd.Index(columns) - snow_columns = pd.Index(columns) - native_df = native_pd.DataFrame( - native_data, index=native_index, columns=native_columns - ) - snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) - with SqlCounter(query_count=1): - assert_frame_equal(snow_df, native_df) - - -@sql_count_checker(query_count=2) -def test_create_df_with_mixed_series_index_list_data_negative(): - """ - Since Snowpark pandas relies on native pandas for initialization a DataFrame with mixed data types, - they both raise the same error. - """ - # Create the list data. - data1 = native_pd.Series([1, 2, 3]) - data2 = native_pd.Index([4, 5, 6]) - data3 = [7, 8, 9] - # Need to convert data3 to an Index since native pandas tries to perform `get_indexer` on it. - err_msg = "'builtin_function_or_method' object has no attribute 'get_indexer'" - with pytest.raises(AttributeError, match=err_msg): - native_pd.DataFrame([data1, data2, data3]) - with pytest.raises(AttributeError, match=err_msg): - pd.DataFrame([pd.Series(data1), pd.Index(data2), data3]) +@sql_count_checker(query_count=1, join_count=2) +def test_create_df_with_index_data_and_list_of_lists_index(): + # When given a list of lists as the index, this index needs to be converted to a MultiIndex before processing. + arrays = [ + ["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"], + ["two", "one", "two", "one", "two", "one", "two", "one"], + ] + data = native_pd.Index([1, 2, 3, 4, 5, 6, 7, 8]) + native_df = native_pd.DataFrame(data, index=arrays) + snow_df = pd.DataFrame(pd.Index(data), index=arrays) + assert_frame_equal(snow_df, native_df) @pytest.mark.xfail( - reason="SNOW-1638397 DataFrane creation fails: reindex does not work with string index" + reason="SNOW-1638397 DataFrame creation fails: reindex does not work with string index" ) def test_create_df_with_series_data_and_series_index(): # Create the data and index. @@ -830,40 +789,37 @@ def test_create_df_with_series_data_and_series_index(): assert_frame_equal(snow_df, native_df) -@sql_count_checker(query_count=0) -def test_create_df_with_df_index_negative(): - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - native_pd.DataFrame( - [1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]]) - ) - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - pd.DataFrame([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) - - -@sql_count_checker(query_count=0) -def test_create_series_with_df_index_negative(): - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - native_pd.Series([1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) - with pytest.raises(ValueError, match="Index data must be 1-dimensional"): - pd.Series([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) +# DATA AND COLUMN TESTS +# ---------------------- +@sql_count_checker(query_count=1) +def test_create_df_with_df_data_and_subset_of_columns(): + # Test DataFrame creation where data is a DataFrame and only a subset of its columns are passed in. + # Only the columns passed in are used; the rest are ignored. In this case with end up with a single + # column DataFrame. + native_data = native_pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + snow_data = pd.DataFrame(native_data) + columns = ["a"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) -@sql_count_checker(query_count=0) -def test_create_series_with_df_data_negative(): - with pytest.raises( - ValueError, - match=re.escape( - "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool()" - ", a.item(), a.any() or a.all()." - ), - ): - native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) - with pytest.raises(ValueError, match="Data cannot be a DataFrame"): - pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) +@sql_count_checker(query_count=1) +def test_create_df_with_df_data_and_new_columns(): + """ + Creating a DataFrame with columns that don't exist in `data`. + """ + native_df = native_pd.DataFrame(list(range(100))) + snow_df = pd.DataFrame(native_df) + assert_frame_equal( + pd.DataFrame(snow_df, columns=["new column"]), + native_pd.DataFrame(native_df, columns=["new column"]), + check_dtype=False, + ) @sql_count_checker(query_count=1) -def test_create_df_with_name_in_columns(): +def test_create_df_with_df_data_and_name_in_columns(): # Test DataFrame creation where the data is a named Series and its name is in the columns passed in. # The column sharing the name with the Series takes on its values as the column values; the rest of the # columns are filled with NaNs. @@ -875,6 +831,299 @@ def test_create_df_with_name_in_columns(): assert_frame_equal(snow_df, native_df) +@sql_count_checker(query_count=1) +def test_create_df_with_df_data_and_name_not_in_columns(): + # Test DataFrame creation where the data is a named Series and its name is not in the columns passed in. + # The result is an empty DataFrame with the columns set. + native_data = native_pd.Series([1, 2, 3], name="b") + snow_data = pd.Series(native_data) + columns = ["a", "c"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=1) +def test_create_df_with_df_data_and_tuple_name_in_columns(): + # Test DataFrame creation where the data is a named Series and its name is in the columns passed in. + # The column sharing the name with the Series takes on its values as the column values; the rest of the + # columns are filled with NaNs. + native_data = native_pd.Series([1, 2, 3], name=("b", "a")) + snow_data = pd.Series(native_data) + columns = [("b", "a"), "b"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=1) +def test_create_df_with_df_data_and_tuple_name_not_in_columns(): + # Test DataFrame creation where the data is a named Series and its name is not in the columns passed in. + # The result is an empty DataFrame with the columns set. + native_data = native_pd.Series([1, 2, 3], name=("b", "a")) + snow_data = pd.Series(native_data) + columns = [("b", "c"), "b"] + native_df = native_pd.DataFrame(native_data, columns=columns) + snow_df = pd.DataFrame(snow_data, columns=columns) + assert_frame_equal(snow_df, native_df) + + +# INDEX AND COLUMN TESTS +# ---------------------- +@pytest.mark.parametrize( + "index", + [ + ["A", "B", "C"], + ("A", "B", "C"), + ["A", ("B", "C")], + ], +) +@pytest.mark.parametrize("index_type", ["index", "series"]) +@pytest.mark.parametrize("index_name", [None, "A", "index name!", ("A", "B")]) +@pytest.mark.parametrize( + "columns", + [ + ["A"], + ("A", "B"), + [("A", "B")], + ["A", "B", "C"], + ["A", ("B", "C")], + [("A", "B"), ("C", "D")], + native_pd.Index(["A", "B", "C"]), + np.array([("A", "B"), ("B", "C")]), + ], +) +@sql_count_checker(query_count=1, join_count=1) +def test_create_df_with_index_and_columns(index, index_type, index_name, columns): + # Test DataFrame creation with both index and columns passed in. + if index_type == "index": + native_index = native_pd.Index(data=index, name=index_name) + snow_index = pd.Index(data=index, name=index_name) + else: + native_index = native_pd.Series(data=index, name=index_name) + snow_index = pd.Series(data=index, name=index_name) + native_df = native_pd.DataFrame(index=native_index, columns=columns) + snow_df = pd.DataFrame(index=snow_index, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@pytest.mark.parametrize("index", [[("A", "B"), ("C", "D")]]) +@pytest.mark.parametrize( + "index_type", + [ + "index", + pytest.param( + "series", + marks=pytest.mark.xfail( + reason="SNOW-1675191 reindex does not work with tuple series" + ), + ), + ], +) +@pytest.mark.parametrize("index_name", [None, ("A", "B")]) +@pytest.mark.parametrize( + "columns", + [ + ["A"], + ("A", "B"), + [("A", "B")], + ["A", "B", "C"], + ["A", ("B", "C")], + [("A", "B"), ("C", "D")], + native_pd.Index(["A", "B", "C"]), + np.array([("A", "B"), ("B", "C")]), + ], +) +def test_create_df_with_multiindex_and_columns(index, index_type, index_name, columns): + # Test DataFrame creation with both index and columns passed in. + with SqlCounter(query_count=1, join_count=1 if index_type == "series" else 0): + if index_type == "index": + native_index = native_pd.Index(data=index, name=index_name) + snow_index = pd.MultiIndex.from_tuples(index, names=index_name) + else: + native_index = native_pd.Series(data=index, name=index_name) + snow_index = pd.Series(data=index, name=index_name) + native_df = native_pd.DataFrame(index=native_index, columns=columns) + snow_df = pd.DataFrame(index=snow_index, columns=columns) + assert_frame_equal(snow_df, native_df) + + +@sql_count_checker(query_count=2) +def test_create_df_with_index_and_columns_match(): + # Test DataFrame creation with both index and columns passed in where index name is not in columns. + native_df = native_pd.DataFrame(native_pd.Index([1, 2, 3], name="b"), columns=["a"]) + snow_df = pd.DataFrame(pd.Index([1, 2, 3], name="b"), columns=["a"]) + assert_frame_equal(snow_df, native_df) + + # Test DataFrame creation with both index and columns passed in where name is in columns. + native_df = native_pd.DataFrame( + native_pd.Index([1, 2, 3], name="b"), columns=["a", "b"] + ) + snow_df = pd.DataFrame(pd.Index([1, 2, 3], name="b"), columns=["a", "b"]) + assert_frame_equal(snow_df, native_df) + + +# DATA, INDEX, AND COLUMN TESTS +# ----------------------------- +def test_create_df_with_mixed_series_index_dict_data(): + # Create the dict data. + native_data1 = native_pd.Series([1, 2, 3]) + native_data2 = native_pd.Index([4, 5, 6]) + data3 = [7, 8, 9] + snow_data1 = pd.Series(native_data1) + snow_data2 = pd.Index(native_data2) + native_data = {"A": native_data1, "B": native_data2, "C": data3} + snow_data = {"A": snow_data1, "B": snow_data2, "C": data3} + + # Create DataFrame only with dict data. + native_df = native_pd.DataFrame(native_data) + snow_df = pd.DataFrame(snow_data) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Series index. + native_ser_index = native_pd.Series([9, 2, 999]) + snow_ser_index = pd.Series(native_ser_index) + native_df = native_pd.DataFrame(native_data, index=native_ser_index) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data and Index index. + native_index = native_pd.Index([9, 2, 999]) + snow_index = pd.Index(native_index) + native_df = native_pd.DataFrame(native_data, index=native_index) + snow_df = pd.DataFrame(snow_data, index=snow_index) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Series index, and columns. + columns = ["A", "B", "C"] + native_df = native_pd.DataFrame( + native_data, index=native_ser_index, columns=columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_ser_index, columns=columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + # Create DataFrame with dict data, Index index, and Index columns. + native_columns = native_pd.Index(columns) + snow_columns = pd.Index(columns) + native_df = native_pd.DataFrame( + native_data, index=native_index, columns=native_columns + ) + snow_df = pd.DataFrame(snow_data, index=snow_index, columns=snow_columns) + with SqlCounter(query_count=1): + assert_frame_equal(snow_df, native_df) + + +@pytest.mark.parametrize( + "native_df, native_index, columns", + [ + # Single column DataFrames. + ( + native_pd.DataFrame(list(range(20))), + native_pd.Index(list(range(20))), + [1], + ), # all index values match + ( + native_pd.DataFrame(["A", "V", "D", "R"]), + native_pd.Index([10, 20, 30, 40], name="none"), + ["A"], + ), # no index values match, column missing + # Multi-column DataFrames. + ( + native_pd.DataFrame( + {"col1": ["A", "B", "C", "D"], "col2": ["B", "H", "T", "W"]}, + index=[1.1, 2.2, 3, 4], + ), + native_pd.Index([1, 2, 3, 4], name="some name"), + ["col1"], + ), # some index values are missing, subset of columns + ( + native_pd.DataFrame( + [[10, 20, 30, 40], [2, 4, 6, 7], [-1, -2, -3, -4], [90, 50, 30, 10]], + index=native_pd.Index([None, "B", 0, 3.14], name="mixed"), + columns=["C", "L", "M", "W"], + ), + native_pd.Index(["B", 0, None, 3.14]), + [3, 1], + ), # rearranged index and column values + ( + native_pd.DataFrame( + [["A", "B", "C", "D", "E"], ["R", "S", "T", "U", "V"]], + columns=[1, 2, 3, 4, 5], + ), + native_pd.Index([3, 4], name="index"), + ["A", "V", "C"], + ), # subset of index values + ( + native_pd.DataFrame([list(range(20)), list(range(20))]), + native_pd.Index(list(range(20))), + [1], + ), # all index values match + ( + native_pd.DataFrame( + { + "A": ["A", "V", "D", "R"], + "V": ["V", "D", "R", "A"], + "D": ["D", "R", "A", "V"], + "R": ["R", "A", "V", "D"], + } + ), + native_pd.Index([10, 20, 30, 40], name="none"), + ["A", "X", "D", "R"], + ), # no index values match + ( + native_pd.DataFrame([]), + native_pd.Index([], name="empty index", dtype="int64"), + [], + ), # empty data, index, and columns + ( + native_pd.DataFrame([]), + native_pd.Index(["A", "V"], name="non-empty index"), + ["A", "V"], + ), # empty data, non-empty index and columns + ( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, # dict data should behave similar to DataFrame data + native_pd.Index([10, 0, 1], name="non-empty index"), + ["A", "C"], + ), + ], +) +@pytest.mark.parametrize("column_type", ["list", "index"]) +def test_create_df_with_df_as_data_and_index_as_index_and_different_columns( + native_df, native_index, columns, column_type +): + """ + Creating a DataFrame where the data is a DataFrame, the index is an Index, and non-existent columns. + """ + # Two joins are performed: one from joining the data and index parameters to have a query compiler whose + # index columns match the provided index, and one from performing .loc[] to filter the generated qc. + # One extra query is required to create the columns if it is an Index (column_type is "index"). + native_columns = columns if column_type == "list" else native_pd.Index(columns) + snow_columns = columns if column_type == "list" else pd.Index(columns) + snow_df = ( + pd.DataFrame(native_df) + if isinstance(native_df, native_pd.DataFrame) + else native_df + ) + snow_index = pd.Index(native_index) + qc = 1 if column_type == "list" else 2 + qc += 1 if (isinstance(native_df, dict)) else 0 + qc += 1 if (isinstance(native_df, dict) and column_type == "index") else 0 + jc = 1 if isinstance(native_df, native_pd.DataFrame) else 0 + with SqlCounter(query_count=qc, join_count=jc): + assert_frame_equal( + pd.DataFrame(snow_df, index=snow_index, columns=native_columns), + native_pd.DataFrame(native_df, index=native_index, columns=snow_columns), + check_dtype=False, + ) + + @sql_count_checker(query_count=1, join_count=1) def test_create_df_with_name_not_in_columns_and_index(): # Test DataFrame creation where the data is a named Series and its name is not in the columns passed in. @@ -889,19 +1138,8 @@ def test_create_df_with_name_not_in_columns_and_index(): assert_frame_equal(snow_df, native_df) -@sql_count_checker(query_count=1) -def test_create_df_with_df_and_subset_of_columns(): - # Test DataFrame creation where data is a DataFrame and only a subset of its columns are passed in. - # Only the columns passed in are used; the rest are ignored. In this case with end up with a single - # column DataFrame. - native_data = native_pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - snow_data = pd.DataFrame(native_data) - columns = ["a"] - native_df = native_pd.DataFrame(native_data, columns=columns) - snow_df = pd.DataFrame(snow_data, columns=columns) - assert_frame_equal(snow_df, native_df) - - +# COPY TESTS +# ---------- def test_create_df_with_copy(): # When copy is True, the data is copied into the DataFrame, and the new DataFrame and data do not share references. data = pd.DataFrame([[1, 2], [3, 4], [5, 6]]) @@ -948,3 +1186,55 @@ def test_create_series_with_copy(): # Changing series_copy should not change data or series_not_copy. series_copy.iloc[0] = 1000 assert data.iloc[0] == series_not_copy.iloc[0] == 99 + + +# NEGATIVE TESTS +# -------------- +@sql_count_checker(query_count=0) +def test_create_df_with_df_index_negative(): + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + native_pd.DataFrame( + [1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]]) + ) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.DataFrame([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=0) +def test_create_series_with_df_index_negative(): + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + native_pd.Series([1, 2, 3], index=native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + with pytest.raises(ValueError, match="Index data must be 1-dimensional"): + pd.Series([1, 2, 3], index=pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=0) +def test_create_series_with_df_data_negative(): + with pytest.raises( + ValueError, + match=re.escape( + "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool()" + ", a.item(), a.any() or a.all()." + ), + ): + native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + with pytest.raises(ValueError, match="Data cannot be a DataFrame"): + pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) + + +@sql_count_checker(query_count=2) +def test_create_df_with_mixed_series_index_list_data_negative(): + """ + Since Snowpark pandas relies on native pandas for initialization a DataFrame with mixed data types, + they both raise the same error. + """ + # Create the list data. + data1 = native_pd.Series([1, 2, 3]) + data2 = native_pd.Index([4, 5, 6]) + data3 = [7, 8, 9] + # Need to convert data3 to an Index since native pandas tries to perform `get_indexer` on it. + err_msg = "'builtin_function_or_method' object has no attribute 'get_indexer'" + with pytest.raises(AttributeError, match=err_msg): + native_pd.DataFrame([data1, data2, data3]) + with pytest.raises(AttributeError, match=err_msg): + pd.DataFrame([pd.Series(data1), pd.Index(data2), data3])