snowflakedb · sfc-gh-rdurrani · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 5, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -167,6 +167,7 @@ This is a re-release of 1.22.0. Please refer to the 1.22.0 release notes for det
 - Fixed `inplace` argument for Series objects derived from DataFrame columns.
 - Fixed a bug where `Series.reindex` and `DataFrame.reindex` did not update the result index's name correctly.
 - Fixed a bug where `Series.take` did not error when `axis=1` was specified.
+- Fixed `loc` set when setting row of DataFrame with Series value.
 
 
 ## 1.21.1 (2024-09-05)

@@ -1039,6 +1039,9 @@ def __setitem__(
         )
         if item_is_2d_array:
             item = pd.DataFrame(item)
+        frame_is_df_and_item_is_series = isinstance(item, pd.Series) and isinstance(
+            self.df, pd.DataFrame
+        )
         item = item._query_compiler if isinstance(item, BasePandasDataset) else item
         new_qc = self.qc.set_2d_labels(
             index,
@@ -1047,6 +1050,7 @@ def __setitem__(
             matching_item_columns_by_label=matching_item_columns_by_label,
             matching_item_rows_by_label=matching_item_rows_by_label,
             index_is_bool_indexer=index_is_bool_indexer,
+            frame_is_df_and_item_is_series=frame_is_df_and_item_is_series,
         )
 
         self.df._update_inplace(new_query_compiler=new_qc)

@@ -1889,6 +1889,7 @@ def _set_2d_labels_helper_for_frame_item(
     assert len(index.data_column_snowflake_quoted_identifiers) == len(
         item.index_column_snowflake_quoted_identifiers
     ), "TODO: SNOW-966427 handle it well in multiindex case"
+
     if not matching_item_rows_by_label:
         index = index.ensure_row_position_column()
         left_on = [index.row_position_snowflake_quoted_identifier]
@@ -2125,6 +2126,7 @@ def set_frame_2d_labels(
     matching_item_rows_by_label: bool,
     index_is_bool_indexer: bool,
     deduplicate_columns: bool,
+    frame_is_df_and_item_is_series: bool,
 ) -> InternalFrame:
     """
     Helper function to handle the general loc set functionality. The general idea here is to join the key from ``index``
@@ -2151,6 +2153,7 @@ def set_frame_2d_labels(
         index_is_bool_indexer: if True, the index is a boolean indexer. Note we only handle boolean indexer with
                 item is a SnowflakeQueryCompiler here.
         deduplicate_columns: if True, deduplicate columns from ``columns``.
+        frame_is_df_and_item_is_series: Whether item is from a Series object and is being assigned to a DataFrame object
     Returns:
         New frame where values have been set
     """
@@ -2213,6 +2216,36 @@ def set_frame_2d_labels(
     index_is_frame = isinstance(index, InternalFrame)
     item_is_frame = isinstance(item, InternalFrame)
     item_is_scalar = is_scalar(item)
+    original_index = index
+    # If `item` is from a Series (rather than a Dataframe), flip the series item values to apply them
+    # across columns rather than rows.
+    if frame_is_df_and_item_is_series and (columns == slice(None) or len(columns) > 1):  # type: ignore[arg-type]
+        # If columns is slice(None), we are setting all columns in the InternalFrame.
+        matching_item_columns_by_label = True
+        col_len = (
+            len(internal_frame.data_column_snowflake_quoted_identifiers)
+            if columns == slice(None)
+            else len(columns)  # type: ignore[arg-type]
+        )
+        item = get_item_series_as_single_row_frame(
+            item, col_len, move_index_to_cols=True
+        )
+
+        if is_scalar(index):
+            new_item = item.append_column("__index__", pandas_lit(index))
+            item = InternalFrame.create(
+                ordered_dataframe=new_item.ordered_dataframe,
+                data_column_pandas_labels=item.data_column_pandas_labels,
+                data_column_snowflake_quoted_identifiers=item.data_column_snowflake_quoted_identifiers,
+                data_column_pandas_index_names=item.data_column_pandas_index_names,
+                index_column_pandas_labels=item.index_column_pandas_labels,
+                index_column_snowflake_quoted_identifiers=[
+                    new_item.data_column_snowflake_quoted_identifiers[-1]
+                ],
+                data_column_types=item.cached_data_column_snowpark_pandas_types,
+                index_column_types=[item.cached_data_column_snowpark_pandas_types[-1]],
+            )
+            index = pd.Series([index])._query_compiler._modin_frame
 
     assert not isinstance(index, slice) or index == slice(
         None
@@ -2409,7 +2442,7 @@ def generate_updated_expr_for_existing_col(
 
         if index_is_scalar:
             col_obj = iff(
-                result_frame_index_col.equal_null(pandas_lit(index)),
+                result_frame_index_col.equal_null(pandas_lit(original_index)),
                 col_obj,
                 original_col,
             )
@@ -2468,7 +2501,7 @@ def generate_updated_expr_for_new_col(
             return SnowparkPandasColumn(pandas_lit(None), None)
         if index_is_scalar:
             new_column = iff(
-                result_frame_index_col.equal_null(pandas_lit(index)),
+                result_frame_index_col.equal_null(pandas_lit(original_index)),
                 new_column,
                 pandas_lit(None),
             )
@@ -2604,7 +2637,6 @@ def set_frame_2d_positional(
         index = _get_adjusted_key_frame_by_row_pos_int_frame(internal_frame, index)
 
     assert isinstance(index_data_type, (_IntegralType, BooleanType))
-
     if isinstance(item, InternalFrame):
         # If item is Series (rather than a Dataframe), then we need to flip the series item values so they apply across
         # columns rather than rows.
@@ -2918,7 +2950,9 @@ def get_kv_frame_from_index_and_item_frames(
 
 
 def get_item_series_as_single_row_frame(
-    item: InternalFrame, num_columns: int
+    item: InternalFrame,
+    num_columns: int,
+    move_index_to_cols: Optional[bool] = False,
 ) -> InternalFrame:
     """
     Get an internal frame that transpose single data column into frame with single row.  For example, if the
@@ -2940,13 +2974,18 @@ def get_item_series_as_single_row_frame(
     ----------
         num_columns: Number of columns in the return frame
         item: Item frame that contains a single column of values.
+        move_index_to_cols: Whether to use the index as the column names.
 
     Returns
     -------
         Frame containing single row with columns for each row.
     """
     item = item.ensure_row_position_column()
-    item_series_pandas_labels = list(range(num_columns))
+    item_series_pandas_labels = (
+        list(range(num_columns))
+        if not move_index_to_cols
+        else item.index_columns_pandas_index().values
+    )
 
     # This is a 2 step process.
     #

@@ -148,10 +148,11 @@ def compute_isin_with_series(
         slice(None),
         [agg_label],
         values_series,
-        False,
-        True,
-        False,
-        False,
+        matching_item_columns_by_label=False,
+        matching_item_rows_by_label=True,
+        index_is_bool_indexer=False,
+        deduplicate_columns=False,
+        frame_is_df_and_item_is_series=False,
     )
 
     # apply isin operation for all columns except the appended agg_label/agg_identifier column.
@@ -272,6 +273,7 @@ def compute_isin_with_dataframe(
         True,
         False,
         False,
+        False,
     )
 
     isin_identifiers = [

@@ -9573,6 +9573,7 @@ def set_2d_labels(
         matching_item_rows_by_label: bool,
         index_is_bool_indexer: bool,
         deduplicate_columns: bool = False,
+        frame_is_df_and_item_is_series: bool = False,
     ) -> "SnowflakeQueryCompiler":
         """
         Create a new SnowflakeQueryCompiler with indexed columns and rows replaced by item.
@@ -9597,6 +9598,7 @@ def set_2d_labels(
             index_is_bool_indexer: if True, the index is a boolean indexer.
             deduplicate_columns: if True, deduplicate columns from ``columns``, e.g., if columns = ["A","A"], only the
                 second "A" column will be used.
+            frame_is_df_and_item_is_series: Whether item is from a Series and is being set to a DataFrame object
         Returns:
             Updated SnowflakeQueryCompiler
         """
@@ -9622,6 +9624,7 @@ def set_2d_labels(
             matching_item_rows_by_label=matching_item_rows_by_label,
             index_is_bool_indexer=index_is_bool_indexer,
             deduplicate_columns=deduplicate_columns,
+            frame_is_df_and_item_is_series=frame_is_df_and_item_is_series,
         )
 
         return SnowflakeQueryCompiler(result_frame)

@@ -1830,6 +1830,15 @@ def loc():
         viper               0       0
         sidewinder          0       0
 
+        Setting the values with a Series item.
+
+        >>> df.loc["viper"] = pd.Series([99, 99], index=["max_speed", "shield"])
+        >>> df
+                    max_speed  shield
+        cobra              30      10
+        viper              99      99
+        sidewinder          0       0
+
         **Getting values on a DataFrame with an index that has integer labels**
 
         Another example using integers for the index

@@ -3205,3 +3205,68 @@ def test_raise_set_cell_with_list_like_value_error():
         s.iloc[0] = [0, 0]
     with pytest.raises(NotImplementedError):
         s.to_frame().iloc[0, 0] = [0, 0]
+
+
+@sql_count_checker(query_count=1, join_count=3)
+@pytest.mark.parametrize("index", [list("ABC"), [0, 1, 2]])
+def test_df_iloc_set_row_from_series(index):
+    native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC"))
+    snow_df = pd.DataFrame(native_df)
+
+    def ilocset(df):
+        series = (
+            pd.Series([1, 4, 9], index=index)
+            if isinstance(df, pd.DataFrame)
+            else native_pd.Series([1, 4, 9], index=index)
+        )
+        df.iloc[1] = series
+        return df
+
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        ilocset,
+    )
+
+
+@sql_count_checker(query_count=1, join_count=3)
+@pytest.mark.parametrize("index", [[3, 4, 5], [0, 1, 2]])
+def test_df_iloc_full_set_row_from_series(index):
+    native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+    snow_df = pd.DataFrame(native_df)
+
+    def ilocset(df):
+        series = (
+            pd.Series([1, 4, 9], index=index)
+            if isinstance(df, pd.DataFrame)
+            else native_pd.Series([1, 4, 9], index=index)
+        )
+        df.iloc[:] = series
+        return df
+
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        ilocset,
+    )
+
+
+@sql_count_checker(query_count=1, join_count=3)
+def test_df_iloc_full_set_row_from_series_int_and_string_indexes():
+    native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC"))
+    snow_df = pd.DataFrame(native_df)
+
+    def ilocset(df):
+        series = (
+            pd.Series([1, 4, 9], index=list("ABC"))
+            if isinstance(df, pd.DataFrame)
+            else native_pd.Series([1, 4, 9], index=list("ABC"))
+        )
+        df.iloc[:] = series
+        return df
+
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        ilocset,
+    )
@@ -4068,6 +4068,74 @@ def test_df_loc_get_with_timedelta_and_none_key():
     assert_frame_equal(snow_df.loc[None], expected_df, check_column_type=False)
 
 
+@sql_count_checker(query_count=2, join_count=4)
+@pytest.mark.parametrize("index", [list("ABC"), [0, 1, 2]])
+def test_df_loc_set_row_from_series(index):
+    native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC"))
+    snow_df = pd.DataFrame(native_df)
+
+    def locset(df):
+        series = (
+            pd.Series([1, 4, 9], index=index)
+            if isinstance(df, pd.DataFrame)
+            else native_pd.Series([1, 4, 9], index=index)
+        )
+        df.loc[1] = series
+        return df
+
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        locset,
+    )
+
+
+@sql_count_checker(query_count=2, join_count=1)
+@pytest.mark.parametrize("index", [[3, 4, 5], [0, 1, 2]])
+def test_df_loc_full_set_row_from_series_pandas_errors(index):
+    native_df = native_pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+    snow_df = pd.DataFrame(native_df)
+
+    with pytest.raises(ValueError, match="setting an array element with a sequence."):
+        native_df.loc[:] = native_pd.Series([1, 4, 9], index=index)
+
+    def locset(df):
+        series = (
+            pd.Series([1, 4, 9], index=index)
+            if isinstance(df, pd.DataFrame)
+            else native_pd.Series([1, 4, 9], index=index)
+        )
+        if isinstance(df, pd.DataFrame):
+            df.loc[:] = series
+        else:
+            if index == [0, 1, 2]:
+                df.loc[0] = series
+                df.loc[1] = None
+            else:
+                df.loc[[0, 1]] = None
+        return df
+
+    eval_snowpark_pandas_result(
+        snow_df,
+        native_df,
+        locset,
+    )
+
+
+@sql_count_checker(query_count=1)
+def test_df_loc_full_set_row_from_series_errors():
+    # We error here because our join columns are an int (item.index)
+    # and a string (value.index) column respectively, and we do not
+    # support joins between those.
+    snow_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=list("ABC"))
+
+    with pytest.raises(
+        SnowparkSQLException, match="Numeric value 'A' is not recognized"
+    ):
+        snow_df.loc[:] = pd.Series([1, 4, 9], index=list("ABC"))
+        snow_df.to_pandas()  # Force materialization.
+
+
 @sql_count_checker(query_count=0)
 def test_df_loc_invalid_key():
     # Bug fix: SNOW-1320674

@@ -943,19 +943,23 @@ def set_loc_helper(ser):
             else native_item_ser
         )
 
-    expected_join_count = 1 if not start and not stop and not step else 4
-
-    with SqlCounter(query_count=1, join_count=expected_join_count):
-        if slice_len == 0:
-            # pandas can fail in this case, so we skip call loc for it, see more below in
-            # test_series_loc_set_key_slice_with_series_item_pandas_bug
-            set_loc_helper(snow_ser)
-            # snow_ser should not change when slice_len = 0
+    if slice_len == 0:
+        # pandas can fail in this case, so we skip call loc for it, see more below in
+        # test_series_loc_set_key_slice_with_series_item_pandas_bug
+        set_loc_helper(snow_ser)
+        # snow_ser should not change when slice_len = 0
+        with SqlCounter(query_count=1):
             assert_snowpark_pandas_equal_to_pandas(snow_ser, native_ser)
+    else:
+        native_res = set_loc_helper(native_ser)
+        if is_scalar(native_res):
+            with SqlCounter(query_count=0):
+                snow_res = set_loc_helper(snow_ser)
+                assert snow_res == native_res
         else:
-            eval_snowpark_pandas_result(
-                snow_ser, native_ser, set_loc_helper, inplace=True
-            )
+            with SqlCounter(query_count=1, join_count=4):
+                snow_res = set_loc_helper(snow_ser)
+                assert_series_equal(snow_res, native_res)
 
 
 @pytest.mark.parametrize(