-
Notifications
You must be signed in to change notification settings - Fork 108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SNOW-1458135 Implement DataFrame and Series initialization with lazy Index objects #2137
base: main
Are you sure you want to change the base?
Changes from all commits
2094c3f
97a7229
1979257
5dbb76d
7de467f
5dd06fd
8b94462
c89dc5d
a2089b8
a9376c1
420a5ac
66d634c
f277041
6a2cb79
df96f4a
f971b0d
13db956
8c78f8d
7970101
f3de1c3
2447022
82728bf
23587a4
1577ddc
8903f60
668c889
67a07c1
a4351ba
1453680
f39e751
b73f027
c6fc05d
d422f86
1ea5d00
024acd8
f4a80f3
ce1ffa6
00d2a8b
cb91849
d9fdbb0
3d5b785
7f9dbaa
6de9f49
c2fb474
2274d1e
9eef8d7
cc09403
10c3954
64dda24
da56734
8b47e17
fa4eb09
db28630
17be4c3
301f47f
2eb14a7
95065f7
d9bbd9b
f40c5b4
8cce409
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,8 +11,10 @@ | |
|
||
import numpy as np | ||
import pandas as native_pd | ||
from pandas._typing import Scalar | ||
from pandas._typing import AnyArrayLike, Scalar | ||
from pandas.core.dtypes.base import ExtensionDtype | ||
from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar | ||
from pandas.core.dtypes.inference import is_list_like | ||
|
||
import snowflake.snowpark.modin.pandas as pd | ||
import snowflake.snowpark.modin.plugin._internal.statement_params_constants as STATEMENT_PARAMS | ||
|
@@ -2003,3 +2005,154 @@ def create_frame_with_data_columns( | |
def rindex(lst: list, value: int) -> int:
    """Return the index of the last occurrence of ``value`` in ``lst``.

    Raises ValueError (propagated from ``list.index``) if ``value`` is absent.
    """
    # Search the reversed list, then map the offset back to a forward index.
    offset_from_end = lst[::-1].index(value)
    return len(lst) - 1 - offset_from_end
|
||
|
||
def error_checking_for_init(
    index: Any, dtype: Union[str, np.dtype, ExtensionDtype]
) -> None:
    """
    Raise the error messages shared by the Series and DataFrame constructors.

    Parameters
    ----------
    index: Any
        The index to check.
    dtype: str, numpy.dtype, or ExtensionDtype
        The dtype to check.

    Raises
    ------
    ValueError
        If ``index`` is a DataFrame (same error message pandas raises).
    NotImplementedError
        If ``dtype`` is "category" (Categorical type is not supported).
    """
    from modin.pandas import DataFrame

    # pandas raises the same error for a 2-dimensional index.
    if isinstance(index, DataFrame):
        raise ValueError("Index data must be 1-dimensional")

    if dtype == "category":
        raise NotImplementedError("pandas type category is not implemented")
|
||
|
||
def assert_fields_are_none(
    class_name: str, data: Any, index: Any, columns: Any = None
) -> None:
    """
    Assert that ``data``, ``index``, and ``columns`` are all None.

    Used by the ``query_compiler``-based Series/DataFrame construction paths to
    ensure no other constructor arguments were passed alongside a query compiler.
    """
    for field_name, field_value in (
        ("data", data),
        ("index", index),
        ("columns", columns),
    ):
        assert field_value is None, (
            f"Invalid {class_name} construction! "
            f"Cannot pass both {field_name} and query_compiler."
        )
|
||
|
||
def convert_index_to_qc(index: Any) -> Any:
    """
    Convert an object representing an index into a query compiler for set_index or reindex.

    Parameters
    ----------
    index: Any
        The object to convert to a query compiler.

    Returns
    -------
    SnowflakeQueryCompiler
        The converted query compiler.
    """
    from modin.pandas import Series

    from snowflake.snowpark.modin.plugin.extensions.index import Index

    if isinstance(index, Index):
        return index.to_series()._query_compiler
    if isinstance(index, Series):
        # The name of the index comes from the Series' name, not the index name.
        # `reindex` does not handle this, so propagate the Series' name onto its
        # index before extracting the query compiler.
        # NOTE(review): this mutates the caller's Series in place — confirm intended.
        index.index.name = index.name
        return index._query_compiler
    # Any other input (list-like, scalar, etc.): wrap in a Series to obtain a
    # query compiler.
    return Series(index)._query_compiler
|
||
|
||
def convert_index_to_list_of_qcs(index: Any) -> list:
    """
    Convert an object representing an index into a list of query compilers for set_index.

    Parameters
    ----------
    index: Any
        The object to convert to a list of query compilers.

    Returns
    -------
    list
        The list of query compilers (one per index level).
    """
    from modin.pandas import Series

    from snowflake.snowpark.modin.plugin.extensions.index import Index

    # A non-empty list-like of non-tuple list-likes (that is not already a
    # MultiIndex/Series/Index) describes a multi-level index, e.g. [[1, 2], [3, 4]].
    looks_like_list_of_lists = (
        not isinstance(index, (native_pd.MultiIndex, Series, Index))
        and is_list_like(index)
        and len(index) > 0
        and all(
            is_list_like(level) and not isinstance(level, tuple) for level in index
        )
    )
    if looks_like_list_of_lists:
        # If given a list of lists, convert it to a MultiIndex.
        index = native_pd.MultiIndex.from_arrays(index)

    if not isinstance(index, native_pd.MultiIndex):
        return [convert_index_to_qc(index)]
    # One query compiler per MultiIndex level.
    return [
        Series(index.get_level_values(level))._query_compiler
        for level in range(index.nlevels)
    ]
|
||
|
||
def add_extra_columns_and_select_required_columns(
    query_compiler: Any,
    columns: Union[AnyArrayLike, list],
) -> Any:
    """
    Add extra columns to and select the required columns from the provided query compiler.

    This is used in DataFrame construction in the following cases:
    - general case when data is a DataFrame
    - data is a named Series, and this name is in `columns`

    Parameters
    ----------
    query_compiler: Any
        The query compiler to select columns from, i.e., data's query compiler.
    columns: AnyArrayLike or list
        The columns to select from the query compiler.

    Returns
    -------
    SnowflakeQueryCompiler
        A query compiler containing exactly `columns` in the given order; any
        column not present in the data is filled with NaN values.
    """
    from modin.pandas import DataFrame

    data_columns = query_compiler.get_columns().to_list()
    # The `columns` parameter is used to select the columns from `data` that will be in
    # the resultant DataFrame. If a value in `columns` is not present in data's columns,
    # it will be added as a new column filled with NaN values. These columns are tracked
    # by the `extra_columns` variable.
    if data_columns is not None and columns is not None:
        extra_columns = [col for col in columns if col not in data_columns]
        # BUG FIX: the original used `extra_columns is not []`, which is always True
        # (identity comparison against a freshly created empty list). Use truthiness
        # so the extra-column insertion is skipped when there are no extra columns.
        if extra_columns:
            # To add these new columns to the DataFrame, perform `__getitem__` only
            # with the extra columns and set them to None.
            extra_columns_df = DataFrame(query_compiler=query_compiler)
            # In the case that the columns are MultiIndex but not all extra columns are
            # tuples, we need to flatten the columns to ensure that the columns are a
            # single-level index. If not, `__getitem__` will raise an error when trying
            # to add new columns that are not in the expected tuple format.
            if not all(
                isinstance(col, tuple) for col in extra_columns
            ) and isinstance(query_compiler.get_columns(), native_pd.MultiIndex):
                extra_columns_df.columns = extra_columns_df.columns.to_flat_index()
            extra_columns_df[extra_columns] = None
            query_compiler = extra_columns_df._query_compiler

    # To select the columns for the resultant DataFrame, perform `__getitem__` on the
    # created query compiler. This step is performed to ensure that the right columns
    # are picked from the InternalFrame since we never explicitly drop the unwanted
    # columns. `__getitem__` also ensures that the columns in the resultant DataFrame
    # are in the same order as the columns in the `columns` parameter.
    return DataFrame(query_compiler=query_compiler)[columns]._query_compiler
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
where does this check come from? i don't see it was checked anywhere before
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added this check because it was not checked before - we do not support the Categorical type yet. If the user passes in `dtype=category`, this makes the dtype of the Series/DataFrame `category`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The dtype seems only used when data is local; typically in this case, we should already apply the dtype check before uploading the data, so we shouldn't need to do such a check here. Is it not erroring out today?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it's not erroring out over here today - it's because the data itself is not categorical but should be treated like categorical if dtype is
category
.