"""
SQL-style merge routines
"""
from __future__ import annotations

import copy as cp
import datetime
from functools import partial
import string
from typing import (
    TYPE_CHECKING,
    Hashable,
    Literal,
    Sequence,
    cast,
)
import uuid
import warnings

import numpy as np

from pandas._libs import (
    Timedelta,
    hashtable as libhashtable,
    join as libjoin,
    lib,
)
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    AxisInt,
    DtypeObj,
    IndexLabel,
    JoinHow,
    MergeHow,
    Shape,
    Suffixes,
    npt,
)
from pandas.errors import MergeError
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import (
    ensure_float64,
    ensure_int64,
    ensure_object,
    is_array_like,
    is_bool,
    is_bool_dtype,
    is_categorical_dtype,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_object_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    isna,
    na_value_for_dtype,
)

from pandas import (
    Categorical,
    Index,
    MultiIndex,
    Series,
)
import pandas.core.algorithms as algos
from pandas.core.arrays import (
    BaseMaskedArray,
    ExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.frame import _merge_doc
from pandas.core.indexes.api import default_index
from pandas.core.sorting import is_int64_overflow_possible

if TYPE_CHECKING:
    from pandas import DataFrame
    from pandas.core import groupby
    from pandas.core.arrays import DatetimeArray

_factorizers = {
    np.int64: libhashtable.Int64Factorizer,
    np.longlong: libhashtable.Int64Factorizer,
    np.int32: libhashtable.Int32Factorizer,
    np.int16: libhashtable.Int16Factorizer,
    np.int8: libhashtable.Int8Factorizer,
    np.uint64: libhashtable.UInt64Factorizer,
    np.uint32: libhashtable.UInt32Factorizer,
    np.uint16: libhashtable.UInt16Factorizer,
    np.uint8: libhashtable.UInt8Factorizer,
    np.bool_: libhashtable.UInt8Factorizer,
    np.float64: libhashtable.Float64Factorizer,
    np.float32: libhashtable.Float32Factorizer,
    np.complex64: libhashtable.Complex64Factorizer,
    np.complex128: libhashtable.Complex128Factorizer,
    np.object_: libhashtable.ObjectFactorizer,
}
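

# A minimal sketch of how these factorizers are used further down in
# _factorize_keys (the names below refer to internal pandas machinery, so
# treat this as illustrative rather than a stable API): one shared
# Factorizer encodes both key arrays so that equal keys receive equal
# dense integer labels.
#
#   rizer = _factorizers[np.int64](size_hint)
#   llab = rizer.factorize(left_values)   # dense int labels for the left
#   rlab = rizer.factorize(right_values)  # same label space for the right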


@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool = True,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
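

# Illustrative usage of the public entry point above:
#
#   >>> import pandas as pd
#   >>> left = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
#   >>> right = pd.DataFrame({"key": ["b", "c"], "y": [3, 4]})
#   >>> pd.merge(left, right, on="key", how="inner")
#     key  x  y
#   0   b  2  3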


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # the key from the left group is missing from the right frame
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
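

# Illustrative behaviour of the helper above (hypothetical inputs): with
# by="g", each left group is merged only against the matching right group,
# so rows never pair across groups; a group present solely on the left is
# kept with NaN-filled right columns via the KeyError branch.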


def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible with versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: MergeHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: AxisInt
    bm_axis: AxisInt
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[AnyArrayLike]
    left_join_keys: list[AnyArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: MergeHow | Literal["asof"] = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: AxisInt = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis -> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None:
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None:
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result

    def get_result(self, copy: bool = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:

        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:

        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
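
    # Illustrative behaviour of the indicator machinery (public API):
    #
    #   pd.merge(left, right, on="key", how="outer", indicator=True)
    #
    # adds a categorical "_merge" column with values "left_only",
    # "right_only", or "both", derived from the int8 markers combined
    # above (1 + 0 -> 1, 0 + 2 -> 2, 1 + 2 -> 3).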

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place.

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:

        left_has_missing = None
        right_has_missing = None

        assert all(is_array_like(x) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # make mypy happy
        assert self.how != "cross"
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = default_index(len(left_indexer))

        if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
            join_index = default_index(0).set_names(join_index.name)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index
            index being rearranged
        other_index : Index
            used to supply values not found in index
        indexer : np.ndarray[np.intp]
            how to rearrange index
        how : str
            Replacement is only necessary if indexer is based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
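
    # Illustrative sketch of the fill trick above: with
    # index = Index(["x", "y"]) and indexer = [1, -1], appending the NA
    # fill value gives Index(["x", "y", nan]), so .take([1, -1]) yields
    # Index(["y", nan]) because position -1 picks the appended last element.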

    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            lk = cast(Hashable, lk)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
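
    # Illustrative trace (hypothetical frames): pd.merge(left, right,
    # left_on="a", right_on="b") takes the first branch above and returns
    # left_keys=[values of left["a"]], right_keys=[values of right["b"]],
    # join_names=["a"], with nothing queued in left_drop/right_drop.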

    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(lk)
                        match = lk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(rk)
                        match = rk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)
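
    # Illustrative behaviour of the int/float branch above: merging an
    # int64 key [1, 2, 3] against a float64 key [1.0, 2.5, 3.0] casts the
    # floats to int, sees 2.5 != 2, and emits the UserWarning about float
    # values not equal to their int representation before proceeding.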

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how: JoinHow = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
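
    # Illustrative sketch: for a cross merge of a 2-row left and a 3-row
    # right, both frames gain a constant helper column (named
    # "_cross_<uuid4>"), the merge proceeds as an inner join on it to give
    # the 2 x 3 = 6-row cartesian product, and _maybe_drop_cross_column
    # removes the helper from the result.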

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:

            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    def _validate(self, validate: str) -> None:

        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
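
    # Illustrative behaviour (public API): pd.merge(left, right, on="key",
    # validate="1:1") raises MergeError when "key" holds duplicates on
    # either side, while validate="m:1" only requires uniqueness of the
    # right-hand keys.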


def get_join_indexers(
    left_keys,
    right_keys,
    sort: bool = False,
    how: MergeHow | Literal["asof"] = "inner",
    **kwargs,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = cp.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
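

# Illustrative sketch (exact ordering is an implementation detail): an
# inner join of left_keys=[np.array([1, 2, 2])] against
# right_keys=[np.array([2, 3])] yields indexers equivalent to
# (array([1, 2]), array([0, 0])): left rows 1 and 2 each pair with right
# row 0, the only shared key.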


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels

    """

    def _convert_to_multiindex(index: Index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index.
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        if indexer is None:
            restore_codes = codes
        else:
            restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        # error: Cannot determine type of "__add__"
        join_levels = join_levels + [restore_levels]  # type: ignore[has-type]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: AxisInt = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: JoinHow | Literal["asof"] = "outer",
    ) -> None:

        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool = True) -> DataFrame:
        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast(
                np.ndarray, right_indexer
            )
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
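

# Illustrative sketch of the ffill step above: libjoin.ffill_indexer
# forward-fills -1 entries, so an indexer like [0, -1, 1] becomes
# [0, 0, 1] and a missing row reuses the previous row instead of NaN.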


def _asof_by_function(direction: str):
    name = f"asof_join_{direction}_on_X_by_Y"
    return getattr(libjoin, name, None)


_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}


def _get_cython_type_upcast(dtype: DtypeObj) -> str:
    """Upcast a dtype to 'int64_t', 'double', or 'object'"""
    if is_integer_dtype(dtype):
        return "int64_t"
    elif is_float_dtype(dtype):
        return "double"
    else:
        return "object"
|
|
|
|
|
class _AsOfMerge(_OrderedMerge): |
|
_merge_type = «asof_merge» |
|
|
|
def __init__( |
|
self, |
|
left: DataFrame | Series, |
|
right: DataFrame | Series, |
|
on: IndexLabel | None = None, |
|
left_on: IndexLabel | None = None, |
|
right_on: IndexLabel | None = None, |
|
left_index: bool = False, |
|
right_index: bool = False, |
|
by=None, |
|
left_by=None, |
|
right_by=None, |
|
axis: AxisInt = 1, |
|
suffixes: Suffixes = («_x», «_y»), |
|
copy: bool = True, |
|
fill_method: str | None = None, |
|
how: Literal[«asof»] = «asof», |
|
tolerance=None, |
|
allow_exact_matches: bool = True, |
|
direction: str = «backward», |
|
) -> None: |
|
|
|
self.by = by |
|
self.left_by = left_by |
|
self.right_by = right_by |
|
self.tolerance = tolerance |
|
self.allow_exact_matches = allow_exact_matches |
|
self.direction = direction |
|
|
|
_OrderedMerge.__init__( |
|
self, |
|
left, |
|
right, |
|
on=on, |
|
left_on=left_on, |
|
right_on=right_on, |
|
left_index=left_index, |
|
right_index=right_index, |
|
axis=axis, |
|
how=how, |
|
suffixes=suffixes, |
|
fill_method=fill_method, |
|
) |
|
|
|
def _validate_left_right_on(self, left_on, right_on): |
|
left_on, right_on = super()._validate_left_right_on(left_on, right_on) |
|
|
|
        # we only allow 'on' to be a single item on each side
        if len(left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")
|
|
|
if self.left_index and isinstance(self.left.index, MultiIndex): |
|
            raise MergeError("left can only have one index")
|
|
|
if self.right_index and isinstance(self.right.index, MultiIndex): |
|
            raise MergeError("right can only have one index")
|
|
|
        # set 'by' columns
|
if self.by is not None: |
|
if self.left_by is not None or self.right_by is not None: |
|
                raise MergeError("Can only pass by OR left_by and right_by")
|
self.left_by = self.right_by = self.by |
|
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")
|
|
|
# GH#29130 Check that merge keys do not have dtype object |
|
if not self.left_index: |
|
left_on_0 = left_on[0] |
|
if is_array_like(left_on_0): |
|
lo_dtype = left_on_0.dtype |
|
else: |
|
                lo_dtype = (
                    self.left._get_label_or_level_values(left_on_0).dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0).dtype
                )
|
else: |
|
lo_dtype = self.left.index.dtype |
|
|
|
if not self.right_index: |
|
right_on_0 = right_on[0] |
|
if is_array_like(right_on_0): |
|
ro_dtype = right_on_0.dtype |
|
else: |
|
                ro_dtype = (
                    self.right._get_label_or_level_values(right_on_0).dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0).dtype
                )
|
else: |
|
ro_dtype = self.right.index.dtype |
|
|
|
if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): |
|
            raise MergeError(
                f"Incompatible merge dtype, {repr(ro_dtype)} and "
                f"{repr(lo_dtype)}, both sides must have numeric dtype"
            )
|
|
|
        # add 'by' to our key-list so we can have it in the
        # output as a key
|
if self.left_by is not None: |
|
if not is_list_like(self.left_by): |
|
self.left_by = [self.left_by] |
|
if not is_list_like(self.right_by): |
|
self.right_by = [self.right_by] |
|
|
|
if len(self.left_by) != len(self.right_by): |
|
                raise MergeError("left_by and right_by must be same length")
|
|
|
left_on = self.left_by + list(left_on) |
|
right_on = self.right_by + list(right_on) |
|
|
|
        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(f"direction invalid: {self.direction}")
|
|
|
return left_on, right_on |
|
|
|
def _get_merge_keys( |
|
self, |
|
) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]: |
|
|
|
# note this function has side effects |
|
(left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() |
|
|
|
# validate index types are the same |
|
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): |
|
if not is_dtype_equal(lk.dtype, rk.dtype): |
|
if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): |
|
# The generic error message is confusing for categoricals. |
|
# |
|
# In this function, the join keys include both the original |
|
# ones of the merge_asof() call, and also the keys passed |
|
# to its by= argument. Unordered but equal categories |
|
# are not supported for the former, but will fail |
|
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, both sides category, but not equal ones"
                    )
|
else: |
|
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, must be the same type"
                    )
|
raise MergeError(msg) |
|
|
|
# validate tolerance; datetime.timedelta or Timedelta if we have a DTI |
|
if self.tolerance is not None: |
|
|
|
if self.left_index: |
|
# Actually more specifically an Index |
|
lt = cast(AnyArrayLike, self.left.index) |
|
else: |
|
                lt = left_join_keys[-1]
|
|
|
            msg = (
                f"incompatible tolerance {self.tolerance}, must be compat "
                f"with type {repr(lt.dtype)}"
            )
|
|
|
if needs_i8_conversion(lt): |
|
if not isinstance(self.tolerance, datetime.timedelta): |
|
raise MergeError(msg) |
|
if self.tolerance < Timedelta(0): |
|
                    raise MergeError("tolerance must be positive")
|
|
|
elif is_integer_dtype(lt): |
|
if not is_integer(self.tolerance): |
|
raise MergeError(msg) |
|
if self.tolerance < 0: |
|
                    raise MergeError("tolerance must be positive")
|
|
|
elif is_float_dtype(lt): |
|
if not is_number(self.tolerance): |
|
raise MergeError(msg) |
|
if self.tolerance < 0: |
|
                    raise MergeError("tolerance must be positive")
|
|
|
else: |
|
                raise MergeError("key must be integer, timestamp or float")
|
|
|
# validate allow_exact_matches |
|
if not is_bool(self.allow_exact_matches): |
|
            msg = (
                "allow_exact_matches must be boolean, "
                f"passed {self.allow_exact_matches}"
            )
|
raise MergeError(msg) |
|
|
|
return left_join_keys, right_join_keys, join_names |
|
|
|
def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
        """return the join indexers"""
|
|
|
def flip(xs) -> np.ndarray: |
|
            """unlike np.transpose, this returns an array of tuples"""
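            # e.g. flip([np.array([1, 2]), np.array(["x", "y"])]) gives a
            # structured array [(1, "x"), (2, "y")]: one hashable composite
            # key per row (illustrative sketch).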
|
|
|
def injection(obj): |
|
if not is_extension_array_dtype(obj): |
|
# ndarray |
|
return obj |
|
obj = extract_array(obj) |
|
if isinstance(obj, NDArrayBackedExtensionArray): |
|
# fastpath for e.g. dt64tz, categorical |
|
return obj._ndarray |
|
                # FIXME: returning obj._values_for_argsort() here doesn't
                # break in any existing test cases, but i (@jbrockmendel)
                # am pretty sure it should!
                # e.g.
                # arr = pd.array([0, pd.NA, 255], dtype="UInt8")
                # will have values_for_argsort (before GH#45434)
                # np.array([0, 255, 255], dtype=np.uint8)
                # and the non-injectivity should make a difference somehow
                # shouldn't it?
|
return np.asarray(obj) |
|
|
|
xs = [injection(x) for x in xs] |
|
labels = list(string.ascii_lowercase[: len(xs)]) |
|
dtypes = [x.dtype for x in xs] |
|
labeled_dtypes = list(zip(labels, dtypes)) |
|
return np.array(list(zip(*xs)), labeled_dtypes) |
|
|
|
# values to compare |
|
        left_values = (
            self.left.index._values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index._values if self.right_index else self.right_join_keys[-1]
        )
|
tolerance = self.tolerance |
|
|
|
# we require sortedness and non-null values in the join keys |
|
        if not Index(left_values).is_monotonic_increasing:
            side = "left"
            if isna(left_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            raise ValueError(f"{side} keys must be sorted")

        if not Index(right_values).is_monotonic_increasing:
            side = "right"
            if isna(right_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            raise ValueError(f"{side} keys must be sorted")
|
|
|
# initial type conversion as needed |
|
if needs_i8_conversion(left_values): |
|
if tolerance is not None: |
|
tolerance = Timedelta(tolerance) |
|
|
|
# TODO: we have no test cases with PeriodDtype here; probably |
|
# need to adjust tolerance for that case. |
|
                if left_values.dtype.kind in ["m", "M"]:
|
# Make sure the i8 representation for tolerance |
|
# matches that for left_values/right_values. |
|
lvs = ensure_wrapped_if_datetimelike(left_values) |
|
tolerance = tolerance.as_unit(lvs.unit) |
|
|
|
tolerance = tolerance._value |
|
|
|
# TODO: require left_values.dtype == right_values.dtype, or at least |
|
# comparable for e.g. dt64tz |
|
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
|
|
|
        # a "by" parameter requires special handling
|
if self.left_by is not None: |
|
            # remove 'on' parameter from values if one existed
|
if self.left_index and self.right_index: |
|
left_by_values = self.left_join_keys |
|
right_by_values = self.right_join_keys |
|
else: |
|
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]
|
|
|
# get tuple representation of values if more than one |
|
if len(left_by_values) == 1: |
|
lbv = left_by_values[0] |
|
rbv = right_by_values[0] |
|
else: |
|
# We get here with non-ndarrays in test_merge_by_col_tz_aware |
|
# and test_merge_groupby_multiple_column_with_categorical_column |
|
lbv = flip(left_by_values) |
|
rbv = flip(right_by_values) |
|
|
|
            # upcast 'by' parameter because HashTable is limited
|
by_type = _get_cython_type_upcast(lbv.dtype) |
|
by_type_caster = _type_casters[by_type] |
|
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[generic]]", variable has type
            # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
|
left_by_values = by_type_caster(lbv) # type: ignore[assignment] |
|
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[generic]]", variable has type
            # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
|
right_by_values = by_type_caster(rbv) # type: ignore[assignment] |
|
|
|
# choose appropriate function by type |
|
func = _asof_by_function(self.direction) |
|
return func( |
|
left_values, |
|
right_values, |
|
left_by_values, |
|
right_by_values, |
|
self.allow_exact_matches, |
|
tolerance, |
|
) |
|
else: |
|
# choose appropriate function by type |
|
func = _asof_by_function(self.direction) |
|
return func( |
|
left_values, |
|
right_values, |
|
None, |
|
None, |
|
self.allow_exact_matches, |
|
tolerance, |
|
False, |
|
) |
|
|
|
|
|
def _get_multiindex_indexer( |
|
join_keys, index: MultiIndex, sort: bool |
|
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
|
|
# left & right join labels and num. of levels at each location |
|
mapped = ( |
|
_factorize_keys(index.levels[n], join_keys[n], sort=sort) |
|
for n in range(index.nlevels) |
|
) |
|
zipped = zip(*mapped) |
|
rcodes, lcodes, shape = (list(x) for x in zipped) |
|
if sort: |
|
rcodes = list(map(np.take, rcodes, index.codes)) |
|
else: |
|
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
|
rcodes = list(map(i8copy, index.codes)) |
|
|
|
# fix right labels if there were any nulls |
|
for i, join_key in enumerate(join_keys): |
|
        mask = index.codes[i] == -1
|
if mask.any(): |
|
            # check if there were already any nulls at this location;
            # if so, they were factorized to `shape[i] - 1`
            a = join_key[lcodes[i] == shape[i] - 1]
|
if a.size == 0 or not a[0] != a[0]: |
|
shape[i] += 1 |
|
|
|
            rcodes[i][mask] = shape[i] - 1
|
|
|
# get flat i8 join keys |
|
lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort) |
|
|
|
# factorize keys to a dense i8 space |
|
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) |
|
|
|
return libjoin.left_outer_join(lkey, rkey, count, sort=sort) |
|
|
|
|
|
def _get_single_indexer( |
|
join_key, index: Index, sort: bool = False |
|
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) |
|
|
|
return libjoin.left_outer_join(left_key, right_key, count, sort=sort) |
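# Illustrative sketch: joining key np.array(["b", "a"]) against
# Index(["a", "b", "c"]) factorizes both sides into a shared code space and
# left-outer-joins them, giving left_indexer [0, 1] and right_indexer [1, 0]
# (assuming sort=False preserves left-row order, which _left_join_on_index
# below relies on).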
|
|
|
|
|
def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
    """Return empty join indexers."""
|
return ( |
|
np.array([], dtype=np.intp), |
|
np.array([], dtype=np.intp), |
|
) |
|
|
|
|
|
def _get_no_sort_one_missing_indexer( |
|
n: int, left_missing: bool |
|
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
    """
    Return join indexers where all of one side is selected without sorting
    and none of the other side is selected.

    Parameters
    ----------
    n : int
        Length of indexers to create.
    left_missing : bool
        If True, the left indexer will contain only -1's.
        If False, the right indexer will contain only -1's.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer
    np.ndarray[np.intp]
        Right indexer
    """
|
idx = np.arange(n, dtype=np.intp) |
|
    idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
|
if left_missing: |
|
return idx_missing, idx |
|
return idx, idx_missing |
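# e.g. _get_no_sort_one_missing_indexer(3, left_missing=True) returns
# (array([-1, -1, -1]), array([0, 1, 2])): every right row is kept and no
# left row matches (illustrative).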
|
|
|
|
|
def _left_join_on_index( |
|
left_ax: Index, right_ax: Index, join_keys, sort: bool = False |
|
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: |
|
if len(join_keys) > 1: |
|
if not ( |
|
isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels |
|
): |
|
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of levels in right_ax"
            )
|
|
|
left_indexer, right_indexer = _get_multiindex_indexer( |
|
join_keys, right_ax, sort=sort |
|
) |
|
else: |
|
jkey = join_keys[0] |
|
|
|
left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) |
|
|
|
if sort or len(left_ax) != len(left_indexer): |
|
# if asked to sort or there are 1-to-many matches |
|
join_index = left_ax.take(left_indexer) |
|
return join_index, left_indexer, right_indexer |
|
|
|
# left frame preserves order & length of its index |
|
return left_ax, None, right_indexer |
|
|
|
|
|
def _factorize_keys( |
|
lk: ArrayLike, |
|
rk: ArrayLike, |
|
sort: bool = True, |
|
    how: MergeHow | Literal["asof"] = "inner",
|
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: |
|
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    np.ndarray[np.intp]
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
|
# Some pre-processing for non-ndarray lk / rk |
|
lk = extract_array(lk, extract_numpy=True, extract_range=True) |
|
rk = extract_array(rk, extract_numpy=True, extract_range=True) |
|
# TODO: if either is a RangeIndex, we can likely factorize more efficiently? |
|
|
|
if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): |
|
# Extract the ndarray (UTC-localized) values |
|
        # Note: we don't need the dtypes to match, as these can still be compared
|
# TODO(non-nano): need to make sure resolutions match |
|
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray
|
|
|
elif ( |
|
is_categorical_dtype(lk.dtype) |
|
and is_categorical_dtype(rk.dtype) |
|
and is_dtype_equal(lk.dtype, rk.dtype) |
|
): |
|
assert isinstance(lk, Categorical) |
|
assert isinstance(rk, Categorical) |
|
# Cast rk to encoding so we can compare codes with lk |
|
|
|
rk = lk._encode_with_my_categories(rk) |
|
|
|
lk = ensure_int64(lk.codes) |
|
rk = ensure_int64(rk.codes) |
|
|
|
elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): |
|
if not isinstance(lk, BaseMaskedArray): |
|
lk, _ = lk._values_for_factorize() |
|
|
|
        # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
        # "_values_for_factorize"
        rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]
|
|
|
klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) |
|
|
|
rizer = klass(max(len(lk), len(rk))) |
|
|
|
if isinstance(lk, BaseMaskedArray): |
|
assert isinstance(rk, BaseMaskedArray) |
|
llab = rizer.factorize(lk._data, mask=lk._mask) |
|
rlab = rizer.factorize(rk._data, mask=rk._mask) |
|
else: |
|
        # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
        # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
        # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
|
llab = rizer.factorize(lk) # type: ignore[arg-type] |
|
rlab = rizer.factorize(rk) # type: ignore[arg-type] |
|
assert llab.dtype == np.dtype(np.intp), llab.dtype |
|
assert rlab.dtype == np.dtype(np.intp), rlab.dtype |
|
|
|
count = rizer.get_count() |
|
|
|
if sort: |
|
uniques = rizer.uniques.to_array() |
|
llab, rlab = _sort_labels(uniques, llab, rlab) |
|
|
|
# NA group |
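    # Missing values factorize to -1 on both sides; remapping them to a fresh
    # group id (== count) lets NA keys on the left match NA keys on the right.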
|
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
|
rany = rmask.any() |
|
|
|
if lany or rany: |
|
if lany: |
|
np.putmask(llab, lmask, count) |
|
if rany: |
|
np.putmask(rlab, rmask, count) |
|
count += 1 |
|
|
|
    if how == "right":
|
return rlab, llab, count |
|
return llab, rlab, count |
|
|
|
|
|
def _convert_arrays_and_get_rizer_klass( |
|
lk: ArrayLike, rk: ArrayLike |
|
) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: |
|
klass: type[libhashtable.Factorizer] |
|
if is_numeric_dtype(lk.dtype): |
|
if not is_dtype_equal(lk, rk): |
|
dtype = find_common_type([lk.dtype, rk.dtype]) |
|
if isinstance(dtype, ExtensionDtype): |
|
cls = dtype.construct_array_type() |
|
if not isinstance(lk, ExtensionArray): |
|
lk = cls._from_sequence(lk, dtype=dtype, copy=False) |
|
else: |
|
lk = lk.astype(dtype) |
|
|
|
if not isinstance(rk, ExtensionArray): |
|
rk = cls._from_sequence(rk, dtype=dtype, copy=False) |
|
else: |
|
rk = rk.astype(dtype) |
|
else: |
|
lk = lk.astype(dtype) |
|
rk = rk.astype(dtype) |
|
if isinstance(lk, BaseMaskedArray): |
|
        # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
        # expected type "Type[object]"
        klass = _factorizers[lk.dtype.type]  # type: ignore[index]
|
else: |
|
klass = _factorizers[lk.dtype.type] |
|
|
|
else: |
|
klass = libhashtable.ObjectFactorizer |
|
lk = ensure_object(lk) |
|
rk = ensure_object(rk) |
|
return klass, lk, rk |
|
|
|
|
|
def _sort_labels( |
|
uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] |
|
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: |
|
|
|
llength = len(left) |
|
labels = np.concatenate([left, right]) |
|
|
|
_, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) |
|
new_left, new_right = new_labels[:llength], new_labels[llength:] |
|
|
|
return new_left, new_right |
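# Illustrative sketch: with uniques = np.array(["b", "a"]), left = [0, 1] and
# right = [1], safe_sort orders the uniques as ["a", "b"] and remaps the codes,
# giving new_left = [1, 0] and new_right = [0].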
|
|
|
|
|
def _get_join_keys( |
|
llab: list[npt.NDArray[np.int64 | np.intp]], |
|
rlab: list[npt.NDArray[np.int64 | np.intp]], |
|
shape: Shape, |
|
sort: bool, |
|
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: |
|
|
|
# how many levels can be done without overflow |
|
nlev = next( |
|
lev |
|
        for lev in range(len(shape), 0, -1)
|
if not is_int64_overflow_possible(shape[:lev]) |
|
) |
|
|
|
# get keys for the first `nlev` levels |
|
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = stride * llab[0].astype("i8", subok=False, copy=False)
    rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
|
|
|
for i in range(1, nlev): |
|
        with np.errstate(divide="ignore"):
|
stride //= shape[i] |
|
lkey += llab[i] * stride |
|
rkey += rlab[i] * stride |
|
|
|
if nlev == len(shape): # all done! |
|
return lkey, rkey |
|
|
|
# densify current keys to avoid overflow |
|
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) |
|
|
|
llab = [lkey] + llab[nlev:] |
|
rlab = [rkey] + rlab[nlev:] |
|
shape = (count,) + shape[nlev:] |
|
|
|
return _get_join_keys(llab, rlab, shape, sort) |
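# Worked example (illustrative): for two levels with shape = (3, 2) and codes
# llab = [array([0, 2]), array([1, 0])], each row encodes as code0 * 2 + code1,
# so lkey = array([1, 4]): a mixed-radix collapse of the per-level codes into
# a single int64 key per row.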
|
|
|
|
|
def _should_fill(lname, rname) -> bool: |
|
if not isinstance(lname, str) or not isinstance(rname, str): |
|
return True |
|
return lname == rname |
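# e.g. _should_fill("key", "key") -> True (the key shares one output column,
# so NA-filling applies), while _should_fill("lkey", "rkey") -> False.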
|
|
|
|
|
def _any(x) -> bool: |
|
return x is not None and com.any_not_none(*x) |
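# e.g. _any(("a", None)) -> True, _any((None, None)) -> False, _any(None) -> False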
|
|
|
|
|
def _validate_operand(obj: DataFrame | Series) -> DataFrame: |
|
if isinstance(obj, ABCDataFrame): |
|
return obj |
|
elif isinstance(obj, ABCSeries): |
|
if obj.name is None: |
|
            raise ValueError("Cannot merge a Series without a name")
|
return obj.to_frame() |
|
else: |
|
        raise TypeError(
            f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
        )
|
|
|
|
|
def _items_overlap_with_suffix( |
|
left: Index, right: Index, suffixes: Suffixes |
|
) -> tuple[Index, Index]: |
|
    """
    Suffixes type validation.

    If two indices overlap, add suffixes to overlapping entries.

    If corresponding suffix is empty, the entry is simply converted to string.

    """
|
if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): |
|
        raise TypeError(
            f"Passing 'suffixes' as a {type(suffixes)} is not supported. "
            "Provide 'suffixes' as a tuple instead."
        )
|
|
|
to_rename = left.intersection(right) |
|
if len(to_rename) == 0: |
|
return left, right |
|
|
|
lsuffix, rsuffix = suffixes |
|
|
|
if not lsuffix and not rsuffix: |
|
        raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
|
|
|
def renamer(x, suffix): |
|
        """
        Rename the left and right indices.

        If there is overlap, and suffix is not None, add
        suffix, otherwise, leave it as-is.

        Parameters
        ----------
        x : original column name
        suffix : str or None

        Returns
        -------
        x : renamed column name
        """
|
if x in to_rename and suffix is not None: |
|
            return f"{x}{suffix}"
|
return x |
|
|
|
lrenamer = partial(renamer, suffix=lsuffix) |
|
rrenamer = partial(renamer, suffix=rsuffix) |
|
|
|
llabels = left._transform_index(lrenamer) |
|
rlabels = right._transform_index(rrenamer) |
|
|
|
dups = [] |
|
if not llabels.is_unique: |
|
        # Only raise when the duplicates are caused by the suffixes; columns
        # that were already duplicated in the input should not raise
|
dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() |
|
if not rlabels.is_unique: |
|
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) |
|
if dups: |
|
        raise MergeError(
            f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
            f"not allowed.",
        )
|
|
|
return llabels, rlabels |
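# Illustrative sketch of the suffixing behavior:
#
#     _items_overlap_with_suffix(Index(["a", "b"]), Index(["b"]), ("_x", "_y"))
#     # -> (Index(['a', 'b_x']), Index(['b_y']))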