In the example below, for each row I want to get the index (date) of the most recent previous row whose "lower" value is greater than or equal to the current row's "upper" value. I can do this and get the expected result, but it is not truly vectorized and is very inefficient for larger DataFrames.
import pandas as pd
# Sample DataFrame
data = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
        'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
df = pd.DataFrame(data=data)
df['DATE'] = pd.date_range('2020-01-01', periods=len(data['lower']))
df['DATE'] = pd.to_datetime(df['DATE'])
df.set_index('DATE', inplace=True)

# new column that contains the most recent index of previous rows,
# where the previous "lower" is greater than or equal to the current "upper"
def get_most_recent_index(row):
    previous_indices = df.loc[:row.name - pd.Timedelta(minutes=1)]
    recent_index = previous_indices[previous_indices['lower'] >= row['upper']].index.max()
    return recent_index

df['prev'] = df.apply(get_most_recent_index, axis=1)
print(df)
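For this small sample, the printed result should look roughly like this ('prev' is NaT where no earlier row has a 'lower' greater than or equal to the current 'upper'):

            lower  upper       prev
DATE
2020-01-01      7      2        NaT
2020-01-02      1      3 2020-01-01
2020-01-03      6      4 2020-01-01
2020-01-04      1      5 2020-01-03
2020-01-05      1      6 2020-01-03
2020-01-06      1      7 2020-01-01
2020-01-07      1      8        NaT
2020-01-08     11      9        NaT
2020-01-09      1     10 2020-01-08
2020-01-10      1     11 2020-01-08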
How can I make this as efficient as possible?
EDIT:
First of all, thanks to everyone for the answers.
Regarding performance among the four viable solutions, the clear winner is the bisect approach proposed by Andrej Kesely. I had to exclude pyjanitor, as with any dataset approaching the size of mine it immediately exhausts all available memory.
baseline: 1min 35s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
bisect: 1.76 s ± 82.5 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
enumerate: 1min 13s ± 2.17 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
import pandas as pd
import numpy as np
from bisect import bisect_left
import janitor
def get_sample_df(rows=100_000):
    # Sample DataFrame
    data = {'lower': np.random.default_rng(seed=1).uniform(1, 100, rows),
            'upper': np.random.default_rng(seed=2).uniform(1, 100, rows)}
    df = pd.DataFrame(data=data)
    df = df.astype(int)
    df['DATE'] = pd.date_range('2020-01-01', periods=len(data['lower']), freq="min")
    df['DATE'] = pd.to_datetime(df['DATE'])
    df.set_index('DATE', inplace=True)
    return df
def get_baseline():
    df = get_sample_df()

    # new column that contains the most recent index of previous rows,
    # where the previous "lower" is greater than or equal to the current "upper"
    def get_most_recent_index(row):
        previous_indices = df.loc[:row.name - pd.Timedelta(minutes=1)]
        recent_index = previous_indices[previous_indices['lower'] >= row['upper']].index.max()
        return recent_index

    df['prev'] = df.apply(get_most_recent_index, axis=1)
    return df
def get_pyjanitor():
    df = get_sample_df()
    df.reset_index(inplace=True)
    # reset the DATE index; after the operation you can
    # set the original DATE column back as the index
    left_df = df.assign(index_prev=df.index)
    right_df = df.assign(index_next=df.index)
    out = (left_df
           .conditional_join(
               right_df,
               ('lower', 'upper', '>='),
               ('index_prev', 'index_next', '<'),
               df_columns='index_prev',
               right_columns=['index_next', 'lower', 'upper'])
           )
    # based on the matches, we may have multiple returns
    # what we need is the closest to the current row
    closest = out.index_next - out.index_prev
    grouper = [out.index_next, out.lower, out.upper]
    min_closest = closest.groupby(grouper).transform('min')
    closest = closest == min_closest
    # we have our matches, which are defined by `index_prev`
    # use index_prev to get the relevant DATE
    prev = out.loc[closest, 'index_prev']
    prev = df.loc[prev, 'DATE'].array  # avoid index alignment here
    index_next = out.loc[closest, 'index_next']
    # now assign back to df, based on index_next and prev
    prev = pd.Series(prev, index=index_next)
    df = df.assign(prev=prev)
    return df
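# Example of the closest-match selection in get_pyjanitor above, with
# hypothetical matching pairs: a current row (index_next=3) may match several
# previous rows, e.g.
#   index_prev  index_next   gap = index_next - index_prev
#        0           3        3
#        2           3        1   <- kept: most recent previous row
# grouping by index_next and keeping the minimum gap selects index_prev=2.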
def get_bisect():
    df = get_sample_df()

    def get_prev_bs(lower, upper, _date):
        uniq_lower = sorted(set(lower))
        last_seen = {}

        for l, u, d in zip(lower, upper, _date):
            # find index of element that is >= u
            idx = bisect_left(uniq_lower, u)

            max_date = None
            for lv in uniq_lower[idx:]:
                if lv in last_seen:
                    if max_date is None:
                        max_date = last_seen[lv]
                    elif last_seen[lv] > max_date:
                        max_date = last_seen[lv]
            yield max_date

            last_seen[l] = d

    df["prev"] = list(get_prev_bs(df["lower"], df["upper"], df.index))
    return df
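# Why the bisect approach wins: bisect_left binary-searches the sorted unique
# "lower" values, so each row only scans the (usually short) tail of values
# that can still satisfy lower >= upper.  A minimal sketch with made-up values:
#   uniq_lower = [1, 6, 7, 11]        # sorted unique "lower" values seen so far
#   idx = bisect_left(uniq_lower, 5)  # first position with a value >= 5 -> idx == 1
#   uniq_lower[idx:]                  # [6, 7, 11]; only these can match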
def get_enumerate():
    df = get_sample_df()
    df.reset_index(inplace=True)
    date_list = df["DATE"].values.tolist()
    lower_list = df["lower"].values.tolist()
    upper_list = df["upper"].values.tolist()

    new_list = []
    for i, (x, y) in enumerate(zip(lower_list, upper_list)):
        if i == 0:
            new_list.append(None)
        else:
            if any(j >= y for j in lower_list[0:i]):
                for ll, dl in zip(reversed(lower_list[0:i]), reversed(date_list[0:i])):
                    if ll >= y:
                        new_list.append(dl)
                        break
                    else:
                        continue
            else:
                new_list.append(None)

    df['prev'] = new_list
    df['prev'] = pd.to_datetime(df['prev'])
    return df
print("baseline:")
%timeit -n 2 -r 2 get_baseline()
# Unable to allocate 37.2 GiB for an array with shape (4994299505,) and data type int64
# print("pyjanitor:")
# %timeit -n 2 get_pyjanitor()
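# Back-of-the-envelope check on that failure: the reported array length is
# presumably the number of matching pairs the self-join materializes, and at
# 8 bytes per int64 element that array alone needs
#   4_994_299_505 * 8 / 1024**3  ->  ~37.2 GiB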
print("bisect:")
%timeit -n 2 -r 2 get_bisect()
print("enumerate:")
%timeit -n 2 -r 2 get_enumerate()