import time
import pandas
import numpy as np
import modin.pandas as pd
import modin.config as cfg
import ray
# Report the library versions and the active Modin configuration so that the
# timings printed further down can be put into context.
print(f"\tPandas version: {pandas.__version__}")
print(f"\tModin version: {pd.__version__}")
print(f"\tCpuCount: {cfg.CpuCount.get()}")
print(f"\tEngine: {cfg.Engine.get()}")
print(f"\tNPartitions: {cfg.NPartitions.get()}")

# Start Ray explicitly, setting __MODIN_AUTOIMPORT_PANDAS__=1 through the
# runtime environment's env_vars.
ray.init(
    runtime_env={
        'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'},
    },
)

# Build a 1,000,000 x 13 frame of random integers in [0, 100) and write it to
# foo.csv (without the index) to serve as the benchmark input.
pandas_df = pandas.DataFrame(
    np.random.randint(low=0, high=100, size=(1_000_000, 13)),
)
pandas_df.to_csv("foo.csv", index=False)
def read_csv_with_pandas():
    """Read ``foo.csv`` with pandas, print the elapsed time, and return the frame.

    The first CSV column is used as the index (``index_col=0``). The elapsed
    time of the ``read_csv`` call is printed in seconds, rounded to three
    decimal places.

    Returns:
        pandas.DataFrame: The frame parsed from ``foo.csv``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump when the system clock changes.
    started = time.perf_counter()
    frame = pandas.read_csv("foo.csv", index_col=0)
    elapsed = time.perf_counter() - started
    print(f"Time to read_csv with Pandas: {round(elapsed, 3)} seconds")
    return frame
def read_csv_with_modin():
    """Read ``foo.csv`` with Modin, print the elapsed time, and return the frame.

    The first CSV column is used as the index (``index_col=0``). The elapsed
    time of the ``read_csv`` call is printed in seconds, rounded to three
    decimal places.

    Returns:
        The Modin DataFrame produced by ``pd.read_csv`` for ``foo.csv``.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump when the system clock changes.
    started = time.perf_counter()
    # NOTE(review): only the read_csv call itself is timed; if Modin defers or
    # parallelizes work past the return of this call, that is not captured
    # here -- confirm this is the intended measurement.
    frame = pd.read_csv("foo.csv", index_col=0)
    elapsed = time.perf_counter() - started
    print(f"Time to read_csv with Modin: {round(elapsed, 3)} seconds")
    return frame
# Run the pandas/Modin comparison five times back to back. In the sample
# output recorded below, the first Modin read is much slower than the
# following ones, so several repetitions are needed for a fair picture.
for _ in range(5):
    pandas_df = read_csv_with_pandas()
    modin_df = read_csv_with_modin()
Pandas version: 1.5.1
Modin version: 0.16.0+24.g11ba4811
CpuCount: 8
Engine: Ray
NPartitions: 8
Time to read_csv with Pandas: 0.708 seconds
Time to read_csv with Modin: 4.132 seconds
Time to read_csv with Pandas: 0.735 seconds
Time to read_csv with Modin: 0.37 seconds
Time to read_csv with Pandas: 0.646 seconds
Time to read_csv with Modin: 0.377 seconds
Time to read_csv with Pandas: 0.673 seconds
Time to read_csv with Modin: 0.371 seconds
Time to read_csv with Pandas: 0.672 seconds
Time to read_csv with Modin: 0.379 seconds
Yes and no… First of all, it would affect other processes, too. Second, there is no guarantee that all environment variables will be inherited by Ray workers, as this is an implementation detail (not a contract), as far as I’m aware.
Oh, and this won’t work in the case where Ray is already running locally and we simply connect to it.
With all that said, I guess you can try to do this optimization while leaving a comment pointing to this discussion, so if something breaks in the future we’d know where to start digging.
Can you please notify us here when your change is merged into master, so that we stay in sync?
@rkooo567, do you have any notes regarding this in the Ray documentation? This behavior can really confuse the user.