import pandas as pd
df = pd.read_csv("df_to_optimisation.csv", index_col=0, engine="python")
df
funded_amount | sector | country_code | country | region | female | male | continent_code | country_lat | country_long | reg_lat | reg_long | female_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 200.0 | Agriculture | IN | India | Maynaguri | 1.0 | 0.0 | AS | 20.593684 | 78.962880 | 26.573829 | 88.821498 | 1.0 |
5 | 225.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.962880 | 26.582132 | 89.005144 | 1.0 |
10 | 250.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.962880 | 26.582132 | 89.005144 | 1.0 |
11 | 125.0 | Agriculture | IN | India | Maynaguri | 1.0 | 0.0 | AS | 20.593684 | 78.962880 | 26.573829 | 88.821498 | 1.0 |
12 | 250.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.962880 | 26.582132 | 89.005144 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
330394 | 175.0 | Retail | TG | Togo | baguida | 1.0 | 0.0 | AF | 8.619543 | 0.824782 | 6.164286 | 1.326274 | 1.0 |
330405 | 175.0 | Retail | TG | Togo | baguida | 1.0 | 0.0 | AF | 8.619543 | 0.824782 | 6.164286 | 1.326274 | 1.0 |
330411 | 175.0 | Retail | TG | Togo | baguida | 0.0 | 1.0 | AF | 8.619543 | 0.824782 | 6.164286 | 1.326274 | 0.0 |
330428 | 100.0 | Retail | TG | Togo | baguida | 1.0 | 0.0 | AF | 8.619543 | 0.824782 | 6.164286 | 1.326274 | 1.0 |
330468 | 100.0 | Retail | TJ | Tajikistan | Khuroson | 1.0 | 0.0 | AS | 38.861034 | 71.276093 | 38.114343 | 68.667803 | 1.0 |
305621 rows × 13 columns
# First lets tidy the index
df.reset_index(drop=True, inplace=True)
df.head()
funded_amount | sector | country_code | country | region | female | male | continent_code | country_lat | country_long | reg_lat | reg_long | female_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 200.0 | Agriculture | IN | India | Maynaguri | 1.0 | 0.0 | AS | 20.593684 | 78.96288 | 26.573829 | 88.821498 | 1.0 |
1 | 225.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.96288 | 26.582132 | 89.005144 | 1.0 |
2 | 250.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.96288 | 26.582132 | 89.005144 | 1.0 |
3 | 125.0 | Agriculture | IN | India | Maynaguri | 1.0 | 0.0 | AS | 20.593684 | 78.96288 | 26.573829 | 88.821498 | 1.0 |
4 | 250.0 | Agriculture | IN | India | Dhupguri | 1.0 | 0.0 | AS | 20.593684 | 78.96288 | 26.582132 | 89.005144 | 1.0 |
# Then we drop the columns we didn't use at the end.
df.drop(columns=["reg_lat", "reg_long","female", "male", "region"], inplace=True)
# Now we check the general info
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 305621 entries, 0 to 305620 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 funded_amount 305621 non-null float64 1 sector 305621 non-null object 2 country_code 305621 non-null object 3 country 305621 non-null object 4 continent_code 305621 non-null object 5 country_lat 305621 non-null float64 6 country_long 305621 non-null float64 7 female_rate 305621 non-null float64 dtypes: float64(4), object(4) memory usage: 18.7+ MB
# Although we have sector, coountry_code, country and continent_code columns as object, their string values are
# mostly repeating.
df.nunique()
# So it will be very enjoyble to see the memory usage is going down.
funded_amount 120 sector 3 country_code 44 country 44 continent_code 2 country_lat 44 country_long 44 female_rate 288 dtype: int64
# Lets first check the memory usage
df.memory_usage(deep=True)
Index 128 funded_amount 2444968 sector 19708330 country_code 18031639 country 20044536 continent_code 18031639 country_lat 2444968 country_long 2444968 female_rate 2444968 dtype: int64
# Now we are changing the columns to the category type.
df_opti = df.copy()
df_opti['sector'] = df_opti['sector'].astype('category')
df_opti['country_code'] = df_opti['country_code'].astype('category')
df_opti['country'] = df_opti['country'].astype('category')
df_opti['continent_code'] = df_opti['continent_code'].astype('category')
df_opti.dtypes
funded_amount float64 sector category country_code category country category continent_code category country_lat float64 country_long float64 female_rate float64 dtype: object
# Now lets check the new memory usage
df_opti.memory_usage(deep=True)
Index 128 funded_amount 2444968 sector 305921 country_code 309289 country 309527 continent_code 305847 country_lat 2444968 country_long 2444968 female_rate 2444968 dtype: int64
# Downcasting of the float columns
df_opti.loc[:,["funded_amount","country_lat","country_long","female_rate"]] = df_opti.loc[:,["funded_amount","country_lat","country_long","female_rate"]].apply(pd.to_numeric, downcast="float")
df_opti.dtypes
funded_amount float32 sector category country_code category country category continent_code category country_lat float32 country_long float32 female_rate float32 dtype: object
# Lets see the memory usage again
df_opti.memory_usage(deep=True)
Index 128 funded_amount 1222484 sector 305921 country_code 309289 country 309527 continent_code 305847 country_lat 1222484 country_long 1222484 female_rate 1222484 dtype: int64
# The total rate of the memory optimisation
proportion = (df.memory_usage(deep=True).sum()-df_opti.memory_usage(deep=True).sum())/df.memory_usage(deep=True).sum()
print(f"{proportion:0f}")
0.928494
# Now i want to see the top difference on the memory usage by saving it under pickle,
# I DONT ADVICE IT because it can cause adaptive problems between different versions
df_opti.to_pickle("test.pkl")
df = pd.read_pickle("test.pkl")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 305621 entries, 0 to 305620 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 funded_amount 305621 non-null float32 1 sector 305621 non-null category 2 country_code 305621 non-null category 3 country 305621 non-null category 4 continent_code 305621 non-null category 5 country_lat 305621 non-null float32 6 country_long 305621 non-null float32 7 female_rate 305621 non-null float32 dtypes: category(4), float32(4) memory usage: 5.8 MB
df.memory_usage(deep=True)
Index 128 funded_amount 1222484 sector 305813 country_code 308217 country 308455 continent_code 305739 country_lat 1222484 country_long 1222484 female_rate 1222484 dtype: int64