import pandas as pd


df = pd.read_csv("df_to_optimisation.csv", index_col=0, engine="python")
df


# First lets tidy the index
df.reset_index(drop=True, inplace=True)
df.head()


# Then we drop the columns we didn't use at the end.
df.drop(columns=["reg_lat", "reg_long","female", "male", "region"], inplace=True)


# Now we check the general info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305621 entries, 0 to 305620
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   funded_amount   305621 non-null  float64
 1   sector          305621 non-null  object 
 2   country_code    305621 non-null  object 
 3   country         305621 non-null  object 
 4   continent_code  305621 non-null  object 
 5   country_lat     305621 non-null  float64
 6   country_long    305621 non-null  float64
 7   female_rate     305621 non-null  float64
dtypes: float64(4), object(4)
memory usage: 18.7+ MB


# Although we have sector, coountry_code, country and continent_code columns as object, their string values are 
# mostly repeating.
df.nunique()
# So it will be very enjoyble to see the memory usage is going down.

funded_amount     120
sector              3
country_code       44
country            44
continent_code      2
country_lat        44
country_long       44
female_rate       288
dtype: int64


# Lets first check the memory usage
df.memory_usage(deep=True)

Index                  128
funded_amount      2444968
sector            19708330
country_code      18031639
country           20044536
continent_code    18031639
country_lat        2444968
country_long       2444968
female_rate        2444968
dtype: int64


# Now we are changing the columns to the category type.
df_opti = df.copy()
df_opti['sector'] = df_opti['sector'].astype('category')
df_opti['country_code'] = df_opti['country_code'].astype('category')
df_opti['country'] = df_opti['country'].astype('category')
df_opti['continent_code'] = df_opti['continent_code'].astype('category')
df_opti.dtypes

funded_amount      float64
sector            category
country_code      category
country           category
continent_code    category
country_lat        float64
country_long       float64
female_rate        float64
dtype: object


# Now lets check the new memory usage
df_opti.memory_usage(deep=True)

Index                 128
funded_amount     2444968
sector             305921
country_code       309289
country            309527
continent_code     305847
country_lat       2444968
country_long      2444968
female_rate       2444968
dtype: int64


# Downcasting of the float columns
df_opti.loc[:,["funded_amount","country_lat","country_long","female_rate"]] = df_opti.loc[:,["funded_amount","country_lat","country_long","female_rate"]].apply(pd.to_numeric, downcast="float")
df_opti.dtypes

funded_amount      float32
sector            category
country_code      category
country           category
continent_code    category
country_lat        float32
country_long       float32
female_rate        float32
dtype: object


# Lets see the memory usage again
df_opti.memory_usage(deep=True)

Index                 128
funded_amount     1222484
sector             305921
country_code       309289
country            309527
continent_code     305847
country_lat       1222484
country_long      1222484
female_rate       1222484
dtype: int64


# The total rate of the memory optimisation

proportion = (df.memory_usage(deep=True).sum()-df_opti.memory_usage(deep=True).sum())/df.memory_usage(deep=True).sum()
print(f"{proportion:0f}")

0.928494


# Now i want to see the top difference on the memory usage by saving it under pickle,
# I DONT ADVICE IT because it can cause adaptive problems between different versions
df_opti.to_pickle("test.pkl")


df = pd.read_pickle("test.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305621 entries, 0 to 305620
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   funded_amount   305621 non-null  float32 
 1   sector          305621 non-null  category
 2   country_code    305621 non-null  category
 3   country         305621 non-null  category
 4   continent_code  305621 non-null  category
 5   country_lat     305621 non-null  float32 
 6   country_long    305621 non-null  float32 
 7   female_rate     305621 non-null  float32 
dtypes: category(4), float32(4)
memory usage: 5.8 MB


df.memory_usage(deep=True)

Index                 128
funded_amount     1222484
sector             305813
country_code       308217
country            308455
continent_code     305739
country_lat       1222484
country_long      1222484
female_rate       1222484
dtype: int64

	funded_amount	sector	country_code	country	region	female	male	continent_code	country_lat	country_long	reg_lat	reg_long	female_rate
2	200.0	Agriculture	IN	India	Maynaguri	1.0	0.0	AS	20.593684	78.962880	26.573829	88.821498	1.0
5	225.0	Agriculture	IN	India	Dhupguri	1.0	0.0	AS	20.593684	78.962880	26.582132	89.005144	1.0
10	250.0	Agriculture	IN	India	Dhupguri	1.0	0.0	AS	20.593684	78.962880	26.582132	89.005144	1.0
11	125.0	Agriculture	IN	India	Maynaguri	1.0	0.0	AS	20.593684	78.962880	26.573829	88.821498	1.0
12	250.0	Agriculture	IN	India	Dhupguri	1.0	0.0	AS	20.593684	78.962880	26.582132	89.005144	1.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
330394	175.0	Retail	TG	Togo	baguida	1.0	0.0	AF	8.619543	0.824782	6.164286	1.326274	1.0
330405	175.0	Retail	TG	Togo	baguida	1.0	0.0	AF	8.619543	0.824782	6.164286	1.326274	1.0
330411	175.0	Retail	TG	Togo	baguida	0.0	1.0	AF	8.619543	0.824782	6.164286	1.326274	0.0
330428	100.0	Retail	TG	Togo	baguida	1.0	0.0	AF	8.619543	0.824782	6.164286	1.326274	1.0
330468	100.0	Retail	TJ	Tajikistan	Khuroson	1.0	0.0	AS	38.861034	71.276093	38.114343	68.667803	1.0

	funded_amount	sector	country_code	country	region	female	continent_code	country_lat	country_long	reg_lat	reg_long	female_rate
0	200.0	Agriculture	IN	India	Maynaguri	1.0	AS	20.593684	78.96288	26.573829	88.821498	1.0
1	225.0	Agriculture	IN	India	Dhupguri	1.0	AS	20.593684	78.96288	26.582132	89.005144	1.0
2	250.0	Agriculture	IN	India	Dhupguri	1.0	AS	20.593684	78.96288	26.582132	89.005144	1.0
3	125.0	Agriculture	IN	India	Maynaguri	1.0	AS	20.593684	78.96288	26.573829	88.821498	1.0
4	250.0	Agriculture	IN	India	Dhupguri	1.0	AS	20.593684	78.96288	26.582132	89.005144	1.0

Actually at the end, our data was cleaned and tidied mostly. But still there are some issues to be handled.¶

Categorizing, removing some columns...¶