# We import the needed libraries
import pandas as pd
import plotly.express as px # for a first impression of our data
from geopy.geocoders import Nominatim # to get the latitudes and longitudes of the countries
from pycountry_convert import country_alpha2_to_continent_code # to map country alpha-2 codes to continent codes
import matplotlib.pyplot as plt # to plot pie charts
import ipywidgets as widgets # to show the pie charts in tabs
import numpy as np
import json
# We load our cleaned file from the previous step again
dropped = pd.read_csv("dropped.csv", engine="python")
# General information check to see whether the row counts and dtypes look right
dropped.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   funded_amount       671205 non-null  float64
 1   loan_amount         671205 non-null  float64
 2   activity            671205 non-null  object
 3   sector              671205 non-null  object
 4   country_code        671197 non-null  object
 5   country             671205 non-null  object
 6   region              614405 non-null  object
 7   term_in_months      671205 non-null  float64
 8   lender_count        671205 non-null  int64
 9   repayment_interval  671205 non-null  object
 10  borrowers_total     666984 non-null  float64
 11  female              666984 non-null  float64
 12  male                666984 non-null  float64
dtypes: float64(6), int64(1), object(6)
memory usage: 66.6+ MB
dropped.head()
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 300.0 | 300.0 | Fruits & Vegetables | Food | PK | Pakistan | Lahore | 12.0 | 12 | irregular | 1.0 | 1.0 | 0.0 |
1 | 575.0 | 575.0 | Rickshaw | Transportation | PK | Pakistan | Lahore | 11.0 | 14 | irregular | 2.0 | 2.0 | 0.0 |
2 | 150.0 | 150.0 | Transportation | Transportation | IN | India | Maynaguri | 30.0 | 6 | bullet | 1.0 | 1.0 | 0.0 |
3 | 200.0 | 200.0 | Embroidery | Arts | PK | Pakistan | Lahore | 11.0 | 8 | irregular | 1.0 | 1.0 | 0.0 |
4 | 400.0 | 400.0 | Milk Sales | Food | PK | Pakistan | Abdul Hakeem | 14.0 | 16 | monthly | 1.0 | 1.0 | 0.0 |
# With the pycountry_convert module we map the ISO alpha-2 country codes to continent codes
def get_continent(col):
    """Return the continent code for an ISO alpha-2 country code, or 'Unknown'."""
    try:
        cn_continent = country_alpha2_to_continent_code(col)
    except Exception:
        cn_continent = 'Unknown'
    return cn_continent
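# A quick sanity check of the helper (a minimal sketch; pycountry_convert maps
# ISO alpha-2 codes such as "PK" to continent codes such as "AS")
for code in ["PK", "IN", "KE"]:
    print(code, "->", get_continent(code))  # expected: AS, AS, AF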
dropped["continent_code"] = [get_continent(x) for x in dropped.country_code]
dropped.head()
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male | continent_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 300.0 | 300.0 | Fruits & Vegetables | Food | PK | Pakistan | Lahore | 12.0 | 12 | irregular | 1.0 | 1.0 | 0.0 | AS |
1 | 575.0 | 575.0 | Rickshaw | Transportation | PK | Pakistan | Lahore | 11.0 | 14 | irregular | 2.0 | 2.0 | 0.0 | AS |
2 | 150.0 | 150.0 | Transportation | Transportation | IN | India | Maynaguri | 30.0 | 6 | bullet | 1.0 | 1.0 | 0.0 | AS |
3 | 200.0 | 200.0 | Embroidery | Arts | PK | Pakistan | Lahore | 11.0 | 8 | irregular | 1.0 | 1.0 | 0.0 | AS |
4 | 400.0 | 400.0 | Milk Sales | Food | PK | Pakistan | Abdul Hakeem | 14.0 | 16 | monthly | 1.0 | 1.0 | 0.0 | AS |
# Let's see whether there is missing data in our new column
dropped.loc[dropped.loc[:,"continent_code"]=="Unknown"]["country_code"].unique()
array(['TL', nan], dtype=object)
# A little research on the internet shows that the country with code TL (Timor-Leste) is in Asia
dropped.loc[dropped.loc[:,"country_code"]=="TL", "continent_code"] = "AS"
# Then we check the NaN values, which all belong to Namibia
dropped.loc[dropped.loc[:,"continent_code"]=="Unknown"]
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male | continent_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
202537 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 6.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
202823 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 6.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
344929 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
351177 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
420953 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
421218 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
487207 | 3000.0 | 3000.0 | Renewable Energy Products | Retail | NaN | Namibia | Katima Mulilo | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
487653 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Oshakati | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
# First of all, Namibia needs a country code; its ISO code "NA" was read in as a NaN value from the CSV,
dropped.loc[dropped.loc[:,"country"]=="Namibia", "country_code"] = "NA"
# then a continent code
dropped.loc[dropped.loc[:,"country"]=="Namibia", "continent_code"] = "AF"
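# The root cause: pandas treats the string "NA" as a missing value by default.
# A sketch of how the read could avoid this (keep_default_na/na_values are
# standard read_csv options; raw_check is just an illustrative name)
raw_check = pd.read_csv("dropped.csv", engine="python",
                        keep_default_na=False, na_values=[""])
raw_check.loc[raw_check["country"] == "Namibia", "country_code"].unique()  # should now show 'NA'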
# Pie chart inputs: funded amount and borrower counts (female, male, total) per continent and sector
grouped = dropped.groupby(["continent_code", "sector" ], as_index = False).agg({"funded_amount": "sum", "female": "sum", "male":"sum", "borrowers_total": "sum" })
grouped
| | continent_code | sector | funded_amount | female | male | borrowers_total |
---|---|---|---|---|---|---|
0 | AF | Agriculture | 32055735.0 | 118185.0 | 71886.0 | 190071.0 |
1 | AF | Arts | 577025.0 | 1885.0 | 407.0 | 2292.0 |
2 | AF | Clothing | 9546150.0 | 33783.0 | 5196.0 | 38979.0 |
3 | AF | Construction | 1333840.0 | 2265.0 | 1816.0 | 4081.0 |
4 | AF | Education | 3723800.0 | 2710.0 | 3483.0 | 6193.0 |
... | ... | ... | ... | ... | ... | ... |
84 | SA | Personal Use | 891300.0 | 1079.0 | 702.0 | 1781.0 |
85 | SA | Retail | 17751525.0 | 50952.0 | 4739.0 | 55691.0 |
86 | SA | Services | 9230175.0 | 24486.0 | 2996.0 | 27482.0 |
87 | SA | Transportation | 1531400.0 | 1007.0 | 1740.0 | 2747.0 |
88 | SA | Wholesale | 74125.0 | 51.0 | 54.0 | 105.0 |
89 rows × 6 columns
# Let's get a rough idea of where most of the money is made
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
fig = px.scatter_3d(grouped, x='borrowers_total', y='sector', z='funded_amount',
color='continent_code')
fig.show()
# We want to see the funded-amount shares per continent and per sector
grouped2 = dropped.groupby("continent_code", as_index = False).agg({"funded_amount": "sum"})
grouped3 = dropped.groupby("sector", as_index = False).agg({"funded_amount": "sum"})
grouped3 = grouped3.sort_values( by=['funded_amount'], ascending= False)
grouped2 = grouped2.sort_values(by=['funded_amount'], ascending= False)
from ipywidgets.embed import embed_minimal_html
out1 = widgets.Output()
out2 = widgets.Output()
data1 = pd.DataFrame(np.random.normal(size = 50))
data2 = pd.DataFrame(np.random.normal(size = 100))
tab = widgets.Tab(children = [out1, out2])
tab.set_title(0, 'Continents')
tab.set_title(1, 'Sectors')
display(tab)
embed_minimal_html('export.html', views=[tab], title='Widgets export')
with out1:
    explode = (0.1, 0.1, 0.0, 0.0, 0.0, 0.0)  # pull out the two biggest slices
    group2_names = grouped2.continent_code.tolist()
    group2_size = grouped2.funded_amount.tolist()
    my_plot = plt.pie(group2_size,
                      labels=group2_names,
                      autopct="%0.f%%",
                      radius=2.4,
                      explode=explode,
                      counterclock=False,
                      colors=["red" if x <= 1 else "grey" for x in range(0, 15)],
                      startangle=90)
    plt.show()
with out2:
    explode = (0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  # pull out the top three slices
    group3_names = grouped3.sector.tolist()
    group3_size = grouped3.funded_amount.tolist()
    my_plot = plt.pie(group3_size,
                      labels=[y if x <= 9 else "" for x, y in enumerate(group3_names)],  # label only the 10 largest sectors
                      autopct="%0.f%%",
                      radius=2.4,
                      explode=explode,
                      counterclock=False,
                      colors=["red" if x <= 2 else "grey" for x in range(0, 15)],
                      startangle=90)
    plt.show()
# Keep only the rows for the sectors (Agriculture, Food, Retail) and continents (Asia, Africa) we need
dropped.drop(dropped[(dropped['sector'] != "Retail") & (dropped['sector'] != "Food") & (dropped['sector'] != "Agriculture")].index, inplace= True)
dropped = dropped.drop(dropped[(dropped['continent_code'] != "AS") & (dropped['continent_code'] != "AF")].index)
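# The same row filtering reads a bit more compactly with isin; re-applying it
# here is a no-op, shown just as an equivalent sketch
dropped = dropped[dropped["sector"].isin(["Retail", "Food", "Agriculture"])
                  & dropped["continent_code"].isin(["AS", "AF"])]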
# The remaining funded amount is almost 40% of the total funded amount
d_d = dropped.groupby(["continent_code", "sector"], as_index= False).agg({"funded_amount":"sum"})
d_d.groupby("continent_code", as_index=False).agg({"funded_amount":"sum"}).funded_amount
# Let's compare the continents and sectors we are going to keep
fig, ax = plt.subplots()
size = 0.9
cmap = plt.get_cmap("tab20c")
outer_colors = cmap(np.arange(3)*4)
inner_colors = cmap(np.array([1, 2, 3, 4,5,6]))
ax.pie(d_d.groupby("continent_code", as_index=False).agg({"funded_amount":"sum"}).funded_amount, radius=3, labels = ["Africa", "Asia"], colors=outer_colors, autopct='%1.1f%%', pctdistance=0.8,
wedgeprops=dict(width=size, edgecolor='w'))
ax.pie(d_d.funded_amount, radius=3-size, labels= d_d["sector"], rotatelabels=False, colors=inner_colors, labeldistance=0.5, autopct='%1.1f%%', pctdistance=0.8,
wedgeprops=dict(width=size, edgecolor='w'))
plt.show()
dropped.drop(columns=["loan_amount", "activity", "borrowers_total", "term_in_months", "lender_count", "repayment_interval"], inplace=True)
# We drop the rows that never got funded (funded_amount == 0)
dropped.drop(dropped[dropped['funded_amount'] == 0].index, inplace=True)
# The geocoder below did not work reliably when run repeatedly, so it stays commented out
# countries = [x for x in dropped.country]
# unique_countries = [x for x in dropped.country.unique()]
# geolocator = Nominatim(user_agent="Fatih")
# u_c_dict = {}
# for country in unique_countries:
#     try:
#         loc = geolocator.geocode(country)
#         u_c_dict[country] = {"latitude": loc.latitude, "longitude": loc.longitude}
#     except:
#         u_c_dict[country] = {"latitude": np.nan, "longitude": np.nan}
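# If live geocoding were still wanted, geopy's RateLimiter mitigates the
# repeated-call problem (a sketch under that assumption, kept commented out like the block above)
# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(Nominatim(user_agent="Fatih").geocode, min_delay_seconds=1)
# u_c_dict = {}
# for country in dropped.country.unique():
#     loc = geocode(country)
#     u_c_dict[country] = {"latitude": loc.latitude if loc else np.nan,
#                          "longitude": loc.longitude if loc else np.nan}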
# The geolocator was working, but a local file is more consistent
data = pd.read_csv('latlong.txt', sep="\t", header=0)
data
# We merge the lat/long dataframe into our original dataframe
dropped = dropped.merge(data, left_on='country', right_on='name', how='left')
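# A quick check that every country matched in the merge (a small sketch; the
# latitude column comes from latlong.txt)
dropped.loc[dropped["latitude"].isna(), "country"].unique()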
# Getting the lats and longs of the regions
regions = [x for x in dropped.region]
unique_regions = [x for x in dropped.region.unique()]
# for i in unique_regions:
#     if i not in u_r_dict:
#         try:
#             response = requests.get(f"https://maps.googleapis.com/maps/api/geocode/json?address={i}&key={your key}")
#             u_r_dict[i] = {"latitude": response.json()["results"][0]["geometry"]["location"]["lat"], "longitude": response.json()["results"][0]["geometry"]["location"]["lng"]}
#         except:
#             u_r_dict[i] = {"latitude": np.nan, "longitude": np.nan}
# with open('u_r_dict.json', 'w') as fp:
#     json.dump(u_r_dict, fp, indent=4)
# Load back the dictionary with the region locations from disk
with open('u_r_dict.json') as f:
    u_r_dict = json.load(f)
len(u_r_dict)
# We write the lats and longs into lists, then we will make new columns out of them
reg_lat = []
reg_lng = []
for x in regions:
    try:
        reg_lat.append(u_r_dict[x]['latitude'])
        reg_lng.append(u_r_dict[x]['longitude'])
    except KeyError:
        reg_lat.append(np.nan)
        reg_lng.append(np.nan)
# Here are our new columns
dropped["reg_lat"] = reg_lat
dropped["reg_long"] = reg_lng
# Reset the index
dropped.reset_index(drop=True, inplace=True)
# Rename some columns to understand them better
dropped.rename(columns={"latitude": "country_lat", "longitude":"country_long"}, inplace=True)
# Drop the columns coming from the merge
dropped.drop(columns="name", inplace=True)
dropped.drop(columns="country_y", inplace=True)
# fixing the name of our original country column
dropped.rename(columns={"country_x": "country"}, inplace=True)
dropped.head()
# We fill the NaN values in the reg_lat and reg_long columns with the country lats and longs;
# on the map there will not be much difference, so we will not lose value
dropped["reg_lat"] = dropped["reg_lat"].fillna(dropped["country_lat"])
dropped["reg_long"] = dropped["reg_long"].fillna(dropped["country_long"])
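# Quick sanity check that no coordinates are left missing (a small sketch)
dropped[["reg_lat", "reg_long"]].isna().sum()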
# We split our dataframe into three pieces so it is easier to show them with the Kepler.gl module
df_agriculture = dropped.loc[dropped.loc[:,"sector"]=="Agriculture",:]
df_food = dropped.loc[dropped.loc[:,"sector"]=="Food",:]
df_retail = dropped.loc[dropped.loc[:,"sector"]=="Retail",:]
dropped.to_csv("df.csv")
df_agriculture.to_csv("agri.csv")
df_food.to_csv("food.csv")
df_retail.to_csv("retail.csv")
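# If the extra index column in the CSVs is unwanted, to_csv accepts index=False
# (a sketch; same file names as above)
# dropped.to_csv("df.csv", index=False)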