# We import the needed libraries
import pandas as pd
import plotly.express as px # for a first impression of our data
from geopy.geocoders import Nominatim # to get the latitudes and longitudes of the countries
from pycountry_convert import country_alpha2_to_continent_code # to map country alpha-2 codes to continent codes
import matplotlib.pyplot as plt # to plot pie charts
import ipywidgets as widgets # to show the pie charts in tabs
import numpy as np
import json
# We load our cleaned file from the previous step again
dropped = pd.read_csv("dropped.csv", engine="python")
# General information check to see whether the row counts and dtypes look right
dropped.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   funded_amount       671205 non-null  float64
 1   loan_amount         671205 non-null  float64
 2   activity            671205 non-null  object
 3   sector              671205 non-null  object
 4   country_code        671197 non-null  object
 5   country             671205 non-null  object
 6   region              614405 non-null  object
 7   term_in_months      671205 non-null  float64
 8   lender_count        671205 non-null  int64
 9   repayment_interval  671205 non-null  object
 10  borrowers_total     666984 non-null  float64
 11  female              666984 non-null  float64
 12  male                666984 non-null  float64
dtypes: float64(6), int64(1), object(6)
memory usage: 66.6+ MB
dropped.head()
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 300.0 | 300.0 | Fruits & Vegetables | Food | PK | Pakistan | Lahore | 12.0 | 12 | irregular | 1.0 | 1.0 | 0.0 |
1 | 575.0 | 575.0 | Rickshaw | Transportation | PK | Pakistan | Lahore | 11.0 | 14 | irregular | 2.0 | 2.0 | 0.0 |
2 | 150.0 | 150.0 | Transportation | Transportation | IN | India | Maynaguri | 30.0 | 6 | bullet | 1.0 | 1.0 | 0.0 |
3 | 200.0 | 200.0 | Embroidery | Arts | PK | Pakistan | Lahore | 11.0 | 8 | irregular | 1.0 | 1.0 | 0.0 |
4 | 400.0 | 400.0 | Milk Sales | Food | PK | Pakistan | Abdul Hakeem | 14.0 | 16 | monthly | 1.0 | 1.0 | 0.0 |
# With the pycountry_convert module we map the ISO alpha-2 country codes to continent codes
def get_continent(col):
    """Return the continent code for an ISO alpha-2 country code, or 'Unknown'."""
    try:
        cn_continent = country_alpha2_to_continent_code(col)
    except Exception:
        cn_continent = 'Unknown'
    return cn_continent
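# A quick sanity check of the helper (a minimal sketch; pycountry_convert maps
# ISO alpha-2 codes such as "PK" to continent codes such as "AS")
for code in ["PK", "IN", "KE"]:
    print(code, "->", get_continent(code))  # expected: AS, AS, AF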
dropped["continent_code"] = [get_continent(x) for x in dropped.country_code]
dropped.head()
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male | continent_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 300.0 | 300.0 | Fruits & Vegetables | Food | PK | Pakistan | Lahore | 12.0 | 12 | irregular | 1.0 | 1.0 | 0.0 | AS |
1 | 575.0 | 575.0 | Rickshaw | Transportation | PK | Pakistan | Lahore | 11.0 | 14 | irregular | 2.0 | 2.0 | 0.0 | AS |
2 | 150.0 | 150.0 | Transportation | Transportation | IN | India | Maynaguri | 30.0 | 6 | bullet | 1.0 | 1.0 | 0.0 | AS |
3 | 200.0 | 200.0 | Embroidery | Arts | PK | Pakistan | Lahore | 11.0 | 8 | irregular | 1.0 | 1.0 | 0.0 | AS |
4 | 400.0 | 400.0 | Milk Sales | Food | PK | Pakistan | Abdul Hakeem | 14.0 | 16 | monthly | 1.0 | 1.0 | 0.0 | AS |
# Let's see whether there is missing data in our new column
dropped.loc[dropped.loc[:,"continent_code"]=="Unknown"]["country_code"].unique()
array(['TL', nan], dtype=object)
# A little research on the internet shows that the country with code TL (Timor-Leste) is in Asia
dropped.loc[dropped.loc[:,"country_code"]=="TL", "continent_code"] = "AS"
# Then we check the NaN values, which all belong to Namibia
dropped.loc[dropped.loc[:,"continent_code"]=="Unknown"]
| | funded_amount | loan_amount | activity | sector | country_code | country | region | term_in_months | lender_count | repayment_interval | borrowers_total | female | male | continent_code |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
202537 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 6.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
202823 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 6.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
344929 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
351177 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
420953 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | EEnhana | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
421218 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Rundu | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
487207 | 3000.0 | 3000.0 | Renewable Energy Products | Retail | NaN | Namibia | Katima Mulilo | 7.0 | 70 | bullet | 1.0 | 0.0 | 1.0 | Unknown |
487653 | 3000.0 | 3000.0 | Wholesale | Wholesale | NaN | Namibia | Oshakati | 7.0 | 70 | bullet | 1.0 | 1.0 | 0.0 | Unknown |
# First of all, Namibia needs a country code; its ISO code "NA" was read in as a NaN value from the CSV,
dropped.loc[dropped.loc[:,"country"]=="Namibia", "country_code"] = "NA"
# then a continent code
dropped.loc[dropped.loc[:,"country"]=="Namibia", "continent_code"] = "AF"
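# The root cause: pandas treats the string "NA" as a missing value by default.
# A sketch of how the read could avoid this (keep_default_na/na_values are
# standard read_csv options; raw_check is just an illustrative name)
raw_check = pd.read_csv("dropped.csv", engine="python",
                        keep_default_na=False, na_values=[""])
raw_check.loc[raw_check["country"] == "Namibia", "country_code"].unique()  # should now show 'NA'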
# Pie chart inputs: funded amount and borrower counts (female, male, total) per continent and sector
grouped = dropped.groupby(["continent_code", "sector" ], as_index = False).agg({"funded_amount": "sum", "female": "sum", "male":"sum", "borrowers_total": "sum" })
grouped
| | continent_code | sector | funded_amount | female | male | borrowers_total |
---|---|---|---|---|---|---|
0 | AF | Agriculture | 32055735.0 | 118185.0 | 71886.0 | 190071.0 |
1 | AF | Arts | 577025.0 | 1885.0 | 407.0 | 2292.0 |
2 | AF | Clothing | 9546150.0 | 33783.0 | 5196.0 | 38979.0 |
3 | AF | Construction | 1333840.0 | 2265.0 | 1816.0 | 4081.0 |
4 | AF | Education | 3723800.0 | 2710.0 | 3483.0 | 6193.0 |
... | ... | ... | ... | ... | ... | ... |
84 | SA | Personal Use | 891300.0 | 1079.0 | 702.0 | 1781.0 |
85 | SA | Retail | 17751525.0 | 50952.0 | 4739.0 | 55691.0 |
86 | SA | Services | 9230175.0 | 24486.0 | 2996.0 | 27482.0 |
87 | SA | Transportation | 1531400.0 | 1007.0 | 1740.0 | 2747.0 |
88 | SA | Wholesale | 74125.0 | 51.0 | 54.0 | 105.0 |
89 rows × 6 columns
# Let's get a rough idea of where most of the money is made
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
fig = px.scatter_3d(grouped, x='borrowers_total', y='sector', z='funded_amount',
color='continent_code')
fig.show()
# We want to see the funded-amount shares per continent and per sector
grouped2 = dropped.groupby("continent_code", as_index = False).agg({"funded_amount": "sum"})
grouped3 = dropped.groupby("sector", as_index = False).agg({"funded_amount": "sum"})
grouped3 = grouped3.sort_values( by=['funded_amount'], ascending= False)
grouped2 = grouped2.sort_values(by=['funded_amount'], ascending= False)
from ipywidgets.embed import embed_minimal_html
out1 = widgets.Output()
out2 = widgets.Output()
data1 = pd.DataFrame(np.random.normal(size = 50))
data2 = pd.DataFrame(np.random.normal(size = 100))
tab = widgets.Tab(children = [out1, out2])
tab.set_title(0, 'Continents')
tab.set_title(1, 'Sectors')
display(tab)
embed_minimal_html('export.html', views=[tab], title='Widgets export')
with out1:
    explode = (0.1, 0.1, 0.0, 0.0, 0.0, 0.0)  # pull out the two biggest slices
    group2_names = grouped2.continent_code.tolist()
    group2_size = grouped2.funded_amount.tolist()
    my_plot = plt.pie(group2_size,
                      labels=group2_names,
                      autopct="%0.f%%",
                      radius=2.4,
                      explode=explode,
                      counterclock=False,
                      colors=["red" if x <= 1 else "grey" for x in range(0, 15)],
                      startangle=90)
    plt.show()
with out2:
    explode = (0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)  # pull out the top three slices
    group3_names = grouped3.sector.tolist()
    group3_size = grouped3.funded_amount.tolist()
    my_plot = plt.pie(group3_size,
                      labels=[y if x <= 9 else "" for x, y in enumerate(group3_names)],  # label only the 10 largest sectors
                      autopct="%0.f%%",
                      radius=2.4,
                      explode=explode,
                      counterclock=False,
                      colors=["red" if x <= 2 else "grey" for x in range(0, 15)],
                      startangle=90)
    plt.show()
# Keep only the rows for the sectors (Agriculture, Food, Retail) and continents (Asia, Africa) we need
dropped.drop(dropped[(dropped['sector'] != "Retail") & (dropped['sector'] != "Food") & (dropped['sector'] != "Agriculture")].index, inplace= True)
dropped = dropped.drop(dropped[(dropped['continent_code'] != "AS") & (dropped['continent_code'] != "AF")].index)
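# The same row filtering reads a bit more compactly with isin; re-applying it
# here is a no-op, shown just as an equivalent sketch
dropped = dropped[dropped["sector"].isin(["Retail", "Food", "Agriculture"])
                  & dropped["continent_code"].isin(["AS", "AF"])]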
# The remaining funded amount is almost 40% of the total funded amount
d_d = dropped.groupby(["continent_code", "sector"], as_index= False).agg({"funded_amount":"sum"})
d_d.groupby("continent_code", as_index=False).agg({"funded_amount":"sum"}).funded_amount
# Let's compare the continents and sectors we are going to keep
fig, ax = plt.subplots()
size = 0.9
cmap = plt.get_cmap("tab20c")
outer_colors = cmap(np.arange(3)*4)
inner_colors = cmap(np.array([1, 2, 3, 4,5,6]))
ax.pie(d_d.groupby("continent_code", as_index=False).agg({"funded_amount":"sum"}).funded_amount, radius=3, labels = ["Africa", "Asia"], colors=outer_colors, autopct='%1.1f%%', pctdistance=0.8,
wedgeprops=dict(width=size, edgecolor='w'))
ax.pie(d_d.funded_amount, radius=3-size, labels= d_d["sector"], rotatelabels=False, colors=inner_colors, labeldistance=0.5, autopct='%1.1f%%', pctdistance=0.8,
wedgeprops=dict(width=size, edgecolor='w'))
plt.show()
dropped.drop(columns=["loan_amount", "activity", "borrowers_total", "term_in_months", "lender_count", "repayment_interval"], inplace=True)
# We drop the rows that never got funded (funded_amount == 0)
dropped.drop(dropped[dropped['funded_amount'] == 0].index, inplace=True)
# The geocoder below did not work reliably when run repeatedly, so it stays commented out
# countries = [x for x in dropped.country]
# unique_countries = [x for x in dropped.country.unique()]
# geolocator = Nominatim(user_agent="Fatih")
# u_c_dict = {}
# for country in unique_countries:
#     try:
#         loc = geolocator.geocode(country)
#         u_c_dict[country] = {"latitude": loc.latitude, "longitude": loc.longitude}
#     except:
#         u_c_dict[country] = {"latitude": np.nan, "longitude": np.nan}
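# If live geocoding were still wanted, geopy's RateLimiter mitigates the
# repeated-call problem (a sketch under that assumption, kept commented out like the block above)
# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(Nominatim(user_agent="Fatih").geocode, min_delay_seconds=1)
# u_c_dict = {}
# for country in dropped.country.unique():
#     loc = geocode(country)
#     u_c_dict[country] = {"latitude": loc.latitude if loc else np.nan,
#                          "longitude": loc.longitude if loc else np.nan}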
# The geolocator was working, but a local file is more consistent
data = pd.read_csv('latlong.txt', sep="\t", header=0)
data
# We merge the lat/long dataframe into our original dataframe
dropped = dropped.merge(data, left_on='country', right_on='name', how='left')
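# A quick check that every country matched in the merge (a small sketch; the
# latitude column comes from latlong.txt)
dropped.loc[dropped["latitude"].isna(), "country"].unique()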
# Getting the lats and longs of the regions
regions = [x for x in dropped.region]
unique_regions = [x for x in dropped.region.unique()]
# for i in unique_regions:
#     if i not in u_r_dict:
#         try:
#             response = requests.get(f"https://maps.googleapis.com/maps/api/geocode/json?address={i}&key={your key}")
#             u_r_dict[i] = {"latitude": response.json()["results"][0]["geometry"]["location"]["lat"], "longitude": response.json()["results"][0]["geometry"]["location"]["lng"]}
#         except:
#             u_r_dict[i] = {"latitude": np.nan, "longitude": np.nan}
# with open('u_r_dict.json', 'w') as fp:
#     json.dump(u_r_dict, fp, indent=4)
# Load back the dictionary with the region locations from disk
with open('u_r_dict.json') as f:
    u_r_dict = json.load(f)
len(u_r_dict)
# We write the lats and longs into lists, then we will make new columns out of them
reg_lat = []
reg_lng = []
for x in regions:
    try:
        reg_lat.append(u_r_dict[x]['latitude'])
        reg_lng.append(u_r_dict[x]['longitude'])
    except KeyError:
        reg_lat.append(np.nan)
        reg_lng.append(np.nan)
# Here are our new columns
dropped["reg_lat"] = reg_lat
dropped["reg_long"] = reg_lng
# Reset the index
dropped.reset_index(drop=True, inplace=True)
# Rename some columns to understand them better
dropped.rename(columns={"latitude": "country_lat", "longitude":"country_long"}, inplace=True)
# Drop the columns coming from the merge
dropped.drop(columns="name", inplace=True)
dropped.drop(columns="country_y", inplace=True)
# fixing the name of our original country column
dropped.rename(columns={"country_x": "country"}, inplace=True)
dropped.head()
# We fill the NaN values in the reg_lat and reg_long columns with the country lats and longs;
# on the map there will not be much difference, so we will not lose value
dropped["reg_lat"] = dropped["reg_lat"].fillna(dropped["country_lat"])
dropped["reg_long"] = dropped["reg_long"].fillna(dropped["country_long"])
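# Quick sanity check that no coordinates are left missing (a small sketch)
dropped[["reg_lat", "reg_long"]].isna().sum()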
# We split our dataframe into three pieces so it is easier to show them with the Kepler.gl module
df_agriculture = dropped.loc[dropped.loc[:,"sector"]=="Agriculture",:]
df_food = dropped.loc[dropped.loc[:,"sector"]=="Food",:]
df_retail = dropped.loc[dropped.loc[:,"sector"]=="Retail",:]
dropped.to_csv("df.csv")
df_agriculture.to_csv("agri.csv")
df_food.to_csv("food.csv")
df_retail.to_csv("retail.csv")
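# If the extra index column in the CSVs is unwanted, to_csv accepts index=False
# (a sketch; same file names as above)
# dropped.to_csv("df.csv", index=False)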