%%html
<style>
/* Notebook-wide presentation tweaks (rendered by the %%html cell magic). */
/* Enlarge MathJax-rendered equations. */
.MathJax {
font-size: 1.3em;
}
/* Right-align all cells of rendered HTML tables (e.g. DataFrame output). */
.rendered_html tr, .rendered_html th, .rendered_html td {
text-align: right !important;
}
a[data-snippet-code]::after {
background: #262931 !important;
}
/* Custom large centered title class used in markdown cells. */
.titre-pers{
font-family: arial;
font-size: 250% !important;
line-height: 200% !important;
text-align: center !important;
color: #4c8be2 !important;
}
/* Heading colors: the three selectors per rule cover classic Notebook
   (.rendered_html, .text_cell_render) and JupyterLab (.jp-RenderedHTMLCommon). */
.jp-RenderedHTMLCommon h1,
.rendered_html h1,
.text_cell_render h1 {
color: #86bed9 !important;
line-height: 150% !important;
}
.jp-RenderedHTMLCommon h2,
.rendered_html h2,
.text_cell_render h2 {
color: #b08c20 !important;
padding-left: .5rem !important;
line-height: 150% !important;
}
.jp-RenderedHTMLCommon h3,
.rendered_html h3,
.text_cell_render h3 {
color: #3aa237 !important;
padding-left: 1rem !important;
line-height: 150% !important;
font-size: 120% !important;
}
.jp-RenderedHTMLCommon h4,
.rendered_html h4,
.text_cell_render h4 {
color: #29858a !important;
padding-left: 2rem !important;
font-size: 110% !important;
}
.jp-RenderedHTMLCommon h5,
.rendered_html h5,
.text_cell_render h5 {
color: #21417d !important;
padding-left: 2.5rem !important;
font-size: 110% !important;
}
.jp-RenderedHTMLCommon h6,
.rendered_html h6,
.text_cell_render h6 {
color: #d8a802c2 !important;
padding-left: 1rem !important;
font-family: sans-serif !important;
font-size: 120% !important;
font-weight: normal !important;
font-style: normal !important;
}
/* Emphasis helpers used inline in markdown cells. */
.renf{
font-size: 18px !important;
font-family: Arial !important;
color: #14db9a !important;
}
.renf2{
font-size: 18px !important;
font-family: Arial !important;
color: orangered !important;
}
</style>
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
import scipy.stats as st
import scipy as sp
import math
from math import sqrt
import numpy as np
import pandas as pd
import pingouin as pg
from IPython.core.interactiveshell import InteractiveShell
# --- Notebook display configuration ------------------------------------------
# Echo every expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): 'seaborn-deep' was renamed 'seaborn-v0_8-deep' in
# matplotlib >= 3.6 — confirm the installed version still accepts this name.
plt.style.use('seaborn-deep')
%matplotlib inline
#from jupyterthemes import jtplot
# jtplot.style()
# Global matplotlib defaults shared by every plot of the notebook.
params = {
    'legend.fontsize': 'large',
    'legend.title_fontsize': 'large',
    'figure.figsize': (18, 6),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'xtick.major.pad': 12,
    'ytick.labelsize': 'x-large',
    'ytick.major.pad': 12,
    'lines.linewidth': 2,
    'savefig.dpi': 300
}
pylab.rcParams.update(params)
# Silence the pandas SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Qualitative (categorical) palettes, 8 colors each — ColorBrewer-style.
color1 = ['#7fc97f', '#beaed4', '#fdc086', '#ffff99',
          '#386cb0', '#f0027f', '#bf5b17', '#666666']
color2 = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a',
          '#66a61e', '#e6ab02', '#a6761d', '#666666']
color3 = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c',
          '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00']
color4 = ['#fbb4ae', '#b3cde3', '#ccebc5', '#decbe4',
          '#fed9a6', '#ffffcc', '#e5d8bd', '#fddaec']
color5 = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3',
          '#a6d854', '#ffd92f', '#e5c494', '#b3b3b3']
color6 = ['#b3e2cd', '#fdcdac', '#cbd5e8', '#f4cae4',
          '#e6f5c9', '#fff2ae', '#f1e2cc', '#cccccc']
color7 = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3',
          '#ff7f00', '#ffff33', '#a65628', '#f781bf']
color8 = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072',
          '#80b1d3', '#fdb462', '#b3de69', '#fccde5']
# Short palette used by the Lorenz-curve helper (indexed per curve).
color2l = ['red', 'green', '#78a6cd', '#ccebc5', '#decbe4']
# Sequential palettes (light -> dark), for ordered quantities.
color_seq1 = ['#f7fcfd', '#e5f5f9', '#ccece6', '#99d8c9',
              '#66c2a4', '#41ae76', '#238b45', '#005824']
color_seq2 = ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84',
              '#fc8d59', '#ef6548', '#d7301f', '#990000']
color_seq3 = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1',
              '#6baed6', '#4292c6', '#2171b5', '#084594']
color_seq4 = ['#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b',
              '#74c476', '#41ab5d', '#238b45', '#005a32']
color_seq5 = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272',
              '#fb6a4a', '#ef3b2c', '#cb181d', '#99000d']
color_seq6 = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b',
              '#fd8d3c', '#f16913', '#d94801', '#8c2d04']
color_seq7 = ['#ffffd9', '#edf8b1', '#c7e9b4', '#7fcdbb',
              '#41b6c4', '#1d91c0', '#225ea8', '#0c2c84']
color_seq8 = ['#fff7fb', '#ece2f0', '#d0d1e6', '#a6bddb',
              '#67a9cf', '#3690c0', '#02818a', '#016450']
color_seq9 = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd',
              '#969696', '#737373', '#525252', '#252525']
# Diverging palettes, for values spreading around a midpoint.
color_div1 = ['#8c510a', '#bf812d', '#dfc27d', '#f6e8c3',
              '#c7eae5', '#80cdc1', '#35978f', '#01665e']
color_div2 = ['#c51b7d', '#de77ae', '#f1b6da', '#fde0ef',
              '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221']
color_div3 = ['#762a83', '#9970ab', '#c2a5cf', '#e7d4e8',
              '#d9f0d3', '#a6dba0', '#5aae61', '#1b7837']
color_div4 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#d1e5f0', '#92c5de', '#4393c3', '#2166ac']
color_div5 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#e0e0e0', '#bababa', '#878787', '#4d4d4d']
color_div6 = ['#d73027', '#f46d43', '#fdae61', '#fee090',
              '#e0f3f8', '#abd9e9', '#74add1', '#4575b4']
color_div7 = ['#d73027', '#f46d43', '#fdae61', '#fee08b',
              '#d9ef8b', '#a6d96a', '#66bd63', '#1a9850']
color_div8 = ['#d53e4f', '#f46d43', '#fdae61', '#fee08b',
              '#e6f598', '#abdda4', '#66c2a5', '#3288bd']
def draw_bar_single(var_x, var_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """Draw a single bar chart on the current matplotlib axes.

    Parameters
    ----------
    var_x, var_y : sequences of bar positions and heights.
    label_x, label_y, titre : axis labels and plot title.
    base_mtick : spacing of the x-axis major ticks.
    xdate_available : set True to slant the x tick labels (date axes).
    legend_out : place the legend outside the axes, to the right.
    """
    x = var_x
    y = var_y
    # Fix: the original referenced the globals `ax` and `fig`, which raised a
    # NameError unless the caller happened to define them; use the current
    # axes/figure instead (backward compatible with the notebook usage).
    ax = plt.gca()
    fig = plt.gcf()
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.bar(x, y, align='center', color=col_bar, alpha=alpha_bar)
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
###########################
def draw_plot(var_y, var_x, label_x, label_y, titre, nb=1, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """Draw a line plot on the current matplotlib axes.

    Set xdate_available=True to slant the x tick labels (date axes).
    NOTE(review): `nb` is unused; kept for backward compatibility.
    NOTE(review): the data is plotted as ax.plot(var_y, var_x), i.e. var_y
    on the x axis — preserved as-is, confirm this argument order is intended.
    """
    y = var_y
    x = var_x
    # Fix: `ax` and `fig` were undefined globals (NameError unless the caller
    # defined them); use the current axes/figure instead.
    ax = plt.gca()
    fig = plt.gcf()
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    if legend_out:
        ax.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    ax.plot(y, x, color=col_bar, alpha=alpha_bar)
    ax.grid(linestyle='--', alpha=alpha_grid)
################################
def draw_bar_several(dfgroupby, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, g_width=0.8, g_kind='bar', g_color=color1, alpha_grid=0.3):
    """Draw a grouped (optionally stacked) bar chart from a groupby result.

    Parameters
    ----------
    dfgroupby : pandas Series with a 2-level index (result of a groupby);
        it is unstacked before plotting, one bar group per level-0 value.
    g_stacked_perc : normalize each level-0 group to percentages and format
        the y axis accordingly.
    legend / legend_pers : custom legend labels (used when legend_pers=True).
    """
    if g_stacked_perc:
        # Normalize each level-0 group to percentages before plotting.
        # Fix: the original assigned the returned Axes to `plt.plot`, which
        # clobbered matplotlib's plot() function for the rest of the session.
        axes = dfgroupby.groupby(level=0).apply(lambda x: 100 * x / x.sum()).unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
    else:
        axes = dfgroupby.unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )
    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        _ = plt.legend(legend, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    # Fix: `ax` was an undefined global; use the Axes returned by df.plot.
    axes.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
def draw_bar_pie(titre, labels_in, size_in, explode_in, titre_legend, z, a=1, b=1, c=1, color_titre='MidnightBlue', def_angle=90, titre_x=0.5, titre_y=1.05, title_linespacing=1, legend_available=True, pos_titre='center', size_titre=18, color_perc='white', size_perc=16, loc_legend="center left", bbox_legend=(1, 0, 0.5, 1), g_color=color5):
    """Draw a pie chart on subplot (a, b, c) of the current figure.

    Expected usage: create the figure first, e.g. fig = plt.figure(figsize=(18, 6)).
    NOTE(review): the `z` parameter is immediately overwritten by the subplot
    created below; it is kept in the signature for backward compatibility only.
    """
    labels = labels_in
    size = size_in
    explode = explode_in
    # Fix: `fig` was an undefined global (NameError unless the caller created
    # it at module level); use the current figure instead.
    fig = plt.gcf()
    z = fig.add_subplot(a, b, c)
    patches, texts, autotexts = z.pie(
        size, explode=explode, autopct='%1.1f%%', shadow=True, startangle=def_angle, colors=g_color)
    if legend_available:
        legend = z.legend(patches, labels,
                          title=titre_legend,
                          loc=loc_legend,
                          bbox_to_anchor=bbox_legend)
    # Equal aspect ratio ensures that pie is drawn as a circle.
    z.axis('equal')
    _ = plt.setp(autotexts, color=color_perc, size=size_perc, weight="bold")
    z.set_title(titre, color=color_titre, loc='center', y=titre_y, x=titre_x,
                linespacing=title_linespacing, fontdict={'size': size_titre, 'weight': 500})
def draw_scatter(df, col_df_x, col_df_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, color_scatter='teal', alpha_grid=0.3):
    """Scatter plot of df[col_df_y] against df[col_df_x].

    NOTE(review): g_stacked / g_stacked_perc are accepted for signature
    parity with draw_bar_several but are unused here.
    """
    # Renamed from `params` to avoid shadowing the module-level rcParams dict.
    fig_params = {
        'figure.figsize': (16, 6),
    }
    pylab.rcParams.update(fig_params)
    axes = df.plot(
        kind='scatter',
        x=col_df_x,
        y=col_df_y,
        color=color_scatter
    )
    if legend_out:
        plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    # Fix: `ax` was an undefined global; use the Axes returned by df.plot.
    axes.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
    # _=plt.show()
def draw_options(titre, label_x, label_y, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, alpha_grid=0.3):
    """Apply common title / label / legend / grid options to the current axes.

    legend_out : place the default legend outside the axes.
    legend_pers : use `legend` as explicit legend labels.
    """
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        # Fix: the original referenced `legend_p`, which does not exist —
        # the parameter is named `legend` (NameError when legend_pers=True).
        _ = plt.legend(legend, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
# this plots multiple seaborn histograms on different subplots
def plot_multiple_histograms(df, cols):
    """Plot one seaborn distribution histogram per column on a subplot grid.

    The grid is roughly square: ceil(sqrt(n)) columns, as many rows as needed.
    """
    num_plots = len(cols)
    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots / num_cols)
    # squeeze=False guarantees a 2-D array of axes, which removes the
    # original special-casing of 1-row and 1-column grids.
    fig, axs = plt.subplots(num_rows, num_cols, squeeze=False)
    for ind, col in enumerate(cols):
        i, j = divmod(ind, num_cols)
        # NOTE(review): sns.distplot is deprecated since seaborn 0.11;
        # switch to sns.histplot(df[col], kde=True, ax=axs[i, j]) on upgrade.
        sns.distplot(df[col], kde=True, ax=axs[i, j])
def gini(arr):
    """Return the Gini coefficient of a 1-D numeric array.

    Uses the rank-based formula G = (2/n) * sum(i * x_i) / sum(x) - (n+1)/n,
    where i is the 1-based rank; the array is expected sorted ascending
    (the caller, lorenz_f, sorts before calling).
    """
    n = arr.size
    ranks = np.arange(1, n + 1)
    rank_weighted_sum = (ranks * arr).sum()
    return (2 / n) * rank_weighted_sum / arr.sum() - (n + 1) / n
def lorenz_f(arrg, title, xlabel, ylabel, arrg2=None, arrg3=None, arrg4=None, nb=1, fs=8, title_lorenz=None):
    """Plot up to four Lorenz curves with their Gini coefficients.

    Parameters
    ----------
    arrg, arrg2, arrg3, arrg4 : value series to plot (assumes pandas
        Series — uses .sort_values(); TODO confirm callers never pass
        raw numpy arrays).
    nb : number of curves to draw (1 to 4).
    fs : side of the square figure, in inches.
    title_lorenz : optional list of per-curve labels.
    """
    # Hatch patterns distinguishing the filled areas when several curves overlap.
    patterns = ('X', '\\', '-', '+', 'O', '*', '.')
    fig = plt.figure(figsize=(fs, fs))
    # Perfect-equality reference diagonal (y = x).
    x1 = [0, 1]
    y1 = [0, 1]
    ax = fig.add_subplot(111)
    _ = ax.plot(x1, y1, color='black', alpha=0.7)
    _ = ax.set(xlim=(0, 1), ylim=(0, 1))
    # _=ax.axis('equal')
    _ = ax.tick_params(axis='both', which='major', labelsize=14)
    # Both axes show cumulative shares formatted as percentages.
    plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    _ = plt.ylabel(ylabel, color='gray', labelpad=10, fontdict={'size': 16})
    _ = plt.xlabel(xlabel, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(title, y=1.02, fontdict={'size': 20, 'weight': 500})
    # Collect the series to draw, according to nb.
    if nb == 2:
        arrge = [arrg, arrg2]
    if nb == 3:
        arrge = [arrg, arrg2, arrg3]
    if nb == 4:
        arrge = [arrg, arrg2, arrg3, arrg4]
    for i in range(nb):
        if nb == 1:
            arr = arrg
        else:
            arr = arrge[i]
        # Lorenz curve: cumulative share of the ascending-sorted values,
        # prefixed with 0 so the curve starts at the origin.
        arr = arr.sort_values()
        n = len(arr)
        lorenz = np.cumsum(arr) / arr.sum()
        lorenz = np.append([0], lorenz)
        xaxis = np.linspace(0, 1, n+1)
        if title_lorenz:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
        else:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz')
        if nb == 1:
            # Single curve: shade the concentration area between y=x and the
            # curve, plus the area under the curve.
            _ = ax.fill_between(xaxis, xaxis, lorenz, alpha=0.6, hatch="X",
                                color='teal', label='Surface de concentration')
            _ = ax.fill_between(np.linspace(0, 1, len(lorenz)),
                                lorenz, color='gray', alpha=0.3)
        else:
            _ = ax.fill_between(xaxis, xaxis, lorenz,
                                hatch=patterns[i], color=color2l[i], alpha=0.3)
        gini_value = gini(arr)
        if title_lorenz:
            # Stack the Gini annotations, one text line per curve.
            r = 0.7-(i/20)
            _ = ax.text(0.1, r, "Gini "+str(title_lorenz[i]) + " = "+str(
                round(gini_value, 2)), fontsize=16, color=color2l[i], weight='semibold')
        else:
            _ = ax.text(0.2, 0.6, "Gini = " + str(round(gini_value, 2)),
                        fontsize=18, color='darkred', weight='semibold')
        # NOTE(review): `patch` is re-created on every iteration, so each
        # plt.legend call below only holds the current curve's patch; the
        # ax.legend call after the loop rebuilds the legend from the labelled
        # artists anyway — confirm this is the intended behaviour.
        patch = []
        if nb == 1:
            a_patch = mpatches.Patch(
                color='teal', alpha=0.6, hatch="X", label='Surface de concentration')
            b_patch = mpatches.Patch(
                color=color2l[0], label='Courbe de Lorenz')
            _ = plt.legend(handles=[a_patch, b_patch])
        else:
            if title_lorenz:
                p = mpatches.Patch(
                    color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
                patch.append(p)
                _ = plt.legend(handles=patch)
            else:
                p = mpatches.Patch(color=color2l[i], label='Courbe de Lorenz')
                patch.append(p)
                _ = plt.legend(handles=patch)
    # Final legend styling (rebuilt from the labelled artists).
    _ = ax.legend(loc='upper left', bbox_to_anchor=(0.02, 0.98), fontsize=12,
                  frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
    _ = ax.grid(linestyle='--', alpha=0.4)
def eta_squared(x, y):
    """Correlation ratio η² between a categorical series x and a numeric series y.

    η² = SCE / SCT, where SCT is the total sum of squares of y and SCE is
    the between-class sum of squares (classes given by the values of x).
    """
    grand_mean = y.mean()
    # Total sum of squares around the grand mean.
    total_ss = sum((value - grand_mean) ** 2 for value in y)
    # Between-class sum of squares, one term per distinct value of x.
    between_ss = 0
    for level in x.unique():
        group = y[x == level]
        between_ss += len(group) * (group.mean() - grand_mean) ** 2
    return between_ss / total_ss
def linear_regression(x, y):
    """Ordinary least-squares fit of y = B0 + B1*x.

    Returns (intercept, slope, reg_line) where reg_line is a display string
    of the fitted equation with the slope rounded to 3 decimals.
    """
    x_bar = x.mean()
    y_bar = y.mean()
    # Slope: covariance term over variance term.
    slope = ((x - x_bar) * (y - y_bar)).sum() / ((x - x_bar) ** 2).sum()
    intercept = y_bar - slope * x_bar
    # Format the equation with an explicit sign and a positive magnitude.
    if slope < 0:
        sign, magnitude = '-', -slope
    else:
        sign, magnitude = '+', slope
    reg_line = 'y = {} {} {}β'.format(intercept, sign, round(magnitude, 3))
    return (intercept, slope, reg_line)
def corr_coef(x, y):
    """Pearson correlation coefficient of x and y, computed from raw sums."""
    n = len(x)
    covariance_term = n * (x * y).sum() - x.sum() * y.sum()
    variance_term = np.sqrt(
        (n * (x ** 2).sum() - x.sum() ** 2) * (n * (y ** 2).sum() - y.sum() ** 2))
    return covariance_term / variance_term
def linear_reg_aff(x, y):
    """Print the fitted regression line, R and R² for the series x and y.

    Returns None (the return value of print).
    """
    intercept, slope, reg_line = linear_regression(x, y)
    r = corr_coef(x, y)
    lines = [
        "Droite de régression : " + str(reg_line),
        "Coef. de corrélation R : " + str(r),
        "Coef. de détermination R² : " + str(r ** 2),
    ]
    return print("\n".join(lines))
def graph_droite_regression(dataset, x, y, text_x, text_y, titre, axes_x, axes_y,
                            size_col, sizes_taille, titre_leg="",
                            leg_x=0.1, leg_y=0.1, loc_leg='lower left', axe="ax1",
                            inter=np.arange(100), inter_droite=np.arange(100), size_title=20,
                            xlim_p=None, ylim_p=None, droite_reg=True):
    """Scatter plot of dataset[y] vs dataset[x], with an optional OLS line.

    Parameters
    ----------
    dataset : DataFrame; x, y : column names to regress/plot.
    text_x, text_y : position of the statistics text box.
    size_col, sizes_taille : seaborn size column and (min, max) point sizes.
    inter / inter_droite : x tick positions / x range of the fitted line.
    droite_reg : when True, draw the regression line and the stats box.
    NOTE(review): `axe` is unused (kept for backward compatibility) and the
    array default arguments are shared across calls but never mutated here.
    """
    x1 = dataset[x]
    y1 = dataset[y]
    B0, B1, reg_line = linear_regression(x1, y1)
    R = corr_coef(x1, y1)
    # Explicit sign and positive magnitude for the displayed equation.
    sign = '+'
    if B1 < 0:
        sign = '-'
        B3 = -B1
    else:
        B3 = B1
    # Fix: the original built dic_ax = {"ax1": ax1, "ax2": ax2} from globals
    # that raised NameError when undefined, then discarded the lookup by
    # immediately overwriting X with plt.gca(). Plot on the current axes.
    X = plt.gca()
    text = "Moyenne X : {}\nMoyenne Y : {}\nR : {}\nR^2 : {}\ny = {} {} {}X".format(
        round(x1.mean(), 2),
        round(y1.mean(), 2),
        round(R, 4),
        round(R**2, 4),
        round(B0, 3),
        sign,
        round(B3, 3))
    if droite_reg:
        _ = plt.text(x=text_x, y=text_y, s=text, fontsize=12, bbox={'facecolor': 'grey', 'edgecolor': 'black', 'boxstyle': 'round,pad=1', 'alpha': 0.2, 'pad': 10})
        _ = X.plot(inter_droite, [B0 + B1*x for x in inter_droite], c='r', linewidth=5, alpha=.5, solid_capstyle='round')
    _ = sns.scatterplot(data=dataset, x=x, y=y, marker="o", size=size_col, sizes=sizes_taille, ax=X)
    if xlim_p:
        _ = plt.xlim(xlim_p)
    if ylim_p:
        _ = plt.ylim(ylim_p)
    _ = plt.xticks(inter)
    _ = plt.xlabel(axes_x, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.ylabel(axes_y, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(titre, y=1.02, fontdict={'size': size_title, 'weight': 500})
    _ = plt.legend(title=titre_leg, loc=loc_leg, bbox_to_anchor=(leg_x, leg_y), frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=1, fontsize=12)
# Load the products reference table and peek at a few random rows.
produits = pd.read_csv("data/dataset_P4/products.csv")
produits.sample(5)
id_prod | price | categ | |
---|---|---|---|
2944 | 1_319 | 43.99 | 1 |
2153 | 0_1250 | 5.17 | 0 |
704 | 0_2277 | 10.99 | 0 |
1129 | 0_2057 | 8.99 | 0 |
2516 | 0_1331 | 29.99 | 0 |
produits.info()
produits.describe(include="all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3287 entries, 0 to 3286 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 3287 non-null object 1 price 3287 non-null float64 2 categ 3287 non-null int64 dtypes: float64(1), int64(1), object(1) memory usage: 77.2+ KB
id_prod | price | categ | |
---|---|---|---|
count | 3287 | 3287.000000 | 3287.000000 |
unique | 3287 | NaN | NaN |
top | 0_686 | NaN | NaN |
freq | 1 | NaN | NaN |
mean | NaN | 21.856641 | 0.370246 |
std | NaN | 29.847908 | 0.615387 |
min | NaN | -1.000000 | 0.000000 |
25% | NaN | 6.990000 | 0.000000 |
50% | NaN | 13.060000 | 0.000000 |
75% | NaN | 22.990000 | 1.000000 |
max | NaN | 300.000000 | 2.000000 |
On constate plusieurs éléments :
id_prod
products
categ
price
produits.isna().sum()
produits.duplicated().sum()
id_prod 0 price 0 categ 0 dtype: int64
0
Il n'y a aucune valeur nulle et aucune ligne en double dans le df produits
produits.categ.value_counts().nunique()
produits.categ.value_counts()
3
0 2309 1 739 2 239 Name: categ, dtype: int64
On a bien 3 catégories différentes.
On détermine leur part relative
# Absolute count and relative share (in %) of each of the 3 product categories.
print(f'Nombre total de produits : {produits.categ.value_counts().sum()}')
for i in range(3):
    part_cat = (produits.categ.value_counts()[i] / produits.shape[0])*100
    part_cat = round(part_cat, 2)
    print(
        f'Catégorie {i} : {produits.categ.value_counts()[i]} produits, soit {part_cat}% de l\'ensemble des produits')
Nombre total de produits : 3287 Catégorie 0 : 2309 produits, soit 70.25% de l'ensemble des produits Catégorie 1 : 739 produits, soit 22.48% de l'ensemble des produits Catégorie 2 : 239 produits, soit 7.27% de l'ensemble des produits
On cherche les produits qui ont un prix inférieur ou égal à 0
produits[produits['price'] <= 0]
id_prod | price | categ | |
---|---|---|---|
731 | T_0 | -1.0 | 0 |
Il n'y a qu'un seul produit avec un prix négatif
Notons que l'id_prod de ce produit (T_0) ne semble pas avoir le même format que les autres produits
clients = pd.read_csv("data/dataset_P4/customers.csv")
clients.sample(5)
client_id | sex | birth | |
---|---|---|---|
6121 | c_3388 | m | 1989 |
4376 | c_1631 | f | 1981 |
8263 | c_2046 | f | 1969 |
5513 | c_3232 | m | 1971 |
3027 | c_8269 | f | 1969 |
clients.info()
clients.describe(include="all")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8623 entries, 0 to 8622 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 client_id 8623 non-null object 1 sex 8623 non-null object 2 birth 8623 non-null int64 dtypes: int64(1), object(2) memory usage: 202.2+ KB
client_id | sex | birth | |
---|---|---|---|
count | 8623 | 8623 | 8623.000000 |
unique | 8623 | 2 | NaN |
top | c_460 | f | NaN |
freq | 1 | 4491 | NaN |
mean | NaN | NaN | 1978.280877 |
std | NaN | NaN | 16.919535 |
min | NaN | NaN | 1929.000000 |
25% | NaN | NaN | 1966.000000 |
50% | NaN | NaN | 1979.000000 |
75% | NaN | NaN | 1992.000000 |
max | NaN | NaN | 2004.000000 |
On constate que :
client_id
a une valeur unique pour chaque client : clé primaire de la table customers
sex
clients.isna().sum()
clients.duplicated().sum()
client_id 0 sex 0 birth 0 dtype: int64
0
Il n'y a aucune valeur nulle et aucune ligne en double dans le df clients
# Number of male / female clients and their relative shares (in %).
nb_h = clients.sex.value_counts()['m']
nb_f = clients.sex.value_counts()['f']
part_genre = (clients.groupby('sex')['sex'].count() / clients.shape[0])*100
part_genre = round(part_genre, 2)
part_h = part_genre['m']
part_f = part_genre['f']
print(
    f'Nombre de clients enregistrés : {clients.client_id.value_counts().sum()}')
print(f'Nombre de clients Homme : {nb_h}, soit {part_h}%')
print(f'Nombre de clientes Femme : {nb_f}, soit {part_f}%')
Nombre de clients enregistrés : 8623 Nombre de clients Homme : 4132, soit 47.92% Nombre de clientes Femme : 4491, soit 52.08%
print(sorted(clients.birth.unique()))
[1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004]
birthyear = clients.birth.value_counts()
birthyear
2004 440 1979 215 1988 207 1986 203 1978 199 ... 1935 8 1932 6 1930 4 1931 4 1929 3 Name: birth, Length: 76, dtype: int64
birthyear.count()
birthyear.nlargest()
birthyear.nsmallest()
76
2004 440 1979 215 1988 207 1986 203 1978 199 Name: birth, dtype: int64
1929 3 1930 4 1931 4 1932 6 1933 8 Name: birth, dtype: int64
Il semble que l'année 2004 soit sur-représentée
On visualise avec un graph
_ = sns.displot(clients['birth'], kde=False, bins=150)
ventes = pd.read_csv("data/dataset_P4/transactions.csv")
ventes.sample(4)
id_prod | date | session_id | client_id | |
---|---|---|---|---|
258609 | 0_1604 | 2021-12-23 01:31:54.167449 | s_138520 | c_6334 |
29820 | 0_1282 | 2021-05-11 11:38:10.968569 | s_32993 | c_1221 |
50844 | 0_1604 | 2021-08-29 08:41:08.081023 | s_82151 | c_4101 |
121093 | 0_1409 | 2021-05-20 12:25:47.887476 | s_37069 | c_3474 |
ventes.info()
ventes.describe(include="all", datetime_is_numeric=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 337016 entries, 0 to 337015 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 337016 non-null object 1 date 337016 non-null object 2 session_id 337016 non-null object 3 client_id 337016 non-null object dtypes: object(4) memory usage: 10.3+ MB
id_prod | date | session_id | client_id | |
---|---|---|---|---|
count | 337016 | 337016 | 337016 | 337016 |
unique | 3266 | 336855 | 169195 | 8602 |
top | 1_369 | test_2021-03-01 02:30:02.237413 | s_0 | c_1609 |
freq | 1081 | 13 | 200 | 12855 |
Plusieurs constats :
session_id
correspond un id_prod
)session_id
n'est pas la clé primaire de la table transactions car il y a 169195 valeurs uniques pour un total de 337016 valeurstransactions
sont bien présents dans la table customers
)products
transactions
sont bien présents dans la table products
)test_
session_id
associé est s_0 => on regardera à quoi correspondent ces lignes

ventes.isna().sum()
ventes.duplicated().sum()
id_prod 0 date 0 session_id 0 client_id 0 dtype: int64
126
On constate qu'il n'y a pas de valeurs nulles mais qu'il y a des doublons.
On regarde de plus près ces doublons
ventes[ventes.duplicated()].head(3)
id_prod | date | session_id | client_id | |
---|---|---|---|---|
34387 | T_0 | test_2021-03-01 02:30:02.237443 | s_0 | ct_0 |
54813 | T_0 | test_2021-03-01 02:30:02.237412 | s_0 | ct_1 |
57261 | T_0 | test_2021-03-01 02:30:02.237439 | s_0 | ct_1 |
Il semble que ces doublons concernent les cas particuliers relevés précédemment :
On poursuit l'analyse de ces doublons
ventes[ventes.duplicated()].nunique()
id_prod 1 date 36 session_id 1 client_id 2 dtype: int64
ventes[ventes.duplicated()].date.unique()
ventes[ventes.duplicated()].client_id.unique()
ventes[ventes.duplicated()].id_prod.unique()
array(['test_2021-03-01 02:30:02.237443', 'test_2021-03-01 02:30:02.237412', 'test_2021-03-01 02:30:02.237439', 'test_2021-03-01 02:30:02.237429', 'test_2021-03-01 02:30:02.237446', 'test_2021-03-01 02:30:02.237424', 'test_2021-03-01 02:30:02.237442', 'test_2021-03-01 02:30:02.237414', 'test_2021-03-01 02:30:02.237430', 'test_2021-03-01 02:30:02.237436', 'test_2021-03-01 02:30:02.237413', 'test_2021-03-01 02:30:02.237418', 'test_2021-03-01 02:30:02.237427', 'test_2021-03-01 02:30:02.237432', 'test_2021-03-01 02:30:02.237425', 'test_2021-03-01 02:30:02.237435', 'test_2021-03-01 02:30:02.237426', 'test_2021-03-01 02:30:02.237428', 'test_2021-03-01 02:30:02.237447', 'test_2021-03-01 02:30:02.237419', 'test_2021-03-01 02:30:02.237433', 'test_2021-03-01 02:30:02.237440', 'test_2021-03-01 02:30:02.237437', 'test_2021-03-01 02:30:02.237441', 'test_2021-03-01 02:30:02.237434', 'test_2021-03-01 02:30:02.237449', 'test_2021-03-01 02:30:02.237444', 'test_2021-03-01 02:30:02.237417', 'test_2021-03-01 02:30:02.237445', 'test_2021-03-01 02:30:02.237415', 'test_2021-03-01 02:30:02.237416', 'test_2021-03-01 02:30:02.237420', 'test_2021-03-01 02:30:02.237431', 'test_2021-03-01 02:30:02.237423', 'test_2021-03-01 02:30:02.237438', 'test_2021-03-01 02:30:02.237421'], dtype=object)
array(['ct_0', 'ct_1'], dtype=object)
array(['T_0'], dtype=object)
On regarde dans l'ensemble de la table transactions
les éléments qui concernent la session_id 's_0' pour vérifier qu'ils ne sont pas présents ailleurs
test = ventes[ventes['session_id'] == 's_0']
test
id_prod | date | session_id | client_id | |
---|---|---|---|---|
1431 | T_0 | test_2021-03-01 02:30:02.237420 | s_0 | ct_1 |
2365 | T_0 | test_2021-03-01 02:30:02.237446 | s_0 | ct_1 |
2895 | T_0 | test_2021-03-01 02:30:02.237414 | s_0 | ct_1 |
5955 | T_0 | test_2021-03-01 02:30:02.237441 | s_0 | ct_0 |
7283 | T_0 | test_2021-03-01 02:30:02.237434 | s_0 | ct_1 |
... | ... | ... | ... | ... |
332594 | T_0 | test_2021-03-01 02:30:02.237445 | s_0 | ct_0 |
332705 | T_0 | test_2021-03-01 02:30:02.237423 | s_0 | ct_1 |
332730 | T_0 | test_2021-03-01 02:30:02.237421 | s_0 | ct_1 |
333442 | T_0 | test_2021-03-01 02:30:02.237431 | s_0 | ct_1 |
335279 | T_0 | test_2021-03-01 02:30:02.237430 | s_0 | ct_0 |
200 rows × 4 columns
test.nunique()
id_prod 1 date 39 session_id 1 client_id 2 dtype: int64
test[test.duplicated()].client_id.unique()
test[test.duplicated()].id_prod.unique()
array(['ct_0', 'ct_1'], dtype=object)
array(['T_0'], dtype=object)
On retrouve bien les mêmes éléments
Ainsi, les éléments de test (qui ne correspondent pas à des ventes réelles) sont limités aux sessions de test et ne se retrouvent pas ailleurs. On va donc pouvoir les supprimer
Les éléments à supprimer sont donc les suivants :
produits
--> le produit de test T_0clients
---> les utilisateurs test ct_0 et ct_1ventes
-----> les dates qui commencent par 'test', c'est-à-dire celles qui correspondent aux sessions s_0

clients[(clients['client_id'] == 'ct_0') | (clients['client_id'] == 'ct_1')]
produits[produits['id_prod'] == 'T_0']
client_id | sex | birth | |
---|---|---|---|
2735 | ct_0 | f | 2001 |
8494 | ct_1 | m | 2001 |
id_prod | price | categ | |
---|---|---|---|
731 | T_0 | -1.0 | 0 |
# Remove the test product 'T_0' from the produits dataframe.
produits.drop(produits.loc[produits['id_prod'] == 'T_0'].index, inplace=True)
# Check that 'T_0' is no longer present in the produits dataframe.
produits[produits['id_prod'] == 'T_0']
# Remove the test users 'ct_0' and 'ct_1' from the clients dataframe.
clients.drop(clients.loc[(clients['client_id'] == 'ct_0') | (
    clients['client_id'] == 'ct_1')].index, inplace=True)
# Check that 'ct_0' and 'ct_1' are no longer present in the clients dataframe.
clients[(clients['client_id'] == 'ct_0') | (clients['client_id'] == 'ct_1')]
# Remove the rows whose 'session_id' is 's_0' (test sessions) from the ventes dataframe.
ventes.drop(ventes.loc[ventes['session_id'] == 's_0'].index, inplace=True)
# Check that 's_0' is no longer present in the ventes dataframe.
ventes.loc[ventes['session_id'] == 's_0']
id_prod | price | categ |
---|
client_id | sex | birth |
---|
id_prod | date | session_id | client_id |
---|
.to_datetime
Remarque : on utilisera l'argument 'errors' avec pour valeur 'coerce'
Ainsi, si la date passée à la fonction .to_datetime
ne correspond pas au format spécifié, alors celle-ci n'est pas considérée comme une date et prend une valeur nulle (NaT : Not a Time)
# Parse the raw date strings; errors='coerce' turns any value that does not
# match the format into NaT instead of raising.
ventes['date'] = pd.to_datetime(ventes['date'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
ventes.dtypes
id_prod object date datetime64[ns] session_id object client_id object dtype: object
ventes.isna().sum()
id_prod 0 date 0 session_id 0 client_id 0 dtype: int64
Il n'y a pas de valeurs nulles (normal, car on a supprimé au préalable les sessions de test)
session_date
session_time
import locale
# French locale so day names (dt.day_name) come out in French.
_ = locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')
# print(locale.getlocale(locale.LC_TIME))
# Derive calendar features from the parsed 'date' column, on a working copy.
ventes_temp = ventes.copy()
ventes_temp['session_year'] = ventes_temp['date'].dt.year
ventes_temp['session_month'] = ventes_temp['date'].dt.month
# ISO week number ('%V'), converted from string to int.
ventes_temp['session_week'] = (ventes_temp['date'].dt.strftime('%V')).astype('int')
ventes_temp['session_day_name'] = ventes_temp['date'].dt.day_name('fr_FR')
ventes_temp['session_day'] = ventes_temp['date'].dt.day
ventes_temp['session_hour'] = ventes_temp['date'].dt.hour
ventes_temp['session_minute'] = ventes_temp['date'].dt.minute
ventes_temp.dtypes
# Midnight-truncated timestamp, to group by calendar day.
ventes_temp['session_date'] = ventes_temp['date'].dt.normalize()
ventes = ventes_temp.copy()
ventes.sample(3)
id_prod object date datetime64[ns] session_id object client_id object session_year int64 session_month int64 session_week int32 session_day_name object session_day int64 session_hour int64 session_minute int64 dtype: object
id_prod | date | session_id | client_id | session_year | session_month | session_week | session_day_name | session_day | session_hour | session_minute | session_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
329447 | 0_1328 | 2021-10-10 02:28:23.753941 | s_102544 | c_8576 | 2021 | 10 | 40 | Dimanche | 10 | 2 | 28 | 2021-10-10 |
193220 | 0_2127 | 2021-04-13 23:00:26.347021 | s_20201 | c_56 | 2021 | 4 | 15 | Mardi | 13 | 23 | 0 | 2021-04-13 |
160650 | 0_1464 | 2022-01-01 05:31:53.251212 | s_143229 | c_1939 | 2022 | 1 | 52 | Samedi | 1 | 5 | 31 | 2022-01-01 |
ventes.groupby('session_year')['session_month'].nunique()
session_year 2021 10 2022 2 Name: session_month, dtype: int64
Les données s'étalent bien de début mars 2021 (10 mois en 2021) à fin février 2022 (2 mois en 2022)
ventes.groupby('session_month')['session_day'].nunique()
session_month 1 31 2 28 3 31 4 30 5 31 6 30 7 31 8 31 9 30 10 31 11 30 12 31 Name: session_day, dtype: int64
Le nombre de jours pour les différents mois de l'année calendaire est correct.
# Fix: two statements were fused onto one line in the original cell,
# which is a syntax error. Keep only the useful columns, then sample.
ventes = ventes_temp[['id_prod', 'session_id', 'client_id', 'date', 'session_date']]
ventes.sample(4)
ventes['session_date'].min()
ventes['session_date'].max()
Timestamp('2021-03-01 00:00:00')
Timestamp('2022-02-28 00:00:00')
On constate qu'elle s'étale sur un an
Remarque : on va garder les années 2021 et 2022 (bien que n'étant qu'en 2021)
from datetime import datetime
# Reference date used to derive client ages from their birth year.
aujdhui = datetime.now()


def age_client(df):
    """Add an 'age' column to df, as current year minus birth year (in place)."""
    df['age'] = aujdhui.year - df['birth']


# Fix: the original called clients.apply(lambda x: age_client(clients)),
# which re-ran the same whole-frame mutation once per column and echoed a
# column of None values; a single direct call is sufficient.
age_client(clients)
client_id None sex None birth None dtype: object
clients.sample(3)
client_id | sex | birth | age | |
---|---|---|---|---|
1647 | c_6102 | f | 1983 | 38 |
3905 | c_2296 | f | 1987 | 34 |
2521 | c_2349 | f | 1986 | 35 |
clients.age.value_counts()
17 440 42 215 33 207 35 203 43 199 ... 88 8 89 6 90 4 91 4 92 3 Name: age, Length: 76, dtype: int64
# Average age overall, then broken down by sex.
hommes = clients.loc[clients['sex'] == 'm']
femmes = clients.loc[clients['sex'] == 'f']
print(f'L\'âge moyen des clients est de {int(clients.age.mean())} ans, '
      f'{int(hommes.age.mean())} pour les hommes et {int(femmes.age.mean())} pour les femmes')
L'âge moyen des clients est de 42 ans, 42 pour les hommes et 43 pour les femmes
# Client counts per age, one column per sex, plus a per-age total.
rep_age = (
    clients.groupby(['age', 'sex'])['client_id']
    .count()
    .unstack()
    .reset_index()
)
rep_age['total'] = rep_age['f'] + rep_age['m']
rep_age.columns = ['age', 'f', 'm', 'total']
rep_age
age | f | m | total | |
---|---|---|---|---|
0 | 17 | 232 | 208 | 440 |
1 | 18 | 65 | 81 | 146 |
2 | 19 | 66 | 80 | 146 |
3 | 20 | 71 | 56 | 127 |
4 | 21 | 78 | 58 | 136 |
... | ... | ... | ... | ... |
71 | 88 | 5 | 3 | 8 |
72 | 89 | 5 | 1 | 6 |
73 | 90 | 2 | 2 | 4 |
74 | 91 | 3 | 1 | 4 |
75 | 92 | 1 | 2 | 3 |
76 rows × 4 columns
# Bar chart of the number of clients per age.
x = rep_age['age']
y = rep_age['total']
# NOTE(review): 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid'
# in matplotlib >= 3.6 — confirm the pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
_ = plt.figure(figsize=(16, 6))
_ = plt.xlim(15, 95)
_ = plt.xlabel('Age', color='gray', fontdict={'size': 16})
_ = plt.ylabel('Nombre de clients', color='gray', fontdict={'size': 16})
_ = plt.title('Répartition des clients par âge', color='MidnightBlue',
              loc='right', y=0.9, x=0.98, fontdict={'size': 20})
_ = plt.bar(x, y, align='center', color='DarkOrange', alpha=0.7)
plt.grid(linestyle='--', alpha=0.3)
plt.show()
On constate une valeur atypique pour la valeur age = 17
On peut faire l'hypothèse que lors de l'inscription sur le site de vente en ligne, par défaut, l'âge est fixé à 18 ans (pour être sûr que ceux qui s'inscrivent sont bien majeurs)
Si tel est le cas, il est probable qu'un certain nombre de personnes laisse cette valeur telle qu'elle, bien qu'elle ne corresponde pas à leur âge
=> On va donc garder cette distribution
# Quick look at the products table.
produits.sample(3)
id_prod | price | categ | |
---|---|---|---|
1129 | 0_2057 | 8.99 | 0 |
1816 | 0_974 | 11.75 | 0 |
2539 | 0_309 | 3.99 | 0 |
# Distribution of product prices.
produits['price'].describe()
nb_px = produits[['price']].value_counts()
nb_px = nb_px.reset_index()
nb_px.columns = ['prix', 'quantite']
x = nb_px.sort_values(by='prix', ascending=False)
# Count products priced above 50 EUR. A boolean sum is direct and, unlike
# the original `.loc[...].value_counts().sum()`, does not silently drop
# rows containing NaN before counting.
(produits['price'] > 50).sum()
count 3286.000000 mean 21.863597 std 29.849786 min 0.620000 25% 6.990000 50% 13.075000 75% 22.990000 max 300.000000 Name: price, dtype: float64
273
Moins de 10% des prix dépassent les 50€
# Boxplot of prices; highlights the expensive outliers seen in describe().
ax = sns.boxplot(x='price', data=produits)
# Number of distinct products sold at each price point.
rep_prix = produits.groupby('price')['id_prod'].nunique().reset_index()
rep_prix.columns = ['prix', 'nombre']
rep_prix.head(2)
# Sanity check: the counts must add up to the total number of products.
rep_prix.nombre.sum()
prix | nombre | |
---|---|---|
0 | 0.62 | 2 |
1 | 0.66 | 1 |
3286
# Scatter plot: number of products per price point.
fig, ax = plt.subplots(figsize=(12, 4))
# NOTE(review): style.use after the figure is created only affects artists
# drawn afterwards — consider moving it before plt.subplots.
plt.style.use('seaborn-whitegrid')
rep_prix.plot(x='prix', y='nombre', kind='scatter', ax=ax)
<AxesSubplot:xlabel='prix', ylabel='nombre'>
# Price statistics broken down by product category.
produits.groupby('categ').describe()
price | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
categ | ||||||||
0 | 2308.0 | 11.732795 | 7.565755 | 0.62 | 5.5875 | 10.32 | 16.655 | 40.99 |
1 | 739.0 | 25.531421 | 15.425162 | 2.00 | 13.3900 | 22.99 | 33.990 | 80.99 |
2 | 239.0 | 108.354686 | 49.561431 | 30.99 | 71.0650 | 101.99 | 136.530 | 300.00 |
# Last look at the three tables before merging.
produits.sample(3)
clients.sample(3)
# Shorten the sales column names (s_ prefix = session).
# NOTE(review): this expects the full 12-column `ventes` (the ventes_temp
# copy), not the 5-column selection made in an earlier cell — confirm which
# cell was executed last.
ventes.columns = ['id_prod', 'date', 's_id', 'client_id', 's_year',
                  's_month', 's_week', 's_day_name', 's_day',
                  's_hour', 's_minute', 's_date']
ventes.sample(3)
id_prod | price | categ | |
---|---|---|---|
2744 | 1_123 | 2.99 | 1 |
44 | 0_1317 | 4.99 | 0 |
3162 | 1_713 | 33.99 | 1 |
client_id | sex | birth | age | |
---|---|---|---|---|
4218 | c_1051 | f | 1968 | 53 |
3890 | c_3508 | m | 1972 | 49 |
2750 | c_5004 | m | 1954 | 67 |
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
254401 | 0_1476 | 2022-02-16 11:45:26.278797 | s_166060 | c_3913 | 2022 | 2 | 7 | Mercredi | 16 | 11 | 45 | 2022-02-16 |
94718 | 1_445 | 2021-09-05 10:38:45.648721 | s_85351 | c_3536 | 2021 | 9 | 35 | Dimanche | 5 | 10 | 38 | 2021-09-05 |
114954 | 0_2251 | 2021-03-08 15:42:49.593020 | s_3527 | c_2815 | 2021 | 3 | 10 | Lundi | 8 | 15 | 42 | 2021-03-08 |
On effectue des jointures de type `outer` pour garder tous les éléments :
on pourra ainsi vérifier que tous les produits de la table `transactions`
sont présents dans la table `products`,
et que tous les clients de la table `transactions`
sont présents dans la table `clients`.
# Outer joins keep unmatched rows on both sides, so we can detect both
# products that were never sold and clients who never ordered.
# The join keys are made explicit (the original merged on all shared
# columns, which happened to be these — explicit keys guard against an
# accidentally shared column name changing the join).
general = pd.merge(ventes, produits, how='outer', on='id_prod')
general = pd.merge(general, clients, how='outer', on='client_id')
general.sort_values('s_id')
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
272113 | 0_1259 | 2021-03-01 00:01:07.843138 | s_1 | c_329 | 2021.0 | 3.0 | 9.0 | Lundi | 1.0 | 0.0 | 1.0 | 2021-03-01 | 11.99 | 0.0 | f | 1967.0 | 54.0 |
212254 | 1_635 | 2021-03-01 00:10:33.163037 | s_10 | c_2218 | 2021.0 | 3.0 | 9.0 | Lundi | 1.0 | 0.0 | 10.0 | 2021-03-01 | 26.99 | 1.0 | f | 1970.0 | 51.0 |
24502 | 0_1451 | 2021-03-01 04:43:58.025677 | s_100 | c_3854 | 2021.0 | 3.0 | 9.0 | Lundi | 1.0 | 4.0 | 43.0 | 2021-03-01 | 19.99 | 0.0 | f | 1978.0 | 43.0 |
24482 | 0_1030 | 2021-03-01 04:12:43.572994 | s_100 | c_3854 | 2021.0 | 3.0 | 9.0 | Lundi | 1.0 | 4.0 | 12.0 | 2021-03-01 | 13.73 | 0.0 | f | 1978.0 | 43.0 |
275592 | 0_1625 | 2021-03-03 02:38:09.568389 | s_1000 | c_1014 | 2021.0 | 3.0 | 9.0 | Mercredi | 3.0 | 2.0 | 38.0 | 2021-03-03 | 5.99 | 0.0 | m | 1989.0 | 32.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
336854 | NaN | NaT | NaN | c_862 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | f | 1956.0 | 65.0 |
336855 | NaN | NaT | NaN | c_7584 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | f | 1960.0 | 61.0 |
336856 | NaN | NaT | NaN | c_90 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | m | 2001.0 | 20.0 |
336857 | NaN | NaT | NaN | c_587 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | m | 1993.0 | 28.0 |
336858 | NaN | NaT | NaN | c_3526 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | m | 1956.0 | 65.0 |
336859 rows × 17 columns
general.isna().sum()
id_prod 21 date 43 s_id 43 client_id 22 s_year 43 s_month 43 s_week 43 s_day_name 43 s_day 43 s_hour 43 s_minute 43 s_date 43 price 124 categ 124 sex 22 birth 22 age 22 dtype: int64
On va commencer par les valeurs null provenant du df 'produits' : price et categ
-> on identifie ces valeurs
# Rows where price is missing: sales of a product absent from the products table.
general[general['price'].isna()].sample(5)
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
224627 | 0_2245 | 2022-01-04 19:02:47.712837 | s_144966 | c_8595 | 2022.0 | 1.0 | 1.0 | Mardi | 4.0 | 19.0 | 2.0 | 2022-01-04 | NaN | NaN | f | 1980.0 | 41.0 |
27016 | 0_2245 | 2021-11-01 14:00:26.535213 | s_113219 | c_7810 | 2021.0 | 11.0 | 44.0 | Lundi | 1.0 | 14.0 | 0.0 | 2021-11-01 | NaN | NaN | m | 1978.0 | 43.0 |
283133 | 0_2245 | 2021-04-18 08:28:11.501497 | s_22209 | c_3496 | 2021.0 | 4.0 | 15.0 | Dimanche | 18.0 | 8.0 | 28.0 | 2021-04-18 | NaN | NaN | f | 1957.0 | 64.0 |
42159 | 0_2245 | 2021-05-06 20:14:22.522372 | s_30824 | c_4973 | 2021.0 | 5.0 | 18.0 | Jeudi | 6.0 | 20.0 | 14.0 | 2021-05-06 | NaN | NaN | f | 1983.0 | 38.0 |
295329 | 0_2245 | 2021-05-01 03:35:03.146305 | s_28235 | c_5714 | 2021.0 | 5.0 | 17.0 | Samedi | 1.0 | 3.0 | 35.0 | 2021-05-01 | NaN | NaN | f | 1972.0 | 49.0 |
# Structure and null counts of the price-less rows.
general[general['price'].isna()].info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 124 entries, 18202 to 336858 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 103 non-null object 1 date 103 non-null datetime64[ns] 2 s_id 103 non-null object 3 client_id 124 non-null object 4 s_year 103 non-null float64 5 s_month 103 non-null float64 6 s_week 103 non-null float64 7 s_day_name 103 non-null object 8 s_day 103 non-null float64 9 s_hour 103 non-null float64 10 s_minute 103 non-null float64 11 s_date 103 non-null datetime64[ns] 12 price 0 non-null float64 13 categ 0 non-null float64 14 sex 124 non-null object 15 birth 124 non-null float64 16 age 124 non-null float64 dtypes: datetime64[ns](2), float64(10), object(5) memory usage: 17.4+ KB
# Distinct values per column among the price-less rows (id_prod == 1 below
# shows a single product is responsible).
general[general['price'].isna()].nunique()
id_prod 1 date 103 s_id 103 client_id 121 s_year 2 s_month 12 s_week 45 s_day_name 7 s_day 30 s_hour 24 s_minute 47 s_date 90 price 0 categ 0 sex 2 birth 46 age 46 dtype: int64
On constate qu'il y a un article qui n'a pas de prix
--> on l'identifie
# Identify which product id accounts for all the missing prices.
general[general['price'].isna()]['id_prod'].value_counts()
0_2245 103 Name: id_prod, dtype: int64
Il s'agit du produit avec pour id_prod
0_2245
Ce produit doit être absent de la table 'products' car on ne l'a pas détecté lors de l'étape précédente
On vérifie qu'il est absent du df 'produits'
# Confirm that this product id is indeed absent from the products table.
produits[produits['id_prod'] == '0_2245']
id_prod | price | categ |
---|
Il est bien absent de la table produits
Il semble que les id_prod
soient préfixés par la catégorie à laquelle ils appartiennent
-> on vérifie cela
# Check that id_prod is prefixed by its category (e.g. '0_...' -> categ 0),
# which will justify imputing the missing product from category 0.
# .copy() avoids pandas chained-assignment warnings on the slice, and
# .astype(int).astype(str) replaces the per-element apply(lambda x: int(x)).
temp = general[['categ', 'id_prod']].dropna().copy()
temp['categ'] = temp['categ'].astype(int).astype(str)
id_prod_0 = temp[temp['id_prod'].str.startswith('0')]
id_prod_0['categ'].unique()
id_prod_non_0 = temp[~temp['id_prod'].str.startswith('0')]
id_prod_non_0['categ'].unique()
array(['0'], dtype=object)
array(['1', '2'], dtype=object)
On constate que les id_prod
sont bien préfixés par la catégorie dont ils font partie
-> on va pouvoir ainsi imputer le prix du produit ayant pour id_prod
0_2245 par le prix moyen (ou median) de la catégorie 0
--> on calcule ces valeurs
# Central tendency of category-0 prices, used to impute the missing product.
prix_cat_0 = produits.loc[produits['categ'] == 0, 'price']
mean_cat_0 = prix_cat_0.mean()
median_cat_0 = prix_cat_0.median()
mean_cat_0
median_cat_0
11.732794627383296
10.32
On constate que les 2 valeurs sont proches
On va imputer par le prix moyen de la catégorie 0
# Impute the missing product's price with the category-0 mean, and set its
# category to 0 (justified by the id_prod prefix check above).
general.loc[general['id_prod'] == '0_2245', 'price'] = mean_cat_0
general.loc[general['id_prod'] == '0_2245', 'categ'] = 0.0
general[general['id_prod'] == '0_2245']
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18202 | 0_2245 | 2021-06-05 17:04:43.982913 | s_44481 | c_6714 | 2021.0 | 6.0 | 22.0 | Samedi | 5.0 | 17.0 | 4.0 | 2021-06-05 | 11.732795 | 0.0 | f | 1968.0 | 53.0 |
18203 | 0_2245 | 2021-04-22 04:57:20.090378 | s_23987 | c_6714 | 2021.0 | 4.0 | 16.0 | Jeudi | 22.0 | 4.0 | 57.0 | 2021-04-22 | 11.732795 | 0.0 | f | 1968.0 | 53.0 |
20708 | 0_2245 | 2021-04-17 16:43:16.543156 | s_21906 | c_7808 | 2021.0 | 4.0 | 15.0 | Samedi | 17.0 | 16.0 | 43.0 | 2021-04-17 | 11.732795 | 0.0 | m | 1977.0 | 44.0 |
27016 | 0_2245 | 2021-11-01 14:00:26.535213 | s_113219 | c_7810 | 2021.0 | 11.0 | 44.0 | Lundi | 1.0 | 14.0 | 0.0 | 2021-11-01 | 11.732795 | 0.0 | m | 1978.0 | 43.0 |
30498 | 0_2245 | 2021-09-11 10:52:05.205583 | s_88251 | c_3468 | 2021.0 | 9.0 | 36.0 | Samedi | 11.0 | 10.0 | 52.0 | 2021-09-11 | 11.732795 | 0.0 | f | 1981.0 | 40.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
299794 | 0_2245 | 2021-09-19 03:08:45.918021 | s_92049 | c_4935 | 2021.0 | 9.0 | 37.0 | Dimanche | 19.0 | 3.0 | 8.0 | 2021-09-19 | 11.732795 | 0.0 | f | 1982.0 | 39.0 |
311031 | 0_2245 | 2021-11-12 09:25:22.905886 | s_118458 | c_7416 | 2021.0 | 11.0 | 45.0 | Vendredi | 12.0 | 9.0 | 25.0 | 2021-11-12 | 11.732795 | 0.0 | m | 1933.0 | 88.0 |
311482 | 0_2245 | 2021-06-17 03:03:12.668129 | s_49705 | c_1533 | 2021.0 | 6.0 | 24.0 | Jeudi | 17.0 | 3.0 | 3.0 | 2021-06-17 | 11.732795 | 0.0 | m | 1972.0 | 49.0 |
324186 | 0_2245 | 2021-11-20 20:21:06.505658 | s_122593 | c_8524 | 2021.0 | 11.0 | 46.0 | Samedi | 20.0 | 20.0 | 21.0 | 2021-11-20 | 11.732795 | 0.0 | f | 1982.0 | 39.0 |
325475 | 0_2245 | 2021-05-20 07:44:21.415061 | s_36985 | c_1450 | 2021.0 | 5.0 | 20.0 | Jeudi | 20.0 | 7.0 | 44.0 | 2021-05-20 | 11.732795 | 0.0 | f | 1959.0 | 62.0 |
103 rows × 17 columns
# Corrected products table: append the product that was missing.
# Use `mean_cat_0` (the exact value imputed into `general`) instead of a
# hard-coded rounded copy, and compute the next index instead of the
# hard-coded label 3287 (robust if the table length ever changes).
produits_cor = produits.copy()
produits_cor.loc[len(produits_cor)] = ['0_2245', mean_cat_0, 0]
produits_cor
id_prod | price | categ | |
---|---|---|---|
0 | 0_1421 | 19.990000 | 0 |
1 | 0_1368 | 5.130000 | 0 |
2 | 0_731 | 17.990000 | 0 |
3 | 1_587 | 4.990000 | 1 |
4 | 0_1507 | 3.990000 | 0 |
... | ... | ... | ... |
3283 | 0_146 | 17.140000 | 0 |
3284 | 0_802 | 11.220000 | 0 |
3285 | 1_140 | 38.560000 | 1 |
3286 | 0_1920 | 25.160000 | 0 |
3287 | 0_2245 | 11.732795 | 0 |
3287 rows × 3 columns
# Remaining missing values after the price imputation.
general.isna().sum()
id_prod 21 date 43 s_id 43 client_id 22 s_year 43 s_month 43 s_week 43 s_day_name 43 s_day 43 s_hour 43 s_minute 43 s_date 43 price 21 categ 21 sex 22 birth 22 age 22 dtype: int64
# Rows with no client: products that were never ordered (unmatched side of
# the outer merge).
general[general['client_id'].isna()].sample(3)
prod_sans_commande_ls = general.loc[general['client_id'].isna(), 'id_prod'].tolist()
len(prod_sans_commande_ls)
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
336816 | 0_1016 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | 35.06 | 0.0 | NaN | NaN | NaN |
336819 | 0_1119 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | 2.99 | 0.0 | NaN | NaN | NaN |
336824 | 0_1645 | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | 2.99 | 0.0 | NaN | NaN | NaN |
22
On constate qu'il y a 22 produits qui n'ont jamais été vendus
(sur un total de 3287 produits, soit 0,67%)
# Keep the never-sold products aside, then drop their rows from the merge.
prod_sans_commande = produits[produits['id_prod'].isin(prod_sans_commande_ls)]
general = general.dropna(subset=['client_id'])
# Rows with no product: clients who never ordered (must be inspected AFTER
# the dropna above, otherwise the unsold-product rows would interfere).
general[general['id_prod'].isna()].sample(3)
clients_sans_commande_ls = general[general['id_prod'].isna()]['client_id']
len(clients_sans_commande_ls)
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
336844 | NaN | NaT | NaN | c_3017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | f | 1992.0 | 29.0 |
336852 | NaN | NaT | NaN | c_5223 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | m | 2003.0 | 18.0 |
336841 | NaN | NaT | NaN | c_2706 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaT | NaN | NaN | f | 1967.0 | 54.0 |
21
On constate qu'il y a 21 clients qui n'ont jamais passé de commandes
(sur un total de 8623 clients, soit 0,24%)
# Keep the order-less clients aside, then drop their rows from the merge.
clients_sans_commande = clients[clients['client_id'].isin(clients_sans_commande_ls)]
general = general.dropna(subset=['id_prod'])
general.isna().sum()
id_prod 0 date 0 s_id 0 client_id 0 s_year 0 s_month 0 s_week 0 s_day_name 0 s_day 0 s_hour 0 s_minute 0 s_date 0 price 0 categ 0 sex 0 birth 0 age 0 dtype: int64
# Final structure check of the cleaned merged table.
general.info()
general.sample(5)
<class 'pandas.core.frame.DataFrame'> Int64Index: 336816 entries, 0 to 336815 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_prod 336816 non-null object 1 date 336816 non-null datetime64[ns] 2 s_id 336816 non-null object 3 client_id 336816 non-null object 4 s_year 336816 non-null float64 5 s_month 336816 non-null float64 6 s_week 336816 non-null float64 7 s_day_name 336816 non-null object 8 s_day 336816 non-null float64 9 s_hour 336816 non-null float64 10 s_minute 336816 non-null float64 11 s_date 336816 non-null datetime64[ns] 12 price 336816 non-null float64 13 categ 336816 non-null float64 14 sex 336816 non-null object 15 birth 336816 non-null float64 16 age 336816 non-null float64 dtypes: datetime64[ns](2), float64(10), object(5) memory usage: 46.3+ MB
id_prod | date | s_id | client_id | s_year | s_month | s_week | s_day_name | s_day | s_hour | s_minute | s_date | price | categ | sex | birth | age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
177017 | 0_1623 | 2021-04-08 17:53:07.850941 | s_17806 | c_3297 | 2021.0 | 4.0 | 14.0 | Jeudi | 8.0 | 17.0 | 53.0 | 2021-04-08 | 7.99 | 0.0 | f | 1975.0 | 46.0 |
68121 | 0_1335 | 2021-11-17 17:20:14.980155 | s_121071 | c_4329 | 2021.0 | 11.0 | 46.0 | Mercredi | 17.0 | 17.0 | 20.0 | 2021-11-17 | 17.99 | 0.0 | f | 1990.0 | 31.0 |
266314 | 0_1410 | 2021-04-25 02:21:28.264392 | s_25347 | c_7137 | 2021.0 | 4.0 | 16.0 | Dimanche | 25.0 | 2.0 | 21.0 | 2021-04-25 | 14.85 | 0.0 | f | 1980.0 | 41.0 |
319707 | 1_444 | 2021-06-03 17:31:48.105361 | s_43597 | c_7256 | 2021.0 | 6.0 | 22.0 | Jeudi | 3.0 | 17.0 | 31.0 | 2021-06-03 | 12.61 | 1.0 | m | 1996.0 | 25.0 |
81860 | 0_1193 | 2022-02-05 06:15:59.982439 | s_160228 | c_6646 | 2022.0 | 2.0 | 5.0 | Samedi | 5.0 | 6.0 | 15.0 | 2022-02-05 | 9.99 | 0.0 | m | 1974.0 | 47.0 |
# Double-check: no missing values remain.
general.isna().sum()
id_prod 0 date 0 s_id 0 client_id 0 s_year 0 s_month 0 s_week 0 s_day_name 0 s_day 0 s_hour 0 s_minute 0 s_date 0 price 0 categ 0 sex 0 birth 0 age 0 dtype: int64
# Rename to prefixed column names: c_ = client, p_ = product, s_ = session.
general.columns = ['p_id', 's_id_date', 's_panier_id', 'c_id', 's_year', 's_month', 's_week', 's_day_n',
                   's_day', 's_hour', 's_minute', 's_date', 'p_prix', 'p_categ', 'c_sex', 'c_birth', 'c_age']
# Reorder the columns by theme: client, then product, then session.
res = general.reindex(columns=[
    'c_id', 'c_sex', 'c_age', 'c_birth',
    'p_id', 'p_categ', 'p_prix',
    's_panier_id', 's_id_date', 's_year', 's_month', 's_week', 's_day_n', 's_day', 's_hour', 's_minute', 's_date'
])
# datetime_is_numeric=True makes describe() compute mean/quantiles for the
# datetime columns too. (The stray module-level `datetime_is_numeric = True`
# assignment was dead code — the keyword is passed directly — and is removed.)
res.describe(datetime_is_numeric=True)
c_age | c_birth | p_categ | p_prix | s_id_date | s_year | s_month | s_week | s_day | s_hour | s_minute | s_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 336816.000000 | 336816.000000 | 336816.000000 | 336816.000000 | 336816 | 336816.000000 | 336816.000000 | 336816.000000 | 336816.000000 | 336816.000000 | 336816.000000 | 336816 |
mean | 43.176604 | 1977.823396 | 0.430024 | 17.213512 | 2021-09-03 02:15:11.058548224 | 2021.174822 | 6.482133 | 26.297925 | 15.801628 | 11.513533 | 29.481533 | 2021-09-02 14:14:23.445917184 |
min | 17.000000 | 1929.000000 | 0.000000 | 0.620000 | 2021-03-01 00:01:07.843138 | 2021.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 2021-03-01 00:00:00 |
25% | 34.000000 | 1971.000000 | 0.000000 | 8.610000 | 2021-05-30 21:13:10.275369728 | 2021.000000 | 3.000000 | 13.000000 | 8.000000 | 6.000000 | 14.000000 | 2021-05-30 00:00:00 |
50% | 41.000000 | 1980.000000 | 0.000000 | 13.900000 | 2021-09-07 05:00:52.682486272 | 2021.000000 | 6.000000 | 26.000000 | 16.000000 | 12.000000 | 29.000000 | 2021-09-07 00:00:00 |
75% | 50.000000 | 1987.000000 | 1.000000 | 18.990000 | 2021-12-08 04:39:47.265316864 | 2021.000000 | 9.000000 | 39.000000 | 23.000000 | 18.000000 | 45.000000 | 2021-12-08 00:00:00 |
max | 92.000000 | 2004.000000 | 2.000000 | 300.000000 | 2022-02-28 23:59:58.040472 | 2022.000000 | 12.000000 | 52.000000 | 31.000000 | 23.000000 | 59.000000 | 2022-02-28 00:00:00 |
std | 13.523923 | 13.523923 | 0.591039 | 17.852972 | NaN | 0.379816 | 3.519807 | 15.350837 | 8.818357 | 6.919540 | 17.329827 | NaN |
# Preview of the final, reordered table.
res.sample(5)
c_id | c_sex | c_age | c_birth | p_id | p_categ | p_prix | s_panier_id | s_id_date | s_year | s_month | s_week | s_day_n | s_day | s_hour | s_minute | s_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
157658 | c_6359 | m | 39.0 | 1982.0 | 0_1469 | 0.0 | 14.99 | s_22080 | 2021-04-18 01:32:17.955742 | 2021.0 | 4.0 | 15.0 | Dimanche | 18.0 | 1.0 | 32.0 | 2021-04-18 |
259667 | c_3162 | f | 58.0 | 1963.0 | 1_4 | 1.0 | 24.21 | s_39904 | 2021-05-26 18:55:52.239097 | 2021.0 | 5.0 | 21.0 | Mercredi | 26.0 | 18.0 | 55.0 | 2021-05-26 |
226730 | c_1756 | f | 42.0 | 1979.0 | 0_1567 | 0.0 | 16.99 | s_36542 | 2021-05-19 07:53:21.440913 | 2021.0 | 5.0 | 20.0 | Mercredi | 19.0 | 7.0 | 53.0 | 2021-05-19 |
3956 | c_1609 | m | 41.0 | 1980.0 | 0_1253 | 0.0 | 12.99 | s_127149 | 2021-11-30 05:06:45.487262 | 2021.0 | 11.0 | 48.0 | Mardi | 30.0 | 5.0 | 6.0 | 2021-11-30 |
145637 | c_7924 | f | 50.0 | 1971.0 | 1_325 | 1.0 | 27.99 | s_21641 | 2021-04-17 01:27:35.421205 | 2021.0 | 4.0 | 15.0 | Samedi | 17.0 | 1.0 | 27.0 | 2021-04-17 |
# Final dtypes of the cleaned table.
res.dtypes
c_id object c_sex object c_age float64 c_birth float64 p_id object p_categ float64 p_prix float64 s_panier_id object s_id_date datetime64[ns] s_year float64 s_month float64 s_week float64 s_day_n object s_day float64 s_hour float64 s_minute float64 s_date datetime64[ns] dtype: object
Pour respecter le découpage indiqué, ce premier notebook, traitant du nettoyage des données, s'arrête ici
Mais vous trouverez dans le notebook suivant d'autres éléments de nettoyage, à savoir :
le traitement des données absentes relatives aux produits de catégorie 2 pour le mois d'octobre 2021
J'ai volontairement laissé ce traitement dans la partie suivante car ce n'est que lorsque j'ai analysé le CA que je m'en suis rendu compte
le traitement des 4 clients 'premiums', pour la même raison
# Persist the cleaned datasets for the analysis notebook.
for df_out, fname in ((res, "res.csv"), (produits_cor, "produits_cor.csv")):
    df_out.to_csv(f"data/data_transition/{fname}", index=False, header=True)