%%html
<style>
.MathJax {
    font-size: 1.3em;
}
.rendered_html tr, .rendered_html th, .rendered_html td {
    text-align: right !important;
}
a[data-snippet-code]::after {
    background: #262931 !important;
}
.titre-pers{
    font-family: arial;
    font-size: 250% !important;
    line-height: 200% !important;
    text-align: center !important;
    color: #4c8be2 !important;
}
.jp-RenderedHTMLCommon h1,
.rendered_html h1,
.text_cell_render h1 {
    color: #86bed9 !important;
    line-height: 150% !important;
}
.jp-RenderedHTMLCommon h2,
.rendered_html h2,
.text_cell_render h2 {
    color: #b08c20 !important;
    padding-left: .5rem !important;
    line-height: 150% !important;
}
.jp-RenderedHTMLCommon h3,
.rendered_html h3,
.text_cell_render h3 {
    color: #3aa237 !important;
    padding-left: 1rem !important;
    line-height: 150% !important;
    font-size: 120% !important;
}
.jp-RenderedHTMLCommon h4,
.rendered_html h4,
.text_cell_render h4 {
    color: #29858a !important;
    padding-left: 2rem !important;
    font-size: 110% !important;
}
.jp-RenderedHTMLCommon h5,
.rendered_html h5,
.text_cell_render h5 {
    color: #21417d !important;
    padding-left: 2.5rem !important;
    font-size: 110% !important;
}
.jp-RenderedHTMLCommon h6,
.rendered_html h6,
.text_cell_render h6 {
    color: #d8a802c2 !important;
    padding-left: 1rem !important;
    font-family: sans-serif !important;
    font-size: 120% !important;
    font-weight: normal !important;
    font-style: normal !important;
}
.renf{
    font-size: 18px !important;
    font-family: Arial !important;
    color: #14db9a !important;
}
.renf2{
    font-size: 18px !important;
    font-family: Arial !important;
    color: orangered !important;
}
</style>


import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
import scipy.stats as st
import scipy as sp
import math
from math import sqrt
import numpy as np
import pandas as pd
import pingouin as pg
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('seaborn-deep')
%matplotlib inline
#from jupyterthemes import jtplot
# jtplot.style()
params = {
    'legend.fontsize': 'large',
    'legend.title_fontsize': 'large',
    'figure.figsize': (18, 6),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'xtick.major.pad': 12,
    'ytick.labelsize': 'x-large',
    'ytick.major.pad': 12,
    'lines.linewidth': 2,
    'savefig.dpi': 300
}
pylab.rcParams.update(params)
pd.options.mode.chained_assignment = None  # default='warn'


color1 = ['#7fc97f', '#beaed4', '#fdc086', '#ffff99',
          '#386cb0', '#f0027f', '#bf5b17', '#666666']
color2 = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a',
          '#66a61e', '#e6ab02', '#a6761d', '#666666']
color3 = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c',
          '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00']
color4 = ['#fbb4ae', '#b3cde3', '#ccebc5', '#decbe4',
          '#fed9a6', '#ffffcc', '#e5d8bd', '#fddaec']
color5 = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3',
          '#a6d854', '#ffd92f', '#e5c494', '#b3b3b3']
color6 = ['#b3e2cd', '#fdcdac', '#cbd5e8', '#f4cae4',
          '#e6f5c9', '#fff2ae', '#f1e2cc', '#cccccc']
color7 = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3',
          '#ff7f00', '#ffff33', '#a65628', '#f781bf']
color8 = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072',
          '#80b1d3', '#fdb462', '#b3de69', '#fccde5']

color2l = ['red', 'green', '#78a6cd', '#ccebc5', '#decbe4']

color_seq1 = ['#f7fcfd', '#e5f5f9', '#ccece6', '#99d8c9',
              '#66c2a4', '#41ae76', '#238b45', '#005824']
color_seq2 = ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84',
              '#fc8d59', '#ef6548', '#d7301f', '#990000']
color_seq3 = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1',
              '#6baed6', '#4292c6', '#2171b5', '#084594']
color_seq4 = ['#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b',
              '#74c476', '#41ab5d', '#238b45', '#005a32']
color_seq5 = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272',
              '#fb6a4a', '#ef3b2c', '#cb181d', '#99000d']
color_seq6 = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b',
              '#fd8d3c', '#f16913', '#d94801', '#8c2d04']
color_seq7 = ['#ffffd9', '#edf8b1', '#c7e9b4', '#7fcdbb',
              '#41b6c4', '#1d91c0', '#225ea8', '#0c2c84']
color_seq8 = ['#fff7fb', '#ece2f0', '#d0d1e6', '#a6bddb',
              '#67a9cf', '#3690c0', '#02818a', '#016450']
color_seq9 = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd',
              '#969696', '#737373', '#525252', '#252525']

color_div1 = ['#8c510a', '#bf812d', '#dfc27d', '#f6e8c3',
              '#c7eae5', '#80cdc1', '#35978f', '#01665e']
color_div2 = ['#c51b7d', '#de77ae', '#f1b6da', '#fde0ef',
              '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221']
color_div3 = ['#762a83', '#9970ab', '#c2a5cf', '#e7d4e8',
              '#d9f0d3', '#a6dba0', '#5aae61', '#1b7837']
color_div4 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#d1e5f0', '#92c5de', '#4393c3', '#2166ac']
color_div5 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#e0e0e0', '#bababa', '#878787', '#4d4d4d']
color_div6 = ['#d73027', '#f46d43', '#fdae61', '#fee090',
              '#e0f3f8', '#abd9e9', '#74add1', '#4575b4']
color_div7 = ['#d73027', '#f46d43', '#fdae61', '#fee08b',
              '#d9ef8b', '#a6d96a', '#66bd63', '#1a9850']
color_div8 = ['#d53e4f', '#f46d43', '#fdae61', '#fee08b',
              '#e6f598', '#abdda4', '#66c2a5', '#3288bd']


def draw_bar_single(var_x, var_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """
        Mettre xdate_available=True pour orienté le texte des graduations sur l'axe x
    """

    x = var_x
    y = var_y

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})

    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})

    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')

    _ = plt.bar(x, y, align='center', color=col_bar, alpha=alpha_bar)
    _ = plt.grid(linestyle='--', alpha=alpha_grid)


###########################

def draw_plot(var_y, var_x, label_x, label_y, titre, nb=1, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """
        Mettre xdate_available=True pour orienté le texte des graduations sur l'axe x
    """

    y = var_y
    x = var_x

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})

    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})

    if legend_out:
        ax.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    ax.plot(y, x, color=col_bar, alpha=alpha_bar)
    ax.grid(linestyle='--', alpha=alpha_grid)


################################

def draw_bar_several(dfgroupby, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, g_width=0.8, g_kind='bar', g_color=color1, alpha_grid=0.3):
    if g_stacked_perc:
        plt.plot = dfgroupby.groupby(level=0).apply(lambda x: 100 * x / x.sum()).unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
    else:
        plt.plot = dfgroupby.unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )

    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        _ = plt.legend(legend, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)


def draw_bar_pie(titre, labels_in, size_in, explode_in, titre_legend, z, a=1, b=1, c=1, color_titre='MidnightBlue', def_angle=90, titre_x=0.5, titre_y=1.05, title_linespacing=1, legend_available=True, pos_titre='center', size_titre=18, color_perc='white', size_perc=16, loc_legend="center left", bbox_legend=(1, 0, 0.5, 1), g_color=color5):
    """
        fig=plt.figure(figsize=(18,6))
    """

    labels = labels_in
    size = size_in
    explode = explode_in
    z = fig.add_subplot(a, b, c)
    patches, texts, autotexts = z.pie(
        size, explode=explode, autopct='%1.1f%%', shadow=True, startangle=def_angle, colors=g_color)
    if legend_available:
        legend = z.legend(patches, labels,
                          title=titre_legend,
                          loc=loc_legend,
                          bbox_to_anchor=bbox_legend)
    # Equal aspect ratio ensures that pie is drawn as a circle.
    z.axis('equal')
    _ = plt.setp(autotexts, color=color_perc, size=size_perc, weight="bold")
    z.set_title(titre, color=color_titre, loc='center', y=titre_y, x=titre_x,
                linespacing=title_linespacing, fontdict={'size': size_titre, 'weight': 500})


def draw_scatter(df, col_df_x, col_df_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, color_scatter='teal', alpha_grid=0.3):
    params = {
        'figure.figsize': (16, 6),
    }
    pylab.rcParams.update(params)
    df.plot(
        kind='scatter',
        x=col_df_x,
        y=col_df_y,
        color=color_scatter
    )
    if legend_out:
        plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
    # _=plt.show()


def draw_options(titre, label_x, label_y, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, alpha_grid=0.3):
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        _ = plt.legend(legend_p, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.grid(linestyle='--', alpha=alpha_grid)

# this plots multiple seaborn histograms on different subplots


def plot_multiple_histograms(df, cols):
    num_plots = len(cols)
    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots/num_cols)

    fig, axs = plt.subplots(num_rows, num_cols)

    for ind, col in enumerate(cols):
        i = math.floor(ind/num_cols)
        j = ind - i*num_cols

        if num_rows == 1:
            if num_cols == 1:
                sns.distplot(df[col], kde=True, ax=axs)
            else:
                sns.distplot(df[col], kde=True, ax=axs[j])
        else:
            sns.distplot(df[col], kde=True, ax=axs[i, j])


def gini(arr):
    count = arr.size
    coefficient = 2 / count
    indexes = np.arange(1, count + 1)
    weighted_sum = (indexes * arr).sum()
    total = arr.sum()
    constant = (count + 1) / count
    return coefficient * weighted_sum / total - constant


def lorenz_f(arrg, title, xlabel, ylabel, arrg2=None, arrg3=None, arrg4=None, nb=1, fs=8, title_lorenz=None):
    patterns = ('X', '\\', '-', '+', 'O', '*', '.')
    fig = plt.figure(figsize=(fs, fs))

    x1 = [0, 1]
    y1 = [0, 1]
    ax = fig.add_subplot(111)
    _ = ax.plot(x1, y1, color='black', alpha=0.7)
    _ = ax.set(xlim=(0, 1), ylim=(0, 1))
    # _=ax.axis('equal')
    _ = ax.tick_params(axis='both', which='major', labelsize=14)
    plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    _ = plt.ylabel(ylabel, color='gray', labelpad=10, fontdict={'size': 16})
    _ = plt.xlabel(xlabel, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(title, y=1.02, fontdict={'size': 20, 'weight': 500})

    if nb == 2:
        arrge = [arrg, arrg2]
    if nb == 3:
        arrge = [arrg, arrg2, arrg3]
    if nb == 4:
        arrge = [arrg, arrg2, arrg3, arrg4]
    for i in range(nb):
        if nb == 1:
            arr = arrg
        else:
            arr = arrge[i]

        arr = arr.sort_values()
        n = len(arr)
        lorenz = np.cumsum(arr) / arr.sum()
        lorenz = np.append([0], lorenz)
        xaxis = np.linspace(0, 1, n+1)
        if title_lorenz:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
        else:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz')
        if nb == 1:
            _ = ax.fill_between(xaxis, xaxis, lorenz, alpha=0.6, hatch="X",
                                color='teal', label='Surface de concentration')
            _ = ax.fill_between(np.linspace(0, 1, len(lorenz)),
                                lorenz, color='gray', alpha=0.3)
        else:
            _ = ax.fill_between(xaxis, xaxis, lorenz,
                                hatch=patterns[i],  color=color2l[i], alpha=0.3)

        gini_value = gini(arr)

        if title_lorenz:
            r = 0.7-(i/20)
            _ = ax.text(0.1, r, "Gini "+str(title_lorenz[i]) + " = "+str(
                round(gini_value, 2)), fontsize=16, color=color2l[i], weight='semibold')
        else:
            _ = ax.text(0.2, 0.6, "Gini = " + str(round(gini_value, 2)),
                        fontsize=18, color='darkred', weight='semibold')

        patch = []
        if nb == 1:
            a_patch = mpatches.Patch(
                color='teal', alpha=0.6, hatch="X", label='Surface de concentration')
            b_patch = mpatches.Patch(
                color=color2l[0], label='Courbe de Lorenz')
            _ = plt.legend(handles=[a_patch, b_patch])
        else:
            if title_lorenz:
                p = mpatches.Patch(
                    color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
                patch.append(p)
                _ = plt.legend(handles=patch)
            else:
                p = mpatches.Patch(color=color2l[i], label='Courbe de Lorenz')
                patch.append(p)
                _ = plt.legend(handles=patch)

    _ = ax.legend(loc='upper left', bbox_to_anchor=(0.02, 0.98), fontsize=12,
                  frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
    _ = ax.grid(linestyle='--', alpha=0.4)


def eta_squared(x, y):
    moyenne_y = y.mean()
    classes = []
    for classe in x.unique():
        yi_classe = y[x == classe]
        classes.append({'ni': len(yi_classe),
                        'moyenne_classe': yi_classe.mean()})
    SCT = sum([(yj-moyenne_y)**2 for yj in y])
    SCE = sum([c['ni']*(c['moyenne_classe']-moyenne_y)**2 for c in classes])
    return SCE/SCT


def linear_regression(x, y):     
    N = len(x)
    x_mean = x.mean()
    y_mean = y.mean()
    
    B1_num = ((x - x_mean) * (y - y_mean)).sum()
    B1_den = ((x - x_mean)**2).sum()
    B1 = B1_num / B1_den
    
    B0 = y_mean - (B1*x_mean)
    
    sign = '+'
    if B1 < 0:
        sign = '-'
        B3 = -B1
    else:
        B3 = B1
        
    reg_line = 'y = {} {} {}β'.format(B0, sign, round(B3, 3))
    
    return (B0, B1, reg_line)

def corr_coef(x, y):
    N = len(x)
    
    num = (N * (x*y).sum()) - (x.sum() * y.sum())
    den = np.sqrt((N * (x**2).sum() - x.sum()**2) * (N * (y**2).sum() - y.sum()**2))
    R = num / den
    return R

def linear_reg_aff(x,y):
    B0, B1, reg_line = linear_regression(x, y)
    R = corr_coef(x, y)
    text="Droite de régression : "+str(reg_line)+"\nCoef. de corrélation R : "+str(R)+"\nCoef. de détermination R² : "+str(R**2)
    return print(text)

def graph_droite_regression(dataset, x, y, text_x, text_y, titre, axes_x, axes_y, 
                            size_col, sizes_taille, titre_leg="", 
                            leg_x=0.1, leg_y=0.1, loc_leg='lower left', axe="ax1",
                            inter=np.arange(100), inter_droite=np.arange(100), size_title=20,
                            xlim_p=None, ylim_p=None, droite_reg=True):
    
    x1=dataset[x]
    y1=dataset[y]
    
    B0, B1, reg_line = linear_regression(x1,y1)
    R = corr_coef(x1,y1)
    
    sign = '+'
    if B1 < 0:
        sign = '-'
        B3 = -B1
    else:
        B3 = B1
    
    dic_ax = { "ax1":ax1, "ax2":ax2 }
    X=dic_ax[axe]

    X = plt.gca()
    text = "Moyenne X : {}\nMoyenne Y : {}\nR : {}\nR^2 : {}\ny = {} {} {}X".format(
        round(x1.mean(), 2), 
        round(y1.mean(), 2), 
        round(R, 4), 
        round(R**2, 4),
        round(B0, 3),
        sign,
        round(B3, 3))
    if droite_reg:
        _ = plt.text(x=text_x, y=text_y, s=text, fontsize=12, bbox={'facecolor': 'grey', 'edgecolor':'black', 'boxstyle':'round,pad=1', 'alpha': 0.2, 'pad': 10})
        _ = X.plot(inter_droite, [B0 + B1*x for x in inter_droite], c = 'r', linewidth=5, alpha=.5, solid_capstyle='round')
    _ = sns.scatterplot(data=dataset, x=x, y=y, marker="o", size=size_col, sizes=sizes_taille, ax=X)
    if xlim_p:
        _ = plt.xlim(xlim_p)
    if ylim_p:
        _ = plt.ylim(ylim_p)
    _ = plt.xticks(inter)
    _ = plt.xlabel(axes_x, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.ylabel(axes_y, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(titre, y=1.02, fontdict={'size': size_title, 'weight': 500})
    _ = plt.legend(title=titre_leg, loc=loc_leg, bbox_to_anchor=(leg_x, leg_y),frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=1,fontsize=12)


produits = pd.read_csv("data/dataset_P4/products.csv")
produits.sample(5)


produits.info()
produits.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287 entries, 0 to 3286
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id_prod  3287 non-null   object 
 1   price    3287 non-null   float64
 2   categ    3287 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 77.2+ KB


produits.isna().sum()
produits.duplicated().sum()

id_prod    0
price      0
categ      0
dtype: int64

0


produits.categ.value_counts().nunique()
produits.categ.value_counts()

3

0    2309
1     739
2     239
Name: categ, dtype: int64


print(f'Nombre total de produits : {produits.categ.value_counts().sum()}')
for i in range(3):
    part_cat = (produits.categ.value_counts()[i] / produits.shape[0])*100
    part_cat = round(part_cat, 2)
    print(
        f'Catégorie {i} : {produits.categ.value_counts()[i]} produits, soit {part_cat}% de l\'ensemble des produits')

Nombre total de produits : 3287
Catégorie 0 : 2309 produits, soit 70.25% de l'ensemble des produits
Catégorie 1 : 739 produits, soit 22.48% de l'ensemble des produits
Catégorie 2 : 239 produits, soit 7.27% de l'ensemble des produits


produits[produits['price'] <= 0]


clients = pd.read_csv("data/dataset_P4/customers.csv")
clients.sample(5)


clients.info()
clients.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8623 entries, 0 to 8622
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   client_id  8623 non-null   object
 1   sex        8623 non-null   object
 2   birth      8623 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 202.2+ KB


clients.isna().sum()
clients.duplicated().sum()

client_id    0
sex          0
birth        0
dtype: int64

0


nb_h = clients.sex.value_counts()['m']
nb_f = clients.sex.value_counts()['f']
part_genre = (clients.groupby('sex')['sex'].count() / clients.shape[0])*100
part_genre = round(part_genre, 2)
part_h = part_genre['m']
part_f = part_genre['f']

print(
    f'Nombre de clients enregistrés : {clients.client_id.value_counts().sum()}')
print(f'Nombre de clients Homme : {nb_h}, soit {part_h}%')
print(f'Nombre de clientes Femme : {nb_f}, soit {part_f}%')

Nombre de clients enregistrés : 8623
Nombre de clients Homme : 4132, soit 47.92%
Nombre de clientes Femme : 4491, soit 52.08%


print(sorted(clients.birth.unique()))

[1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004]


birthyear = clients.birth.value_counts()
birthyear

2004    440
1979    215
1988    207
1986    203
1978    199
       ... 
1935      8
1932      6
1930      4
1931      4
1929      3
Name: birth, Length: 76, dtype: int64


birthyear.count()
birthyear.nlargest()
birthyear.nsmallest()

76

2004    440
1979    215
1988    207
1986    203
1978    199
Name: birth, dtype: int64

1929    3
1930    4
1931    4
1932    6
1933    8
Name: birth, dtype: int64


_ = sns.displot(clients['birth'], kde=False, bins=150)


ventes = pd.read_csv("data/dataset_P4/transactions.csv")
ventes.sample(4)


ventes.info()
ventes.describe(include="all", datetime_is_numeric=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337016 entries, 0 to 337015
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id_prod     337016 non-null  object
 1   date        337016 non-null  object
 2   session_id  337016 non-null  object
 3   client_id   337016 non-null  object
dtypes: object(4)
memory usage: 10.3+ MB


ventes.isna().sum()
ventes.duplicated().sum()

id_prod       0
date          0
session_id    0
client_id     0
dtype: int64

126


ventes[ventes.duplicated()].head(3)


ventes[ventes.duplicated()].nunique()

id_prod        1
date          36
session_id     1
client_id      2
dtype: int64


ventes[ventes.duplicated()].date.unique()
ventes[ventes.duplicated()].client_id.unique()
ventes[ventes.duplicated()].id_prod.unique()

array(['test_2021-03-01 02:30:02.237443',
       'test_2021-03-01 02:30:02.237412',
       'test_2021-03-01 02:30:02.237439',
       'test_2021-03-01 02:30:02.237429',
       'test_2021-03-01 02:30:02.237446',
       'test_2021-03-01 02:30:02.237424',
       'test_2021-03-01 02:30:02.237442',
       'test_2021-03-01 02:30:02.237414',
       'test_2021-03-01 02:30:02.237430',
       'test_2021-03-01 02:30:02.237436',
       'test_2021-03-01 02:30:02.237413',
       'test_2021-03-01 02:30:02.237418',
       'test_2021-03-01 02:30:02.237427',
       'test_2021-03-01 02:30:02.237432',
       'test_2021-03-01 02:30:02.237425',
       'test_2021-03-01 02:30:02.237435',
       'test_2021-03-01 02:30:02.237426',
       'test_2021-03-01 02:30:02.237428',
       'test_2021-03-01 02:30:02.237447',
       'test_2021-03-01 02:30:02.237419',
       'test_2021-03-01 02:30:02.237433',
       'test_2021-03-01 02:30:02.237440',
       'test_2021-03-01 02:30:02.237437',
       'test_2021-03-01 02:30:02.237441',
       'test_2021-03-01 02:30:02.237434',
       'test_2021-03-01 02:30:02.237449',
       'test_2021-03-01 02:30:02.237444',
       'test_2021-03-01 02:30:02.237417',
       'test_2021-03-01 02:30:02.237445',
       'test_2021-03-01 02:30:02.237415',
       'test_2021-03-01 02:30:02.237416',
       'test_2021-03-01 02:30:02.237420',
       'test_2021-03-01 02:30:02.237431',
       'test_2021-03-01 02:30:02.237423',
       'test_2021-03-01 02:30:02.237438',
       'test_2021-03-01 02:30:02.237421'], dtype=object)

array(['ct_0', 'ct_1'], dtype=object)

array(['T_0'], dtype=object)


test = ventes[ventes['session_id'] == 's_0']
test


test.nunique()

id_prod        1
date          39
session_id     1
client_id      2
dtype: int64


test[test.duplicated()].client_id.unique()
test[test.duplicated()].id_prod.unique()

array(['ct_0', 'ct_1'], dtype=object)

array(['T_0'], dtype=object)


clients[(clients['client_id'] == 'ct_0') | (clients['client_id'] == 'ct_1')]
produits[produits['id_prod'] == 'T_0']


# suppression de 'T_O' dans le df produits
produits.drop(produits.loc[produits['id_prod'] == 'T_0'].index, inplace=True)
# on vérifie que 'T_0' n'est plus plus présent dans le df produits
produits[produits['id_prod'] == 'T_0']

# suppression de 'ct_0' et de 'ct_1' dans le df clients
clients.drop(clients.loc[(clients['client_id'] == 'ct_0') | (
    clients['client_id'] == 'ct_1')].index, inplace=True)
# on vérifie que 'ct_0' et 'ct_1' ne sont plus plus présents dans le df client
clients[(clients['client_id'] == 'ct_0') | (clients['client_id'] == 'ct_1')]

# suppression des lignes correspondant à une 'session_id' égale à 's_0' dans le df ventes
ventes.drop(ventes.loc[ventes['session_id'] == 's_0'].index, inplace=True)
# on vérifie que 's_0' n'est plus plus présent dans le df ventes
ventes.loc[ventes['session_id'] == 's_0']


ventes['date'] = pd.to_datetime(ventes['date'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
ventes.dtypes

id_prod               object
date          datetime64[ns]
session_id            object
client_id             object
dtype: object


ventes.isna().sum()

id_prod       0
date          0
session_id    0
client_id     0
dtype: int64


import locale
_ = locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')
# print(locale.getlocale(locale.LC_TIME))

ventes_temp = ventes.copy()
ventes_temp['session_year'] = ventes_temp['date'].dt.year
ventes_temp['session_month'] = ventes_temp['date'].dt.month
ventes_temp['session_week'] = (ventes_temp['date'].dt.strftime('%V')).astype('int')
ventes_temp['session_day_name'] = ventes_temp['date'].dt.day_name('fr_FR')
ventes_temp['session_day'] = ventes_temp['date'].dt.day
ventes_temp['session_hour'] = ventes_temp['date'].dt.hour
ventes_temp['session_minute'] = ventes_temp['date'].dt.minute
ventes_temp.dtypes
ventes_temp['session_date'] = ventes_temp['date'].dt.normalize()
ventes = ventes_temp.copy()
ventes.sample(3)

id_prod                     object
date                datetime64[ns]
session_id                  object
client_id                   object
session_year                 int64
session_month                int64
session_week                 int32
session_day_name            object
session_day                  int64
session_hour                 int64
session_minute               int64
dtype: object


ventes.groupby('session_year')['session_month'].nunique()

session_year
2021    10
2022     2
Name: session_month, dtype: int64


ventes.groupby('session_month')['session_day'].nunique()

session_month
1     31
2     28
3     31
4     30
5     31
6     30
7     31
8     31
9     30
10    31
11    30
12    31
Name: session_day, dtype: int64


ventes['session_date'].min()
ventes['session_date'].max()

Timestamp('2021-03-01 00:00:00')

Timestamp('2022-02-28 00:00:00')


from datetime import datetime
aujdhui = datetime.now()

def age_client(df):
    df['age'] = aujdhui.year - df['birth']

clients.apply(lambda x: age_client(clients))

client_id    None
sex          None
birth        None
dtype: object


clients.sample(3)


clients.age.value_counts()

17    440
42    215
33    207
35    203
43    199
     ... 
88      8
89      6
90      4
91      4
92      3
Name: age, Length: 76, dtype: int64


x = clients.loc[clients['sex'] == 'm']
y = clients.loc[clients['sex'] == 'f']
print(f'L\'âge moyen des clients est de {int(clients.age.mean())} ans, {int(x.age.mean())} pour les hommes et {int(y.age.mean())} pour les femmes')

L'âge moyen des clients est de 42 ans, 42 pour les hommes et 43 pour les femmes


rep_age = clients.groupby(['age', 'sex'])['client_id'].count()
rep_age = rep_age.unstack()
rep_age.reset_index(inplace=True)
rep_age['total'] = rep_age['f'] + rep_age['m']
rep_age.columns = ['age', 'f', 'm', 'total']
rep_age


x = rep_age['age']
y = rep_age['total']
plt.style.use('seaborn-whitegrid')
_ = plt.figure(figsize=(16, 6))
_ = plt.xlim(15, 95)
_ = plt.xlabel('Age', color='gray', fontdict={'size': 16})
_ = plt.ylabel('Nombre de clients', color='gray', fontdict={'size': 16})
_ = plt.title('Répartition des clients par âge', color='MidnightBlue',
              loc='right', y=0.9, x=0.98, fontdict={'size': 20})
_ = plt.bar(x, y, align='center', color='DarkOrange', alpha=0.7)
plt.grid(linestyle='--', alpha=0.3)
plt.show()


produits.sample(3)


produits['price'].describe()
nb_px = produits[['price']].value_counts()
nb_px = nb_px.reset_index()
nb_px.columns = ['prix', 'quantite']
x = nb_px.sort_values(by='prix', ascending=False)
produits.loc[produits['price'] > 50].value_counts().sum()

count    3286.000000
mean       21.863597
std        29.849786
min         0.620000
25%         6.990000
50%        13.075000
75%        22.990000
max       300.000000
Name: price, dtype: float64

273


ax = sns.boxplot(x='price', data=produits)


rep_prix = produits.groupby('price')['id_prod'].nunique()
rep_prix = rep_prix.reset_index()
rep_prix.columns = ['prix', 'nombre']
rep_prix.head(2)
rep_prix.nombre.sum()

3286


fig, ax = plt.subplots(figsize=(12, 4))
plt.style.use('seaborn-whitegrid')
rep_prix.plot(x='prix', y='nombre', kind='scatter', ax=ax)

<AxesSubplot:xlabel='prix', ylabel='nombre'>


produits.groupby('categ').describe()


produits.sample(3)
clients.sample(3)
ventes.columns = ['id_prod', 'date', 's_id', 'client_id', 's_year',
       's_month', 's_week', 's_day_name', 's_day',
       's_hour', 's_minute', 's_date']
ventes.sample(3)


general = pd.merge(ventes, produits, how='outer')
general = pd.merge(general, clients, how='outer')
general.sort_values('s_id')


general.isna().sum()

id_prod        21
date           43
s_id           43
client_id      22
s_year         43
s_month        43
s_week         43
s_day_name     43
s_day          43
s_hour         43
s_minute       43
s_date         43
price         124
categ         124
sex            22
birth          22
age            22
dtype: int64


general[general['price'].isna()].sample(5)


general[general['price'].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 18202 to 336858
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id_prod     103 non-null    object        
 1   date        103 non-null    datetime64[ns]
 2   s_id        103 non-null    object        
 3   client_id   124 non-null    object        
 4   s_year      103 non-null    float64       
 5   s_month     103 non-null    float64       
 6   s_week      103 non-null    float64       
 7   s_day_name  103 non-null    object        
 8   s_day       103 non-null    float64       
 9   s_hour      103 non-null    float64       
 10  s_minute    103 non-null    float64       
 11  s_date      103 non-null    datetime64[ns]
 12  price       0 non-null      float64       
 13  categ       0 non-null      float64       
 14  sex         124 non-null    object        
 15  birth       124 non-null    float64       
 16  age         124 non-null    float64       
dtypes: datetime64[ns](2), float64(10), object(5)
memory usage: 17.4+ KB


general[general['price'].isna()].nunique()

id_prod         1
date          103
s_id          103
client_id     121
s_year          2
s_month        12
s_week         45
s_day_name      7
s_day          30
s_hour         24
s_minute       47
s_date         90
price           0
categ           0
sex             2
birth          46
age            46
dtype: int64


general[general['price'].isna()]['id_prod'].value_counts()

0_2245    103
Name: id_prod, dtype: int64


produits[produits['id_prod'] == '0_2245']


temp = general[['categ', 'id_prod']]
temp = temp.dropna()
temp['categ'] = temp['categ'].apply(lambda x: int(x)).astype('str')
id_prod_0 = temp[temp['id_prod'].str.startswith('0')]
id_prod_0['categ'].unique()
id_prod_non_0 = temp[~temp['id_prod'].str.startswith('0')]
id_prod_non_0['categ'].unique()

array(['0'], dtype=object)

array(['1', '2'], dtype=object)


mean_cat_0 = produits.loc[produits['categ'] == 0, 'price'].mean()
median_cat_0 = produits.loc[produits['categ'] == 0, 'price'].median()
mean_cat_0
median_cat_0

11.732794627383296

10.32


general.loc[general['id_prod'] == '0_2245', 'price'] = mean_cat_0
general.loc[general['id_prod'] == '0_2245', 'categ'] = 0.0


general[general['id_prod'] == '0_2245']


produits_cor = produits.copy()
produits_cor.loc[3287]=['0_2245', 11.732795, 0]
produits_cor


general.isna().sum()

id_prod       21
date          43
s_id          43
client_id     22
s_year        43
s_month       43
s_week        43
s_day_name    43
s_day         43
s_hour        43
s_minute      43
s_date        43
price         21
categ         21
sex           22
birth         22
age           22
dtype: int64


general[general['client_id'].isna()].sample(3)
prod_sans_commande_ls = list(general[general['client_id'].isna()]['id_prod'])
len(prod_sans_commande_ls)

22


prod_sans_commande = produits[produits['id_prod'].isin(prod_sans_commande_ls)]
general = general.dropna(subset=['client_id'])


general[general['id_prod'].isna()].sample(3)
clients_sans_commande_ls = general[general['id_prod'].isna()]['client_id']
len(clients_sans_commande_ls)

21


clients_sans_commande = clients[clients['client_id'].isin(clients_sans_commande_ls)]
general = general.dropna(subset=['id_prod'])


general.isna().sum()

id_prod       0
date          0
s_id          0
client_id     0
s_year        0
s_month       0
s_week        0
s_day_name    0
s_day         0
s_hour        0
s_minute      0
s_date        0
price         0
categ         0
sex           0
birth         0
age           0
dtype: int64


general.info()
general.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 336816 entries, 0 to 336815
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id_prod     336816 non-null  object        
 1   date        336816 non-null  datetime64[ns]
 2   s_id        336816 non-null  object        
 3   client_id   336816 non-null  object        
 4   s_year      336816 non-null  float64       
 5   s_month     336816 non-null  float64       
 6   s_week      336816 non-null  float64       
 7   s_day_name  336816 non-null  object        
 8   s_day       336816 non-null  float64       
 9   s_hour      336816 non-null  float64       
 10  s_minute    336816 non-null  float64       
 11  s_date      336816 non-null  datetime64[ns]
 12  price       336816 non-null  float64       
 13  categ       336816 non-null  float64       
 14  sex         336816 non-null  object        
 15  birth       336816 non-null  float64       
 16  age         336816 non-null  float64       
dtypes: datetime64[ns](2), float64(10), object(5)
memory usage: 46.3+ MB


general.isna().sum()

id_prod       0
date          0
s_id          0
client_id     0
s_year        0
s_month       0
s_week        0
s_day_name    0
s_day         0
s_hour        0
s_minute      0
s_date        0
price         0
categ         0
sex           0
birth         0
age           0
dtype: int64


general.columns = ['p_id', 's_id_date', 's_panier_id', 'c_id', 's_year', 's_month', 's_week', 's_day_n',
                   's_day', 's_hour', 's_minute', 's_date', 'p_prix', 'p_categ', 'c_sex', 'c_birth', 'c_age']


res = general.reindex(columns=[
    'c_id', 'c_sex', 'c_age', 'c_birth',
    'p_id', 'p_categ', 'p_prix',
    's_panier_id', 's_id_date', 's_year', 's_month', 's_week', 's_day_n', 's_day', 's_hour', 's_minute', 's_date'
])
datetime_is_numeric = True
res.describe(datetime_is_numeric=True)


res.sample(5)


res.dtypes

c_id                   object
c_sex                  object
c_age                 float64
c_birth               float64
p_id                   object
p_categ               float64
p_prix                float64
s_panier_id            object
s_id_date      datetime64[ns]
s_year                float64
s_month               float64
s_week                float64
s_day_n                object
s_day                 float64
s_hour                float64
s_minute              float64
s_date         datetime64[ns]
dtype: object


res.to_csv("data/data_transition/res.csv", index = False, header = True)
produits_cor.to_csv("data/data_transition/produits_cor.csv", index = False, header = True)

	id_prod	price	categ
2944	1_319	43.99	1
2153	0_1250	5.17	0
704	0_2277	10.99	0
1129	0_2057	8.99	0
2516	0_1331	29.99	0

	id_prod	price	categ
count	3287	3287.000000	3287.000000
unique	3287	NaN	NaN
top	0_686	NaN	NaN
freq	1	NaN	NaN
mean	NaN	21.856641	0.370246
std	NaN	29.847908	0.615387
min	NaN	-1.000000	0.000000
25%	NaN	6.990000	0.000000
50%	NaN	13.060000	0.000000
75%	NaN	22.990000	1.000000
max	NaN	300.000000	2.000000

	client_id	sex	birth
count	8623	8623	8623.000000
unique	8623	2	NaN
top	c_460	f	NaN
freq	1	4491	NaN
mean	NaN	NaN	1978.280877
std	NaN	NaN	16.919535
min	NaN	NaN	1929.000000
25%	NaN	NaN	1966.000000
50%	NaN	NaN	1979.000000
75%	NaN	NaN	1992.000000
max	NaN	NaN	2004.000000

	id_prod	date	session_id	client_id
258609	0_1604	2021-12-23 01:31:54.167449	s_138520	c_6334
29820	0_1282	2021-05-11 11:38:10.968569	s_32993	c_1221
50844	0_1604	2021-08-29 08:41:08.081023	s_82151	c_4101
121093	0_1409	2021-05-20 12:25:47.887476	s_37069	c_3474

	id_prod	price
1129	0_2057	8.99
1816	0_974	11.75
2539	0_309	3.99

Table des matières

Paramètres config¶

Mission 1 : Nettoyage des données¶

1. Import et vérification des données¶

1.1 Table products¶

1.2 Table customers¶

1.3 Table transactions¶

2. Traitement des données¶

2.1 Suppression des éléments de test¶

2.2 Typage des dates¶

2.3 Création d'un df global¶

2.3.1 Travail préparatoire¶

2.3.1.1 df Clients¶

Ajout de l'âge des clients¶

2.3.1.2 df Produits¶

2.3.2 Merge des 3 tables¶

2.3.3 Traitement des valeurs nulles¶

2.3.2.1 Un produit non référencé dans la table products¶

2.3.2.2 Références produits jamais commandés¶

2.3.2.3 Clients qui n'ont jamais commandés¶

2.3.4 Organisation du df global¶

3. Remarque¶

Export des données¶

	client_id	sex	birth
6121	c_3388	m	1989
4376	c_1631	f	1981
8263	c_2046	f	1969
5513	c_3232	m	1971
3027	c_8269	f	1969

	id_prod	date	session_id	client_id
count	337016	337016	337016	337016
unique	3266	336855	169195	8602
top	1_369	test_2021-03-01 02:30:02.237413	s_0	c_1609
freq	1081	13	200	12855

	id_prod	date	session_id	client_id
34387	T_0	test_2021-03-01 02:30:02.237443	s_0	ct_0
54813	T_0	test_2021-03-01 02:30:02.237412	s_0	ct_1
57261	T_0	test_2021-03-01 02:30:02.237439	s_0	ct_1

	id_prod	date	session_id	client_id
1431	T_0	test_2021-03-01 02:30:02.237420	s_0	ct_1
2365	T_0	test_2021-03-01 02:30:02.237446	s_0	ct_1
2895	T_0	test_2021-03-01 02:30:02.237414	s_0	ct_1
5955	T_0	test_2021-03-01 02:30:02.237441	s_0	ct_0
7283	T_0	test_2021-03-01 02:30:02.237434	s_0	ct_1
...	...	...	...	...
332594	T_0	test_2021-03-01 02:30:02.237445	s_0	ct_0
332705	T_0	test_2021-03-01 02:30:02.237423	s_0	ct_1
332730	T_0	test_2021-03-01 02:30:02.237421	s_0	ct_1
333442	T_0	test_2021-03-01 02:30:02.237431	s_0	ct_1
335279	T_0	test_2021-03-01 02:30:02.237430	s_0	ct_0

	id_prod	date	session_id	client_id	session_year	session_month	session_week	session_day_name	session_day	session_hour	session_minute	session_date
329447	0_1328	2021-10-10 02:28:23.753941	s_102544	c_8576	2021	10	40	Dimanche	10	2	28	2021-10-10
193220	0_2127	2021-04-13 23:00:26.347021	s_20201	c_56	2021	4	15	Mardi	13	23	0	2021-04-13
160650	0_1464	2022-01-01 05:31:53.251212	s_143229	c_1939	2022	1	52	Samedi	1	5	31	2022-01-01

	age	f	m	total
0	17	232	208	440
1	18	65	81	146
2	19	66	80	146
3	20	71	56	127
4	21	78	58	136
...	...	...	...	...
71	88	5	3	8
72	89	5	1	6
73	90	2	2	4
74	91	3	1	4
75	92	1	2	3

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date
254401	0_1476	2022-02-16 11:45:26.278797	s_166060	c_3913	2022	2	7	Mercredi	16	11	45	2022-02-16
94718	1_445	2021-09-05 10:38:45.648721	s_85351	c_3536	2021	9	35	Dimanche	5	10	38	2021-09-05
114954	0_2251	2021-03-08 15:42:49.593020	s_3527	c_2815	2021	3	10	Lundi	8	15	42	2021-03-08

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	categ	sex	birth	age
272113	0_1259	2021-03-01 00:01:07.843138	s_1	c_329	2021.0	3.0	9.0	Lundi	1.0	0.0	1.0	2021-03-01	11.99	0.0	f	1967.0	54.0
212254	1_635	2021-03-01 00:10:33.163037	s_10	c_2218	2021.0	3.0	9.0	Lundi	1.0	0.0	10.0	2021-03-01	26.99	1.0	f	1970.0	51.0
24502	0_1451	2021-03-01 04:43:58.025677	s_100	c_3854	2021.0	3.0	9.0	Lundi	1.0	4.0	43.0	2021-03-01	19.99	0.0	f	1978.0	43.0
24482	0_1030	2021-03-01 04:12:43.572994	s_100	c_3854	2021.0	3.0	9.0	Lundi	1.0	4.0	12.0	2021-03-01	13.73	0.0	f	1978.0	43.0
275592	0_1625	2021-03-03 02:38:09.568389	s_1000	c_1014	2021.0	3.0	9.0	Mercredi	3.0	2.0	38.0	2021-03-03	5.99	0.0	m	1989.0	32.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
336854	NaN	NaT	NaN	c_862	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	f	1956.0	65.0
336855	NaN	NaT	NaN	c_7584	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	f	1960.0	61.0
336856	NaN	NaT	NaN	c_90	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	m	2001.0	20.0
336857	NaN	NaT	NaN	c_587	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	m	1993.0	28.0
336858	NaN	NaT	NaN	c_3526	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	m	1956.0	65.0

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	categ	sex	birth	age
224627	0_2245	2022-01-04 19:02:47.712837	s_144966	c_8595	2022.0	1.0	1.0	Mardi	4.0	19.0	2.0	2022-01-04	NaN	NaN	f	1980.0	41.0
27016	0_2245	2021-11-01 14:00:26.535213	s_113219	c_7810	2021.0	11.0	44.0	Lundi	1.0	14.0	0.0	2021-11-01	NaN	NaN	m	1978.0	43.0
283133	0_2245	2021-04-18 08:28:11.501497	s_22209	c_3496	2021.0	4.0	15.0	Dimanche	18.0	8.0	28.0	2021-04-18	NaN	NaN	f	1957.0	64.0
42159	0_2245	2021-05-06 20:14:22.522372	s_30824	c_4973	2021.0	5.0	18.0	Jeudi	6.0	20.0	14.0	2021-05-06	NaN	NaN	f	1983.0	38.0
295329	0_2245	2021-05-01 03:35:03.146305	s_28235	c_5714	2021.0	5.0	17.0	Samedi	1.0	3.0	35.0	2021-05-01	NaN	NaN	f	1972.0	49.0

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	categ	sex	birth	age
18202	0_2245	2021-06-05 17:04:43.982913	s_44481	c_6714	2021.0	6.0	22.0	Samedi	5.0	17.0	4.0	2021-06-05	11.732795	0.0	f	1968.0	53.0
18203	0_2245	2021-04-22 04:57:20.090378	s_23987	c_6714	2021.0	4.0	16.0	Jeudi	22.0	4.0	57.0	2021-04-22	11.732795	0.0	f	1968.0	53.0
20708	0_2245	2021-04-17 16:43:16.543156	s_21906	c_7808	2021.0	4.0	15.0	Samedi	17.0	16.0	43.0	2021-04-17	11.732795	0.0	m	1977.0	44.0
27016	0_2245	2021-11-01 14:00:26.535213	s_113219	c_7810	2021.0	11.0	44.0	Lundi	1.0	14.0	0.0	2021-11-01	11.732795	0.0	m	1978.0	43.0
30498	0_2245	2021-09-11 10:52:05.205583	s_88251	c_3468	2021.0	9.0	36.0	Samedi	11.0	10.0	52.0	2021-09-11	11.732795	0.0	f	1981.0	40.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
299794	0_2245	2021-09-19 03:08:45.918021	s_92049	c_4935	2021.0	9.0	37.0	Dimanche	19.0	3.0	8.0	2021-09-19	11.732795	0.0	f	1982.0	39.0
311031	0_2245	2021-11-12 09:25:22.905886	s_118458	c_7416	2021.0	11.0	45.0	Vendredi	12.0	9.0	25.0	2021-11-12	11.732795	0.0	m	1933.0	88.0
311482	0_2245	2021-06-17 03:03:12.668129	s_49705	c_1533	2021.0	6.0	24.0	Jeudi	17.0	3.0	3.0	2021-06-17	11.732795	0.0	m	1972.0	49.0
324186	0_2245	2021-11-20 20:21:06.505658	s_122593	c_8524	2021.0	11.0	46.0	Samedi	20.0	20.0	21.0	2021-11-20	11.732795	0.0	f	1982.0	39.0
325475	0_2245	2021-05-20 07:44:21.415061	s_36985	c_1450	2021.0	5.0	20.0	Jeudi	20.0	7.0	44.0	2021-05-20	11.732795	0.0	f	1959.0	62.0

	client_id	sex	birth	age
1647	c_6102	f	1983	38
3905	c_2296	f	1987	34
2521	c_2349	f	1986	35

	prix	nombre
0	0.62	2
1	0.66	1

	price
	count	mean	std	min	25%	50%	75%	max
categ
0	2308.0	11.732795	7.565755	0.62	5.5875	10.32	16.655	40.99
1	739.0	25.531421	15.425162	2.00	13.3900	22.99	33.990	80.99
2	239.0	108.354686	49.561431	30.99	71.0650	101.99	136.530	300.00

	id_prod	price	categ
2744	1_123	2.99	1
44	0_1317	4.99	0
3162	1_713	33.99	1

	age	f	m	total
0	17	232	208	440
1	18	65	81	146
2	19	66	80	146
3	20	71	56	127
4	21	78	58	136
...	...	...	...	...
71	88	5	3	8
72	89	5	1	6
73	90	2	2	4
74	91	3	1	4
75	92	1	2	3

	id_prod	price	categ
0	0_1421	19.990000	0
1	0_1368	5.130000	0
2	0_731	17.990000	0
3	1_587	4.990000	1
4	0_1507	3.990000	0
...	...	...	...
3283	0_146	17.140000	0
3284	0_802	11.220000	0
3285	1_140	38.560000	1
3286	0_1920	25.160000	0
3287	0_2245	11.732795	0

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	sex	birth	age
336816	0_1016	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	35.06	NaN	NaN	NaN
336819	0_1119	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	2.99	NaN	NaN	NaN
336824	0_1645	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	2.99	NaN	NaN	NaN

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	categ	sex	birth	age
336844	NaN	NaT	NaN	c_3017	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	f	1992.0	29.0
336852	NaN	NaT	NaN	c_5223	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	m	2003.0	18.0
336841	NaN	NaT	NaN	c_2706	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaN	NaN	f	1967.0	54.0

	id_prod	date	s_id	client_id	s_year	s_month	s_week	s_day_name	s_day	s_hour	s_minute	s_date	price	categ	sex	birth	age
177017	0_1623	2021-04-08 17:53:07.850941	s_17806	c_3297	2021.0	4.0	14.0	Jeudi	8.0	17.0	53.0	2021-04-08	7.99	0.0	f	1975.0	46.0
68121	0_1335	2021-11-17 17:20:14.980155	s_121071	c_4329	2021.0	11.0	46.0	Mercredi	17.0	17.0	20.0	2021-11-17	17.99	0.0	f	1990.0	31.0
266314	0_1410	2021-04-25 02:21:28.264392	s_25347	c_7137	2021.0	4.0	16.0	Dimanche	25.0	2.0	21.0	2021-04-25	14.85	0.0	f	1980.0	41.0
319707	1_444	2021-06-03 17:31:48.105361	s_43597	c_7256	2021.0	6.0	22.0	Jeudi	3.0	17.0	31.0	2021-06-03	12.61	1.0	m	1996.0	25.0
81860	0_1193	2022-02-05 06:15:59.982439	s_160228	c_6646	2022.0	2.0	5.0	Samedi	5.0	6.0	15.0	2022-02-05	9.99	0.0	m	1974.0	47.0

	c_age	c_birth	p_categ	p_prix	s_id_date	s_year	s_month	s_week	s_day	s_hour	s_minute	s_date
count	336816.000000	336816.000000	336816.000000	336816.000000	336816	336816.000000	336816.000000	336816.000000	336816.000000	336816.000000	336816.000000	336816
mean	43.176604	1977.823396	0.430024	17.213512	2021-09-03 02:15:11.058548224	2021.174822	6.482133	26.297925	15.801628	11.513533	29.481533	2021-09-02 14:14:23.445917184
min	17.000000	1929.000000	0.000000	0.620000	2021-03-01 00:01:07.843138	2021.000000	1.000000	1.000000	1.000000	0.000000	0.000000	2021-03-01 00:00:00
25%	34.000000	1971.000000	0.000000	8.610000	2021-05-30 21:13:10.275369728	2021.000000	3.000000	13.000000	8.000000	6.000000	14.000000	2021-05-30 00:00:00
50%	41.000000	1980.000000	0.000000	13.900000	2021-09-07 05:00:52.682486272	2021.000000	6.000000	26.000000	16.000000	12.000000	29.000000	2021-09-07 00:00:00
75%	50.000000	1987.000000	1.000000	18.990000	2021-12-08 04:39:47.265316864	2021.000000	9.000000	39.000000	23.000000	18.000000	45.000000	2021-12-08 00:00:00
max	92.000000	2004.000000	2.000000	300.000000	2022-02-28 23:59:58.040472	2022.000000	12.000000	52.000000	31.000000	23.000000	59.000000	2022-02-28 00:00:00
std	13.523923	13.523923	0.591039	17.852972	NaN	0.379816	3.519807	15.350837	8.818357	6.919540	17.329827	NaN

	c_id	c_sex	c_age	c_birth	p_id	p_categ	p_prix	s_panier_id	s_id_date	s_year	s_month	s_week	s_day_n	s_day	s_hour	s_minute	s_date
157658	c_6359	m	39.0	1982.0	0_1469	0.0	14.99	s_22080	2021-04-18 01:32:17.955742	2021.0	4.0	15.0	Dimanche	18.0	1.0	32.0	2021-04-18
259667	c_3162	f	58.0	1963.0	1_4	1.0	24.21	s_39904	2021-05-26 18:55:52.239097	2021.0	5.0	21.0	Mercredi	26.0	18.0	55.0	2021-05-26
226730	c_1756	f	42.0	1979.0	0_1567	0.0	16.99	s_36542	2021-05-19 07:53:21.440913	2021.0	5.0	20.0	Mercredi	19.0	7.0	53.0	2021-05-19
3956	c_1609	m	41.0	1980.0	0_1253	0.0	12.99	s_127149	2021-11-30 05:06:45.487262	2021.0	11.0	48.0	Mardi	30.0	5.0	6.0	2021-11-30
145637	c_7924	f	50.0	1971.0	1_325	1.0	27.99	s_21641	2021-04-17 01:27:35.421205	2021.0	4.0	15.0	Samedi	17.0	1.0	27.0	2021-04-17

	age	f	m	total
0	17	232	208	440
1	18	65	81	146
2	19	66	80	146
3	20	71	56	127
4	21	78	58	136
...	...	...	...	...
71	88	5	3	8
72	89	5	1	6
73	90	2	2	4
74	91	3	1	4
75	92	1	2	3