%%html
<style>
.MathJax {
    font-size: 1.3em;
}
.rendered_html tr, .rendered_html th, .rendered_html td {
    text-align: right !important;
}
a[data-snippet-code]::after {
    background: #262931 !important;
}
.titre-pers{
    font-family: arial;
    font-size: 250% !important;
    line-height: 200% !important;
    text-align: center !important;
    color: #4c8be2 !important;
}
.jp-RenderedHTMLCommon h1,
.rendered_html h1,
.text_cell_render h1 {
    color: #86bed9 !important;
    line-height: 150% !important;
}
.jp-RenderedHTMLCommon h2,
.rendered_html h2,
.text_cell_render h2 {
    color: #b08c20 !important;
    padding-left: .5rem !important;
    line-height: 150% !important;
}
.jp-RenderedHTMLCommon h3,
.rendered_html h3,
.text_cell_render h3 {
    color: #3aa237 !important;
    padding-left: 1rem !important;
    line-height: 150% !important;
    font-size: 120% !important;
}
.jp-RenderedHTMLCommon h4,
.rendered_html h4,
.text_cell_render h4 {
    color: #29858a !important;
    padding-left: 2rem !important;
    font-size: 110% !important;
}
.jp-RenderedHTMLCommon h5,
.rendered_html h5,
.text_cell_render h5 {
    color: #21417d !important;
    padding-left: 2.5rem !important;
    font-size: 110% !important;
}
.jp-RenderedHTMLCommon h6,
.rendered_html h6,
.text_cell_render h6 {
    color: #d8a802c2 !important;
    padding-left: 1rem !important;
    font-family: sans-serif !important;
    font-size: 120% !important;
    font-weight: normal !important;
    font-style: normal !important;
}
.renf{
    font-size: 18px !important;
    font-family: Arial !important;
    color: #14db9a !important;
}
.renf2{
    font-size: 18px !important;
    font-family: Arial !important;
    color: orangered !important;
}
</style>


import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
import scipy.stats as st
import scipy as sp
import math
from math import sqrt
import numpy as np
import pandas as pd
import pingouin as pg
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('seaborn-deep')
%matplotlib inline
#from jupyterthemes import jtplot
# jtplot.style()
params = {
    'legend.fontsize': 'large',
    'legend.title_fontsize': 'large',
    'figure.figsize': (18, 6),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'xtick.major.pad': 12,
    'ytick.labelsize': 'x-large',
    'ytick.major.pad': 12,
    'lines.linewidth': 2,
    'savefig.dpi': 300
}
pylab.rcParams.update(params)
pd.options.mode.chained_assignment = None  # default='warn'


color1 = ['#7fc97f', '#beaed4', '#fdc086', '#ffff99',
          '#386cb0', '#f0027f', '#bf5b17', '#666666']
color2 = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a',
          '#66a61e', '#e6ab02', '#a6761d', '#666666']
color3 = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c',
          '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00']
color4 = ['#fbb4ae', '#b3cde3', '#ccebc5', '#decbe4',
          '#fed9a6', '#ffffcc', '#e5d8bd', '#fddaec']
color5 = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3',
          '#a6d854', '#ffd92f', '#e5c494', '#b3b3b3']
color6 = ['#b3e2cd', '#fdcdac', '#cbd5e8', '#f4cae4',
          '#e6f5c9', '#fff2ae', '#f1e2cc', '#cccccc']
color7 = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3',
          '#ff7f00', '#ffff33', '#a65628', '#f781bf']
color8 = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072',
          '#80b1d3', '#fdb462', '#b3de69', '#fccde5']

color2l = ['red', 'green', '#78a6cd', '#ccebc5', '#decbe4']

color_seq1 = ['#f7fcfd', '#e5f5f9', '#ccece6', '#99d8c9',
              '#66c2a4', '#41ae76', '#238b45', '#005824']
color_seq2 = ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84',
              '#fc8d59', '#ef6548', '#d7301f', '#990000']
color_seq3 = ['#f7fbff', '#deebf7', '#c6dbef', '#9ecae1',
              '#6baed6', '#4292c6', '#2171b5', '#084594']
color_seq4 = ['#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b',
              '#74c476', '#41ab5d', '#238b45', '#005a32']
color_seq5 = ['#fff5f0', '#fee0d2', '#fcbba1', '#fc9272',
              '#fb6a4a', '#ef3b2c', '#cb181d', '#99000d']
color_seq6 = ['#fff5eb', '#fee6ce', '#fdd0a2', '#fdae6b',
              '#fd8d3c', '#f16913', '#d94801', '#8c2d04']
color_seq7 = ['#ffffd9', '#edf8b1', '#c7e9b4', '#7fcdbb',
              '#41b6c4', '#1d91c0', '#225ea8', '#0c2c84']
color_seq8 = ['#fff7fb', '#ece2f0', '#d0d1e6', '#a6bddb',
              '#67a9cf', '#3690c0', '#02818a', '#016450']
color_seq9 = ['#ffffff', '#f0f0f0', '#d9d9d9', '#bdbdbd',
              '#969696', '#737373', '#525252', '#252525']

color_div1 = ['#8c510a', '#bf812d', '#dfc27d', '#f6e8c3',
              '#c7eae5', '#80cdc1', '#35978f', '#01665e']
color_div2 = ['#c51b7d', '#de77ae', '#f1b6da', '#fde0ef',
              '#e6f5d0', '#b8e186', '#7fbc41', '#4d9221']
color_div3 = ['#762a83', '#9970ab', '#c2a5cf', '#e7d4e8',
              '#d9f0d3', '#a6dba0', '#5aae61', '#1b7837']
color_div4 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#d1e5f0', '#92c5de', '#4393c3', '#2166ac']
color_div5 = ['#b2182b', '#d6604d', '#f4a582', '#fddbc7',
              '#e0e0e0', '#bababa', '#878787', '#4d4d4d']
color_div6 = ['#d73027', '#f46d43', '#fdae61', '#fee090',
              '#e0f3f8', '#abd9e9', '#74add1', '#4575b4']
color_div7 = ['#d73027', '#f46d43', '#fdae61', '#fee08b',
              '#d9ef8b', '#a6d96a', '#66bd63', '#1a9850']
color_div8 = ['#d53e4f', '#f46d43', '#fdae61', '#fee08b',
              '#e6f598', '#abdda4', '#66c2a5', '#3288bd']


def draw_bar_single(var_x, var_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """
        Mettre xdate_available=True pour orienté le texte des graduations sur l'axe x
    """

    x = var_x
    y = var_y

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})

    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})

    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')

    _ = plt.bar(x, y, align='center', color=col_bar, alpha=alpha_bar)
    _ = plt.grid(linestyle='--', alpha=alpha_grid)


###########################

def draw_plot(var_y, var_x, label_x, label_y, titre, nb=1, col_titre='MidnightBlue', pos_titre='center', titre_y=1, titre_x=0.5, title_linespacing=1.0, legend_out=False, col_bar='teal', alpha_bar=0.7, base_mtick=1.0, xdate_available=False, alpha_grid=0.3):
    """
        Mettre xdate_available=True pour orienté le texte des graduations sur l'axe x
    """

    y = var_y
    x = var_x

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})

    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    if xdate_available:
        fig.autofmt_xdate(bottom=0.2, rotation=30, ha='right', which='major')
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})

    if legend_out:
        ax.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    ax.plot(y, x, color=col_bar, alpha=alpha_bar)
    ax.grid(linestyle='--', alpha=alpha_grid)


################################

def draw_bar_several(dfgroupby, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, g_width=0.8, g_kind='bar', g_color=color1, alpha_grid=0.3):
    if g_stacked_perc:
        plt.plot = dfgroupby.groupby(level=0).apply(lambda x: 100 * x / x.sum()).unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
    else:
        plt.plot = dfgroupby.unstack().plot(
            kind=g_kind,
            stacked=g_stacked,
            width=g_width,
            color=g_color
        )

    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        _ = plt.legend(legend, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)


def draw_bar_pie(titre, labels_in, size_in, explode_in, titre_legend, z, a=1, b=1, c=1, color_titre='MidnightBlue', def_angle=90, titre_x=0.5, titre_y=1.05, title_linespacing=1, legend_available=True, pos_titre='center', size_titre=18, color_perc='white', size_perc=16, loc_legend="center left", bbox_legend=(1, 0, 0.5, 1), g_color=color5):
    """
        fig=plt.figure(figsize=(18,6))
    """

    labels = labels_in
    size = size_in
    explode = explode_in
    z = fig.add_subplot(a, b, c)
    patches, texts, autotexts = z.pie(
        size, explode=explode, autopct='%1.1f%%', shadow=True, startangle=def_angle, colors=g_color)
    if legend_available:
        legend = z.legend(patches, labels,
                          title=titre_legend,
                          loc=loc_legend,
                          bbox_to_anchor=bbox_legend)
    # Equal aspect ratio ensures that pie is drawn as a circle.
    z.axis('equal')
    _ = plt.setp(autotexts, color=color_perc, size=size_perc, weight="bold")
    z.set_title(titre, color=color_titre, loc='center', y=titre_y, x=titre_x,
                linespacing=title_linespacing, fontdict={'size': size_titre, 'weight': 500})


def draw_scatter(df, col_df_x, col_df_y, label_x, label_y, titre, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, xticks_rotation='horizontal', base_mtick=1.0, g_stacked=False, g_stacked_perc=False, color_scatter='teal', alpha_grid=0.3):
    params = {
        'figure.figsize': (16, 6),
    }
    pylab.rcParams.update(params)
    df.plot(
        kind='scatter',
        x=col_df_x,
        y=col_df_y,
        color=color_scatter
    )
    if legend_out:
        plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')

    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xticks(rotation=xticks_rotation)
    loc = mtick.MultipleLocator(base=base_mtick)
    ax.xaxis.set_major_locator(loc)
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=alpha_grid)
    # _=plt.show()


def draw_options(titre, label_x, label_y, col_titre='MidnightBlue', pos_titre='center', titre_y=1.02, titre_x=0.5, title_linespacing=1, legend_out=False, legend_pers=False, legend=None, alpha_grid=0.3):
    _ = plt.title(titre, color=col_titre, loc=pos_titre, y=titre_y, x=titre_x,
                  linespacing=title_linespacing, fontdict={'size': 20, 'weight': 500})
    _ = plt.xlabel(label_x, labelpad=15, color='gray', fontdict={'size': 16})
    _ = plt.ylabel(label_y, labelpad=15, color='gray', fontdict={'size': 16})
    if legend_out:
        _ = plt.legend(bbox_to_anchor=(1.03, 1.0), loc='upper left')
    if legend_pers:
        _ = plt.legend(legend_p, bbox_to_anchor=(1.03, 1.0), loc='upper left')
    _ = plt.grid(linestyle='--', alpha=alpha_grid)

# this plots multiple seaborn histograms on different subplots


def plot_multiple_histograms(df, cols):
    num_plots = len(cols)
    num_cols = math.ceil(np.sqrt(num_plots))
    num_rows = math.ceil(num_plots/num_cols)

    fig, axs = plt.subplots(num_rows, num_cols)

    for ind, col in enumerate(cols):
        i = math.floor(ind/num_cols)
        j = ind - i*num_cols

        if num_rows == 1:
            if num_cols == 1:
                sns.distplot(df[col], kde=True, ax=axs)
            else:
                sns.distplot(df[col], kde=True, ax=axs[j])
        else:
            sns.distplot(df[col], kde=True, ax=axs[i, j])


def gini(arr):
    count = arr.size
    coefficient = 2 / count
    indexes = np.arange(1, count + 1)
    weighted_sum = (indexes * arr).sum()
    total = arr.sum()
    constant = (count + 1) / count
    return coefficient * weighted_sum / total - constant


def lorenz_f(arrg, title, xlabel, ylabel, arrg2=None, arrg3=None, arrg4=None, nb=1, fs=8, title_lorenz=None):
    patterns = ('X', '\\', '-', '+', 'O', '*', '.')
    fig = plt.figure(figsize=(fs, fs))

    x1 = [0, 1]
    y1 = [0, 1]
    ax = fig.add_subplot(111)
    _ = ax.plot(x1, y1, color='black', alpha=0.7)
    _ = ax.set(xlim=(0, 1), ylim=(0, 1))
    # _=ax.axis('equal')
    _ = ax.tick_params(axis='both', which='major', labelsize=14)
    plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    _ = plt.ylabel(ylabel, color='gray', labelpad=10, fontdict={'size': 16})
    _ = plt.xlabel(xlabel, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(title, y=1.02, fontdict={'size': 20, 'weight': 500})

    if nb == 2:
        arrge = [arrg, arrg2]
    if nb == 3:
        arrge = [arrg, arrg2, arrg3]
    if nb == 4:
        arrge = [arrg, arrg2, arrg3, arrg4]
    for i in range(nb):
        if nb == 1:
            arr = arrg
        else:
            arr = arrge[i]

        arr = arr.sort_values()
        n = len(arr)
        lorenz = np.cumsum(arr) / arr.sum()
        lorenz = np.append([0], lorenz)
        xaxis = np.linspace(0, 1, n+1)
        if title_lorenz:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
        else:
            _ = ax.plot(xaxis, lorenz, drawstyle='steps-post',
                        color=color2l[i], label='Courbe de Lorenz')
        if nb == 1:
            _ = ax.fill_between(xaxis, xaxis, lorenz, alpha=0.6, hatch="X",
                                color='teal', label='Surface de concentration')
            _ = ax.fill_between(np.linspace(0, 1, len(lorenz)),
                                lorenz, color='gray', alpha=0.3)
        else:
            _ = ax.fill_between(xaxis, xaxis, lorenz,
                                hatch=patterns[i],  color=color2l[i], alpha=0.3)

        gini_value = gini(arr)

        if title_lorenz:
            r = 0.7-(i/20)
            _ = ax.text(0.1, r, "Gini "+str(title_lorenz[i]) + " = "+str(
                round(gini_value, 2)), fontsize=16, color=color2l[i], weight='semibold')
        else:
            _ = ax.text(0.2, 0.6, "Gini = " + str(round(gini_value, 2)),
                        fontsize=18, color='darkred', weight='semibold')

        patch = []
        if nb == 1:
            a_patch = mpatches.Patch(
                color='teal', alpha=0.6, hatch="X", label='Surface de concentration')
            b_patch = mpatches.Patch(
                color=color2l[0], label='Courbe de Lorenz')
            _ = plt.legend(handles=[a_patch, b_patch])
        else:
            if title_lorenz:
                p = mpatches.Patch(
                    color=color2l[i], label='Courbe de Lorenz '+title_lorenz[i])
                patch.append(p)
                _ = plt.legend(handles=patch)
            else:
                p = mpatches.Patch(color=color2l[i], label='Courbe de Lorenz')
                patch.append(p)
                _ = plt.legend(handles=patch)

    _ = ax.legend(loc='upper left', bbox_to_anchor=(0.02, 0.98), fontsize=12,
                  frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
    _ = ax.grid(linestyle='--', alpha=0.4)


def eta_squared(x, y):
    moyenne_y = y.mean()
    classes = []
    for classe in x.unique():
        yi_classe = y[x == classe]
        classes.append({'ni': len(yi_classe),
                        'moyenne_classe': yi_classe.mean()})
    SCT = sum([(yj-moyenne_y)**2 for yj in y])
    SCE = sum([c['ni']*(c['moyenne_classe']-moyenne_y)**2 for c in classes])
    return SCE/SCT


def linear_regression(x, y):     
    N = len(x)
    x_mean = x.mean()
    y_mean = y.mean()
    
    B1_num = ((x - x_mean) * (y - y_mean)).sum()
    B1_den = ((x - x_mean)**2).sum()
    B1 = B1_num / B1_den
    
    B0 = y_mean - (B1*x_mean)
    
    sign = '+'
    if B1 < 0:
        sign = '-'
        B3 = -B1
    else:
        B3 = B1
        
    reg_line = 'y = {} {} {}β'.format(B0, sign, round(B3, 3))
    
    return (B0, B1, reg_line)

def corr_coef(x, y):
    N = len(x)
    
    num = (N * (x*y).sum()) - (x.sum() * y.sum())
    den = np.sqrt((N * (x**2).sum() - x.sum()**2) * (N * (y**2).sum() - y.sum()**2))
    R = num / den
    return R

def linear_reg_aff(x,y):
    B0, B1, reg_line = linear_regression(x, y)
    R = corr_coef(x, y)
    text="Droite de régression : "+str(reg_line)+"\nCoef. de corrélation R : "+str(R)+"\nCoef. de détermination R² : "+str(R**2)
    return print(text)

def graph_droite_regression(dataset, x, y, text_x, text_y, titre, axes_x, axes_y, 
                            size_col, sizes_taille, titre_leg="", 
                            leg_x=0.1, leg_y=0.1, loc_leg='lower left', axe="ax1",
                            inter=np.arange(100), inter_droite=np.arange(100), size_title=20,
                            xlim_p=None, ylim_p=None, droite_reg=True):
    
    x1=dataset[x]
    y1=dataset[y]
    
    B0, B1, reg_line = linear_regression(x1,y1)
    R = corr_coef(x1,y1)
    
    sign = '+'
    if B1 < 0:
        sign = '-'
        B3 = -B1
    else:
        B3 = B1
    
    dic_ax = { "ax1":ax1, "ax2":ax2 }
    X=dic_ax[axe]

    X = plt.gca()
    text = "Moyenne X : {}\nMoyenne Y : {}\nR : {}\nR^2 : {}\ny = {} {} {}X".format(
        round(x1.mean(), 2), 
        round(y1.mean(), 2), 
        round(R, 4), 
        round(R**2, 4),
        round(B0, 3),
        sign,
        round(B3, 3))
    if droite_reg:
        _ = plt.text(x=text_x, y=text_y, s=text, fontsize=12, bbox={'facecolor': 'grey', 'edgecolor':'black', 'boxstyle':'round,pad=1', 'alpha': 0.2, 'pad': 10})
        _ = X.plot(inter_droite, [B0 + B1*x for x in inter_droite], c = 'r', linewidth=5, alpha=.5, solid_capstyle='round')
    _ = sns.scatterplot(data=dataset, x=x, y=y, marker="o", size=size_col, sizes=sizes_taille, ax=X)
    if xlim_p:
        _ = plt.xlim(xlim_p)
    if ylim_p:
        _ = plt.ylim(ylim_p)
    _ = plt.xticks(inter)
    _ = plt.xlabel(axes_x, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.ylabel(axes_y, color='gray', labelpad=15, fontdict={'size': 16})
    _ = plt.title(titre, y=1.02, fontdict={'size': size_title, 'weight': 500})
    _ = plt.legend(title=titre_leg, loc=loc_leg, bbox_to_anchor=(leg_x, leg_y),frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=1,fontsize=12)


produits = pd.read_csv("data/dataset_P4/products.csv")
clients = pd.read_csv("data/dataset_P4/customers.csv")
ventes = pd.read_csv("data/dataset_P4/transactions.csv")
res = pd.read_csv("data/data_transition/res.csv", parse_dates=['s_id_date', 's_date'])
produits_cor = pd.read_csv("data/data_transition/produits_cor.csv")


produits[produits['categ'] == 0.]


produits_cor.describe().T
produits_cor.groupby('categ')['price'].describe()


fig = plt.figure(figsize=(18, 4))
ax = plt.axes()
_ = sns.boxplot(x='price', width=0.8, showmeans=True, data=produits)
_ = plt.xlim([-10, 310])
_ = plt.ylabel('Ensemble des produits', color='gray',
               labelpad=20, fontdict={'size': 16})
_ = plt.xlabel('Prix des produits', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Répartition des prix (toutes catégories)',
              fontdict={'size': 20, 'weight': 500})

_ = ax.xaxis.set_ticks(np.arange(0, 310, 10))
plt.show()

fig = plt.figure(figsize=(18, 6))
ax = plt.axes()
_ = plt.xlim([-10, 310])
_ = ax.xaxis.set_ticks(np.arange(0, 310, 10))
_ = sns.boxplot(x='price', y='categ', width=0.8, orient="h",
                palette="Set2", showmeans=True, data=produits)
_ = plt.ylabel('Catégories', color='gray', labelpad=10, fontdict={'size': 16})
_ = plt.xlabel('Prix des produits', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Répartition des prix par catégorie',
              fontdict={'size': 20, 'weight': 500})
plt.show()


px_tt_cat = produits['price']
px_tt_cat.value_counts().sort_values().tail(1);
px_cat_0 = produits.loc[produits['categ'] == 0, 'price']
px_cat_0.value_counts().sort_values().tail(1);
px_cat_1 = produits.loc[produits['categ'] == 1, 'price']
px_cat_1.value_counts().sort_values().tail(1);
px_cat_2 = produits.loc[produits['categ'] == 2, 'price']
px_cat_2.value_counts().sort_values().tail(1);


plt.figure(figsize=(14,8))
plt.hist(px_tt_cat, bins = range(0,300,5), alpha=0.7)
plt.axis([0, 300, 0, 750])
plt.xlabel('prix')
plt.ylabel('quantité')
plt.title('Répartition des produits proposés selon leur prix',fontsize=20)
plt.show();


fig, axes = plt.subplots(1, 3, figsize=(16, 2), dpi=100)
colors = ['tab:red', 'tab:blue', 'tab:green']
df = produits

for i, (ax, categ) in enumerate(zip(axes.flatten(), df.categ.unique())):
    x = df.loc[df.categ == i, 'price']
    ax.tick_params(labelsize=10, pad=5)
    ax.hist(x, alpha=0.5, bins=30, label='Catégorie ' +
            str(i), color=colors[i])
    ax.set_title('Catégorie '+str(i), fontdict={'size':14})
    ax.grid(linestyle='--', alpha=0.4)
    
    if i == 0:
        ax.set_ylabel("Quantité", color='gray',
                      labelpad=10, fontdict={'size': 14})
    if i == 1:
        ax.set_xlabel("Prix", color='gray', labelpad=10, fontdict={'size': 14})

plt.suptitle("Distribution des prix par catégories\n", y=1.2, size=18)
plt.show();


_ = plt.figure(figsize=(16, 6))
_ = plt.subplot(1, 1, 1)
_ = sns.kdeplot(px_cat_0, shade=True, bw_method=0.25, label='Catégorie 0 ')
_ = sns.kdeplot(px_cat_1, shade=True, bw_method=0.25, label='Catégorie 1 ')
_ = sns.kdeplot(px_cat_2, shade=True, bw_method=0.25, label='Catégorie 2 ')
_ = plt.ylabel('Densité / Distribution', color='gray',
               labelpad=10, fontdict={'size': 16})
_ = plt.xlabel('Prix des produits', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Distribution des prix par catégorie',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.legend(('Catégorie 0', 'Catégorie 1', 'Catégorie 2'), loc='center right', bbox_to_anchor=(
    0.98, 0.2), frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=1)
_ = plt.show()


skw_liste = [px_cat_0, px_cat_1, px_cat_2]
valid = 0
for i in range(3):
    sk = skw_liste[i].skew()
    print(f'Skewness empirique catégorie {i} : {round(sk, 4)}')
    if sk < 1:
        valid += 1
if valid == 3:
    print("\nLe skewness relatif au prix est inférieur à 1 pour les 3 catégories. \n  -> Les distribution des prix des 3 catégories sont donc asymétriques, avec un étalement à gauche.")
else:
    print("Les valeurs du skewness obtenues ne correspondent pas à celles attendues.")

Skewness empirique catégorie 0 : 0.8375
Skewness empirique catégorie 1 : 0.812
Skewness empirique catégorie 2 : 0.931

Le skewness relatif au prix est inférieur à 1 pour les 3 catégories. 
  -> Les distribution des prix des 3 catégories sont donc asymétriques, avec un étalement à gauche.


CA = res['p_prix'].sum()
print(f"Le CA sur un an s'élève à {int(CA)}€, soit {int(CA/12)}€ par mois")

Le CA sur un an s'élève à 5797786€, soit 483148€ par mois


rep_mens_ventes = res.groupby(['s_year', 's_month'])[
    'p_prix'].agg(['count', 'sum'])
rep_mens_ventes = rep_mens_ventes.reset_index()
rep_mens_ventes.columns = ['annee', 'mois', 'quantite', 'CA']
rep_mens_ventes


rep_mens_ventes['quantite'].sum()/12

28068.0


mois = ['Mars-21', 'Avril-21', 'Mai-21', 'Juin-21', 'Juilet-21',
        'Août-21', 'Sept-21', 'Oct-21', 'Nov-21', 'Dec-21', 'Janv-22', 'Fev-22']


def mois_date(ind):
    for i in range(12):
        if int(ind) == i:
            return mois[i]


#############################################################
#     Evolution mensuelle du CA en volume et en valeur      #
#############################################################

index = np.arange(12)

# Représentation sous forme de lignes brisées
fig, ax1 = plt.subplots(figsize=(18, 6))
_ = plt.xticks(index, mois)

# Evolution du CA en volume
ax1.tick_params(axis='y', labelcolor='orangered')
_ = plt.ylim(0, 35000)
_ = ax1.set_ylabel('Quantité (en nb d\'art. vendus)',
                   color='orangered', labelpad=20, fontdict={'size': 16})
a1 = rep_mens_ventes.plot(kind='line', y='quantite',
                          color='orangered', marker='o', linewidth=2, label='', ax=ax1)
_ = plt.legend(["CA volume (nb d'art)"], bbox_to_anchor=(
    0.83, 0.15), loc='upper left', fontsize=13)

# Evolution du CA en valeur
ax2 = ax1.twinx()
ax2.tick_params(axis='y', labelcolor='teal')
_ = plt.ylim(250000, 600000)
_ = ax2.set_ylabel('CA (en euros)', color='teal',
                   labelpad=20, fontdict={'size': 16})
a2 = rep_mens_ventes.plot(kind='line', y='CA', color='teal',
                          marker='o', linewidth=2, label='données estimées', ax=ax2)

_ = plt.title('Evolution mensuelle du CA en volume et en valeur',
              y=1.02, color='MidnightBlue', fontdict={'size': 20})
_ = plt.legend(["CA valeur (en €)"], bbox_to_anchor=(
    0.83, 0.1), loc='upper left', fontsize=13)

plt.grid(linestyle='--', alpha=0.3)
plt.show()

# Représentation sous forme de diagrammes en bâtons
fig, ax1 = plt.subplots(figsize=(18, 6))
width = 0.35

# Evolution du CA en volume
color = 'OrangeRed'
_ = ax1.set_xlabel('Mois', color='gray', labelpad=15, fontdict={'size': 16})
_ = ax1.set_ylabel('Quantité (en nb d\'art. vendus)',
                   color=color, labelpad=20, fontdict={'size': 16})
rects1 = ax1.bar(index - width / 2, rep_mens_ventes['quantite'],
                 width, color=color, alpha=0.8)
ax1.tick_params(axis='y', labelcolor=color)
plt.grid(linestyle='--', alpha=0.3)
_ = plt.xticks(index, mois)

# Evolution du CA en valeur
ax2 = ax1.twinx()
color = 'teal'
_ = ax2.set_ylabel('CA (en euros)', color=color,
                   labelpad=20, fontdict={'size': 16})
ax2.tick_params(axis='y', labelcolor=color)
rects2 = ax2.bar(index + width / 2, rep_mens_ventes['CA'],
                 width, color=color, alpha=0.8)
_ = plt.title('Evolution mensuelle du CA en volume et en valeur',
              y=1.02, color='MidnightBlue', fontdict={'size': 20})
_ = plt.legend((rects1[0], rects2[0]),
               ('CA volume (nb d\'art)', 'CA valeur (en €)'))
plt.grid(linestyle='--', alpha=0.3)
plt.show()


rep_week_ventes = res.groupby('s_week')['p_prix'].sum()
rep_week_ventes = rep_week_ventes.reset_index()
rep_week_ventes.columns = ['semaine', 'CA']

params = {
    'figure.figsize': (14, 6),
}
pylab.rcParams.update(params)

# Evolution hébdomadaire du CA en valeur
draw_bar_single(
    rep_week_ventes.loc[(rep_week_ventes['semaine'] > 35) & (
        rep_week_ventes['semaine'] < 45)]['semaine'],
    rep_week_ventes.loc[(rep_week_ventes['semaine'] > 35)
                        & (rep_week_ventes['semaine'] < 45)]['CA'],
    'Semaines',
    'CA (en €)',
    'Evolution hebdomadaire du CA en valeur',
    col_bar='SeaGreen',
    alpha_grid=0.2
)


rep_week_ventes_cat = res.groupby(['s_week', 'p_categ', ])['p_prix'].sum()
rep_week_ventes_cat = rep_week_ventes_cat.reset_index()
rep_week_ventes_cat.columns = ['semaine', 'categ', 'CA']
rep_semaine_cat = rep_week_ventes_cat.copy()
rep_semaine_cat['semaine'] = rep_semaine_cat['semaine'].apply(lambda x: int(x))
rep_semaine_cat['categ'] = rep_semaine_cat['categ'].astype('int')
x = rep_semaine_cat.loc[(rep_semaine_cat['semaine'] > 35)
                        & (rep_semaine_cat['semaine'] < 45)]
z = pd.pivot_table(x, values='CA', columns='categ', index='semaine')
z


colors_pers1 = ['#5893C2', '#EDC964', '#819A52', '#CBA364']
params = {
    'figure.figsize': (18, 6),
}
pylab.rcParams.update(params)
rep_semaine_cat = rep_week_ventes_cat.copy()
rep_semaine_cat['semaine'] = rep_semaine_cat['semaine'].apply(lambda x: int(x))
rep_semaine_cat['categ'] = rep_semaine_cat['categ'].astype('int')

draw_bar_several(
    dfgroupby=rep_semaine_cat.loc[(rep_semaine_cat['semaine'] > 35) & (
        rep_semaine_cat['semaine'] < 45)].groupby(['semaine', 'categ'])[['CA']].mean(),
    g_stacked=True,
    col_titre='DarkRed',
    label_x='Semaines',
    label_y='CA en valeur (€)',
    titre='\nEvolution hebdomadaire du CA par catégorie',
    alpha_grid=0.4,
    titre_y=1.05,
    title_linespacing=1,
    g_color=colors_pers1,
    legend_pers=True,
    legend=['Cat 0', 'Cat 1', 'Cat 2']
)
_ = plt.legend(['Categorie 0', 'Catégorie 1', 'Catégorie 2'],
               bbox_to_anchor=(0.6, 0.99), loc='upper left', fontsize=13)


mois2 = ['Janv-22', 'Fev-22', 'Mars-21', 'Avril-21', 'Mai-21', 'Juin-21',
         'Juilet-21', 'Août-21', 'Sept-21', 'Oct-21', 'Nov-21', 'Dec-21']

ca_jour = res[['p_prix', 's_date']]
ca_jour = ca_jour.groupby(['s_date']).sum()
ca_jour = ca_jour.reset_index()
ca_jour['mois'] = ca_jour['s_date'].dt.month
ca_jour['m'] = [mois2[x-1] for x in ca_jour['mois']]
ca_jour


ca_jour.plot(kind='line', x='s_date', y='p_prix', figsize=(16,6), linewidth=2, label='')
plt.title('Evolution quotidienne du CA en valeur',
              fontdict={'size': 20, 'weight': 500})
plt.xlabel('')
plt.ylabel('CA en €', labelpad=15, color='gray', fontdict={'size': 16})
plt.grid(linestyle='--', alpha=0.3)
plt.legend('')
plt.show();


ca_jour_limited = ca_jour.loc[(ca_jour['s_date'] >= pd.Timestamp(
    2021, 9, 15)) & (ca_jour['s_date'] < pd.Timestamp(2021, 11, 15)), :]
_ = ca_jour_limited.plot(kind='line', x='s_date', y='p_prix', linewidth=2, label='')
_ = plt.title('Evolution du CA en valeur',
              fontdict={'size': 20, 'weight': 500})
_ = plt.xlabel('')
_ = plt.ylabel('CA en €', labelpad=15, color='gray', fontdict={'size': 16})
_ = plt.legend([])
_ = plt.grid(linestyle='--', alpha=0.3)
_ = plt.show()


import locale
_ = locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')

ca_jour_categ = res[['p_prix', 'p_categ', 's_date']]
ca_jour_categ.reset_index(inplace=True, drop=True)
ca_jour_categ = ca_jour_categ.groupby(
    ['s_date', 'p_categ']).agg(['count', 'sum'])
ca_jour_categ = ca_jour_categ.unstack()
ca_jour_categ.reset_index(inplace=True)
ca_jour_categ.columns = ['date', 'nb_p0',
                         'nb_p1', 'nb_p2', 'ca_p0', 'ca_p1', 'ca_p2']
ca_jour_categ['day'] = ca_jour_categ['date'].dt.day_name('fr_FR')
ca_jour_categ.loc[ca_jour_categ['ca_p1'].isna(), ['date', 'day']]


start = ca_jour_categ.iloc[215, 0]
end = ca_jour_categ.iloc[240, 0]
print('On constate l\'absence de vente de produits de catégorie 1 pour la période suivante : ')
print(f'  - du {ca_jour_categ.iloc[215,7]} {start.strftime("%d")} {start.strftime("%B")} au {ca_jour_categ.iloc[240,7]} {end.strftime("%d")} {end.strftime("%B")} {end.year}')

On constate l'absence de vente de produits de catégorie 1 pour la période suivante : 
  - du Samedi 02 octobre au Mercredi 27 octobre 2021


inter = [215+x for x in range(26)]


def _color_red(val):
    df = ca_jour_interpol
    for i in inter:
        if val == df.iloc[i, 2] or val == df.iloc[i, 5]:
            return 'background-color: %s' % '#eb2f0e'
        if val == df.iloc[i, 7] or val == df.iloc[i, 8]:
            return 'background-color: %s' % '#f9ab9d'


ca_jour_interpol = ca_jour_categ.copy()
ca_jour_interpol = ca_jour_interpol.drop('day', axis=1)
ca_jour_interpol = ca_jour_interpol.interpolate()
ca_jour_interpol['nb_total'] = ca_jour_interpol['nb_p0'] + ca_jour_interpol['nb_p1']+ca_jour_interpol['nb_p2']
ca_jour_interpol['ca_total'] = ca_jour_interpol['ca_p0'] + ca_jour_interpol['ca_p1']+ca_jour_interpol['ca_p2']
ca_jour_interpol.reset_index(inplace=True, drop=True)
ca_jour_interpol_es = ca_jour_interpol.iloc[220:223, :]
ca_jour_interpol_es = ca_jour_interpol_es.style.applymap(_color_red).set_precision(2)
ca_jour_interpol_es


ca_jour_limited = ca_jour.loc[(ca_jour['s_date'] >= pd.Timestamp(
    2021, 10, 1)) & (ca_jour['s_date'] < pd.Timestamp(2021, 10, 29)), :]

ca_jour_interpol['mois'] = ca_jour_interpol['date'].dt.month
ca_jour_interpol['m'] = [mois2[x-1] for x in ca_jour_interpol['mois']]
ca_jour_limited.columns=['date','p_prix','mois','m']
ca_jour_limited


ca_jour_limited = ca_jour.loc[(ca_jour['s_date'] >= pd.Timestamp(
    2021, 10, 1)) & (ca_jour['s_date'] < pd.Timestamp(2021, 10, 29)), :]
ca_jour_limited.columns=['date','p_prix','mois','m']

ca_jour_interpol['mois'] = ca_jour_interpol['date'].dt.month
ca_jour_interpol['m'] = [mois2[x-1] for x in ca_jour_interpol['mois']]

ax = plt.gca()
ca_jour_interpol.plot(kind='line', y='ca_total', x='date', color='red',
                          linewidth=2, label='données estimées', figsize=(16,6), ax=ax)
ca_jour_limited.plot(kind='line', y='p_prix', x='date', color='gray',
                         linewidth=1, label='données réelles', style='--', ax=ax)
plt.title('Evolution du CA en valeur',
              fontdict={'size': 20, 'weight': 500})
plt.xlabel('')
plt.ylabel('CA en €', labelpad=15, color='gray', fontdict={'size': 16})
plt.grid(linestyle='--', alpha=0.4)
plt.show();


_ = plt.gcf().subplots_adjust(left=0.125, bottom=0.2, right=1.5,
                              top=0.9, wspace=0.2, hspace=0)
_ = plt.figure(figsize=(18, 4))
_ = plt.subplot(1, 2, 1)

# Evolution du CA - sans estimation
draw_bar_single(
    rep_week_ventes['semaine'],
    rep_week_ventes['CA'],
    'Semaines',
    'CA (en €)',
    'CA en valeur par semaine - Sans estimation',
    col_bar='darkorange',
    alpha_grid=0.2
)

ca_jour_interpol['week'] = (
    ca_jour_interpol['date'].dt.strftime('%V')).astype('int')
ca_week_interpol = ca_jour_interpol.groupby('week').sum()
ca_week_interpol = ca_week_interpol.reset_index()
ca_week_interpol.columns = ['week', 'nb_p0', 'nb_p1', 'nb_p2',
                            'ca_p0', 'ca_p1', 'ca_p2', 'nb_total', 'ca_total', 'mois']
ca_week_interpol = ca_week_interpol[[
    'week', 'nb_p0', 'nb_p1', 'nb_p2', 'ca_p0', 'ca_p1', 'ca_p2', 'nb_total', 'ca_total']]

# Evolution du CA - avec estimation
_ = plt.subplot(1, 2, 2)
data = ca_week_interpol['ca_total']
x = ca_week_interpol['week']
_ = plt.style.use('seaborn-whitegrid')
_ = plt.xlabel('Semaine', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('', color='gray', fontdict={'size': 0})
_ = plt.title('CA en valeur par semaine - Avec estimation',
              color='MidnightBlue', fontdict={'size': 20})
for i in range(52):
    if i > 38 and i < 44:
        a1 = plt.bar(x[i], data[i],
                     align='center', color='red', alpha=0.7)
    else:
        b1 = plt.bar(x[i], data[i],
                     align='center', color='SeaGreen', alpha=0.7)
_ = plt.grid(linestyle='--', alpha=0.3)
_ = plt.legend([b1, a1], ['données réelles',
                          'données estimées'], prop={'size': 10})
_ = plt.show();

<Figure size 1296x432 with 0 Axes>


ca_week_interpol2 = ca_week_interpol.copy()
ca_week_interpol2.columns = ['week', 'cat_0_nb', 'cat_1_nb', 'cat_2_nb',
                             'cat_0_CA', 'cat_1_CA', 'cat_2_CA', 'nb_total', 'ca_total']
ca_week_interpol2['nb_total'] = ca_week_interpol2['cat_0_nb'] + \
    ca_week_interpol2['cat_1_nb'] + ca_week_interpol2['cat_2_nb']
ca_week_interpol2['ca_total'] = ca_week_interpol2['cat_0_CA'] + \
    ca_week_interpol2['cat_1_CA'] + ca_week_interpol2['cat_2_CA']


params = {
    'figure.figsize': (24, 6),
}
pylab.rcParams.update(params)

ca_week_interpol2['cat_0_CA'] = round(ca_week_interpol2['cat_0_CA'], 2)
ca_week_interpol2['cat_1_CA'] = round(ca_week_interpol2['cat_1_CA'], 2)
ca_week_interpol2['cat_2_CA'] = round(ca_week_interpol2['cat_2_CA'], 2)

ca1 = ca_week_interpol2.loc[ca_week_interpol2['week'] < 39]
ca2 = ca_week_interpol2.loc[(ca_week_interpol2['week'] > 37) & (
    ca_week_interpol2['week'] < 45)]
ca3 = ca_week_interpol2.loc[ca_week_interpol2['week'] > 43]

# Evolution hebdomadaire du CA en valeur par catégorie
ax2 = plt.subplot(1, 2, 2)
loc = mtick.MultipleLocator(base=4.0)
ax2.xaxis.set_major_locator(loc)
_ = ca_week_interpol2.plot(kind='line', x='week', y='cat_0_CA',
                           color='#5893C2', linewidth=3,  label='Catégorie 0', ax=ax2)
_ = ca1.plot(kind='line', x='week', y='cat_1_CA', color='#EDC964',
             linewidth=3,  label='Catégorie 1', ax=ax2)
_ = ca2.plot(kind='line', x='week', y='cat_1_CA', color='#EDC964',
             linewidth=3, linestyle=':', label='Estimation', ax=ax2)
_ = ca3.plot(kind='line', x='week', y='cat_1_CA',
             color='#EDC964', linewidth=3, label='', ax=ax2)
_ = ca_week_interpol2.plot(kind='line', x='week', y='cat_2_CA',
                           color='#819A52', linewidth=3, label='Catégorie 2', ax=ax2)

a_patch = mpatches.Patch(color='#5893C2', label='Catégorie 0')
b_patch = mpatches.Patch(color='#EDC964', label='Catégorie 1')
c_patch = mpatches.Patch(color='#EDC964', label='Estimation')
d_patch = mpatches.Patch(color='#819A52', label='Catégorie 2')
_ = plt.legend(handles=[a_patch, b_patch, c_patch, d_patch])

_ = plt.legend(bbox_to_anchor=(0.01, 0.18), loc='upper left', fontsize=9)
_ = plt.xlabel('Semaines', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('CA en valeur (en €)', labelpad=15,
               color='gray', fontdict={'size': 16})
_ = plt.title('Evolution hebdomadaire du CA en valeur par catégorie',
              color='darkred', fontdict={'size': 16})
plt.grid(linestyle='--', alpha=0.4)
plt.show()


index = np.arange(52)

fig, ax1 = plt.subplots(figsize=(18, 6))

ax1.tick_params(axis='y', labelcolor='orangered')
_ = plt.ylim(4000, 9000)
_ = plt.fill_between((39, 43), 9000, facecolor='orange', alpha=0.4)
_ = ax1.annotate('estimation', xy=(41, 8000), xytext=(48, 8600), size=14, color='0.4', va="center", ha="center",
                 bbox=dict(boxstyle="round4", fc="0.8"),
                 arrowprops=dict(arrowstyle="-|>", color='0.5',
                                 connectionstyle="arc3,rad=0.2",
                                 fc="w"),)
_ = ax1.set_ylabel('Quantité (en nb d\'art. vendus)',
                   color='orangered', labelpad=20, fontdict={'size': 16})
a1 = ca_week_interpol2.plot(kind='line', y='nb_total',
                            color='orangered', marker='o', linewidth=2, label='', ax=ax1)
_ = plt.legend(["CA volume (nb d'art)"], bbox_to_anchor=(
    0.83, 0.15), loc='upper left', fontsize=13)

ax2 = ax1.twinx()
ax2.tick_params(axis='y', labelcolor='teal')
_ = plt.ylim(90000, 150000)
_ = ax2.set_ylabel('CA (en euros)', color='teal',
                   labelpad=20, fontdict={'size': 16})
a2 = ca_week_interpol2.plot(kind='line', y='ca_total',
                            color='teal',  marker='o', linewidth=2, label='', ax=ax2)
_ = plt.title('Evolution hebdomadaire du CA en volume et en valeur',
              y=1.02, color='MidnightBlue', fontdict={'size': 20})
_ = plt.legend(["CA valeur (en €)"], bbox_to_anchor=(
    0.83, 0.1), loc='upper left', fontsize=13)
_ = plt.grid(linestyle='--', alpha=0.3)
_ = plt.show()


res_estimated = res.copy()
res_mask = res_estimated['s_month'] != 10.0
resultat = res_estimated[res_mask]
resultat.sample(3)


CA = resultat['p_prix'].sum()
print(f"Le CA corrigé sur un an s'élève à {int(CA)}€, soit {int(CA/11)}€ par mois")

Le CA corrigé sur un an s'élève à 5478544€, soit 498049€ par mois


params = {
    'figure.figsize': (6, 6),
}
pylab.rcParams.update(params)

labels = []
for i in range(0, 3):
    labels.append('Catégorie ' + str(i))

fig = plt.figure(figsize=(18, 6))

draw_bar_pie(
    z=ax1, a=1, b=2, c=1,
    titre='Répartition annuelle du CA en valeur (en €)',
    labels_in=labels,
    size_in=resultat.groupby('p_categ')['p_prix'].sum(),
    explode_in=(0, 0.1, 0),
    titre_legend='Répartition'
)

draw_bar_pie(
    z=ax2, a=1, b=2, c=2, def_angle=60,
    titre='Répartition annuelle du CA en volume (en nombre d\'articles)',
    labels_in=labels,
    size_in=resultat.groupby('p_categ')['p_prix'].count(),
    explode_in=(0, 0.1, 0),
    size_perc=14,
    titre_legend='Répartition',
    legend_available=False
)


def cal_mois(mois):
    for i in range(12):
        i = int(mois)
        if i > 2:
            return int(i-2)
        else:
            return int(i+10)

def calc2(mois):
    for i in range(12):
        i = int(mois)
        if i > 8:
            return int(i-1)
        else:
            return int(i)

rep_m_ventes = resultat.groupby(['s_year', 's_month', 'p_categ'])['p_prix'].sum()
rep_m_ventes = rep_m_ventes.reset_index()
rep_m_ventes_cat = rep_m_ventes.copy()
rep_m_ventes_cat['month'] = rep_m_ventes_cat['s_month'].apply(lambda x: cal_mois(x))
rep_m_ventes_cat['month'] = rep_m_ventes_cat['month'].apply(lambda x: calc2(x))
rep_m_ventes_cat['categ'] = rep_m_ventes_cat['p_categ'].astype('int')

rep_m_ventes_nb = resultat.groupby(['s_year', 's_month', 'p_categ'])['p_prix'].count()
rep_m_ventes_nb = rep_m_ventes_nb.reset_index()
rep_m_ventes_nb_cat = rep_m_ventes_nb.copy()
rep_m_ventes_nb_cat['month'] = rep_m_ventes_nb_cat['s_month'].apply(lambda x: cal_mois(x))
rep_m_ventes_nb_cat['month'] = rep_m_ventes_nb_cat['month'].apply(lambda x: calc2(x))
rep_m_ventes_nb_cat['categ'] = rep_m_ventes_nb_cat['p_categ'].astype('int')

colors_pers1 = ['#5893C2', '#EDC964', '#819A52', '#CBA364']
mois3 = ['', 'Mars-21', 'Avril-21', 'Mai-21', 'Juin-21', 'Juilet-21', 'Août-21', 'Sept-21', 'Nov-21', 'Dec-21', 'Janv-22', 'Fev-22']

fig = plt.figure(figsize=(32, 16))
plt.subplots_adjust(wspace=0.3, hspace=0.3)


ax1 = fig.add_subplot(2, 2, 1)
index1 = np.arange(12)
rep_m_ventes_cat.groupby(['month', 'categ'])[['p_prix']].mean().unstack().plot(
    kind='bar',
    stacked=True,
    width=0.8,
    color=colors_pers1, ax=ax1)
plt.title('CA mensuel en valeur par catégorie', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('Mois', color='white', fontdict={'size': 0})
plt.ylabel('CA en valeur (€)', color='gray', labelpad=10, fontdict={'size': 16})
plt.xticks(index1, mois3, rotation='horizontal')
plt.legend(['Categorie 0', 'Catégorie 1', 'Catégorie 2'], bbox_to_anchor=(1.02, 0.80),
               loc='upper left', fontsize=12,  frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True)
plt.grid(linestyle='--', alpha=0.4)


ax2 = fig.add_subplot(2, 2, 2)
index2 = np.arange(12)
rep_m_ventes_nb_cat.groupby(['month', 'categ'])[['p_prix']].mean().unstack().plot(
    kind='bar',
    stacked=True,
    width=0.8,
    color=colors_pers1, ax=ax2)
plt.title('CA mensuel en volume par catégorie', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('Mois', color='white', fontdict={'size': 0})
plt.ylabel('Nombre d\'articles vendus', color='gray', labelpad=10, fontdict={'size': 16})
plt.xticks(index2, mois3, rotation='horizontal')
ax2.get_legend().remove()
plt.grid(linestyle='--', alpha=0.4)


ax3 = fig.add_subplot(2, 2, 3)
rep_m_ventes_cat.groupby(['month', 'categ'])[['p_prix']].mean().groupby(level=0).apply(lambda x: 100 * x / x.sum()).unstack().plot(
    kind='bar',
    stacked=True,
    width=0.8,
    color=color1, ax=ax3)
index3 = np.arange(12)
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title('Répartition mensuelle du CA en valeur', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('Mois', color='white', fontdict={'size': 0})
plt.ylabel('CA en valeur (€)', color='gray', labelpad=10, fontdict={'size': 16})
plt.xticks(index3, mois3, rotation='horizontal')
plt.legend(['Categorie 0', 'Catégorie 1', 'Catégorie 2'], bbox_to_anchor=(1.025, 0.80),
               loc='upper left', fontsize=12,  frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True)
plt.grid(linestyle='--', alpha=0.4)


ax4 = fig.add_subplot(2, 2, 4)
index4 = np.arange(12)
rep_m_ventes_nb_cat.groupby(['month', 'categ'])[['p_prix']].mean().groupby(level=0).apply(lambda x: 100 * x / x.sum()).unstack().plot(
    kind='bar',
    stacked=True,
    width=0.8,
    color=color1, ax=ax4)
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.title('Répartition mensuelle du CA en volume', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('Mois', color='white', fontdict={'size': 0})
plt.ylabel('Nombre d\'articles vendus', color='gray',labelpad=10, fontdict={'size': 16})
plt.xticks(index4, mois3, rotation='horizontal')
ax4.get_legend().remove()
plt.grid(linestyle='--', alpha=0.4)

plt.show();


rep_n = resultat.groupby(['p_categ', 'p_id'])[['p_prix']].count()
rep_n.reset_index(inplace=True)
n = rep_n.nlargest(10, 'p_prix').sort_values('p_prix')
color_seq_n = ['#f46a24', '#f56621', '#f6621e', '#f85e1b', '#f95a17', '#fa5513', '#fb510f', '#fd4c0a', '#fe4605', '#ff4000']

rep_t = resultat.groupby(['p_categ', 'p_id'])[['p_prix']].sum()
rep_t.reset_index(inplace=True)
t = rep_t.nlargest(10, 'p_prix').sort_values('p_prix')
color_seq_p = ['#008080', '#007d7d', '#007a7a', '#007677', '#007373', '#007070', '#006d6d', '#006a6a', '#006767', '#006364']

fig = plt.figure(figsize=(22, 6))
ax1 = fig.add_subplot(121)
x1 = n['p_id']
y1 = n['p_prix']
ax1.barh(x1, y1, color=color_seq_n)
for i, (p_prix, p_id) in enumerate(zip(n['p_prix'], n['p_id'])):
    plt.text(p_prix-10, p_id, int(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.title('Les 10 produits les plus vendus (en terme de quantité)', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('Nombre d\'articles vendus sur l\'année', color='gray', labelpad=10, fontdict={'size': 16})
plt.ylabel('ID du produit', color='gray', labelpad=10, fontdict={'size': 16})

ax2 = fig.add_subplot(122)
x2 = t['p_id']
y2 = t['p_prix']
_ = ax2.barh(x2, y2, color=color_seq_p)
for i, (p_prix, p_id) in enumerate(zip(t['p_prix'], t['p_id'])):
    plt.text(p_prix-500, p_id, "{:.2f}€".format(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.title('Les 10 produits les plus rémunérateurs (en terme de CA)', color='DarkRed', fontdict={'size': 20, 'weight': 500})
plt.xlabel('CA annuel généré (en euros)', color='gray', labelpad=10, fontdict={'size': 16})
plt.ylabel('')

plt.show();


t_liste = set(t['p_id'])
n_liste = set(n['p_id'])
x = t_liste & n_liste
xx = list(x)
print('Les produits les plus vendus, tant en volume que en valeur, sont les produits d\'ID :')
for i in range(len(x)):
    print(f'  - {xx[i]}')

Les produits les plus vendus, tant en volume que en valeur, sont les produits d'ID :
  - 1_369
  - 1_414


rep_q_f = resultat[resultat['c_sex'] == 'f'].groupby(['p_categ', 'p_id'])[['p_prix']].count()
rep_q_f.reset_index(inplace=True)
qf = rep_q_f.nlargest(10, 'p_prix').sort_values('p_prix')
qf
rep_q_h = resultat[resultat['c_sex'] == 'm'].groupby(['p_categ', 'p_id'])[['p_prix']].count()
rep_q_h.reset_index(inplace=True)
qh = rep_q_h.nlargest(10, 'p_prix').sort_values('p_prix')
qh

liste_f=list(qf['p_id'])
liste_h=list(qh['p_id'])
color_bar = ['darkorange' if x in liste_f  else 'teal' for x in liste_h]
fig = plt.figure(figsize=(26, 6))
plt.suptitle("Les 10 produits les plus vendus en volume (en terme de quantité)", color='DarkRed', y=0.98, fontsize=24)
ax1 = fig.add_subplot(121)
x = qf['p_id']
y = qf['p_prix']
ax1.barh(x, y, color=color_bar, alpha=0.8)
for i, (p_prix, p_id) in enumerate(zip(qf['p_prix'], qf['p_id'])):
    plt.text(p_prix-10, p_id, int(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.xlabel('Femmes', color='navy', labelpad=20, fontdict={'size': 18})
plt.ylabel('ID du produit', color='gray',labelpad=10, fontdict={'size': 16})
plt.grid(linestyle='--', alpha=0.4)
or_patch = mpatches.Patch(color='darkorange', alpha=0.8, label='En commun aux 2 sexes')
teal_patch = mpatches.Patch(color='teal', alpha=0.8, label='Spécifique à un sexe')
plt.legend(handles=[or_patch, teal_patch], bbox_to_anchor=(1.19, -0.06),
               fontsize=12,  frameon=True, ncol=1, fancybox=False, framealpha=1, shadow=False)

ax2 = fig.add_subplot(122)
x = qh['p_id']
y = qh['p_prix']
ax2.barh(x, y, color=color_bar, alpha=0.8)
for i, (p_prix, p_id) in enumerate(zip(qh['p_prix'], qh['p_id'])):
    plt.text(p_prix-10, p_id, int(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.xlabel('Hommes', color='navy', labelpad=20, fontdict={'size': 18})
plt.ylabel('')
plt.grid(linestyle='--', alpha=0.4)
plt.show();


rep_m_f = resultat[resultat['c_sex'] == 'f'].groupby(['p_categ', 'p_id'])[['p_prix']].sum()
rep_m_f.reset_index(inplace=True)
mf = rep_m_f.nlargest(10, 'p_prix').sort_values('p_prix')

rep_m_h = resultat[resultat['c_sex'] == 'm'].groupby(['p_categ', 'p_id'])[['p_prix']].sum()
rep_m_h.reset_index(inplace=True)
mh = rep_m_h.nlargest(10, 'p_prix').sort_values('p_prix')

liste_f=list(mf['p_id'])
liste_h=list(mh['p_id'])
color_bar = ['blue' if x in liste_f  else 'gray' for x in liste_h]
fig = plt.figure(figsize=(26, 6))
plt.suptitle('Les 10 produits les plus rémunérateurs (en terme de CA)', color='DarkRed', y=0.98, fontsize=24)
ax1 = fig.add_subplot(121)
x = mf['p_id']
y = mf['p_prix']
ax1.barh(x, y, color=color_bar, alpha=0.8)
for i, (p_prix, p_id) in enumerate(zip(mf['p_prix'], mf['p_id'])):
    plt.text(p_prix-500, p_id, "{:.2f}€".format(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.xlabel("Femmes\n", color='navy', labelpad=25, fontdict={'size': 18, 'weight':500})
plt.ylabel('ID du produit', color='gray',labelpad=10, fontdict={'size': 16})
navy_patch = mpatches.Patch(color='navy', alpha=0.8, label='En commun aux 2 sexes')
gray_patch = mpatches.Patch(color='gray', alpha=0.8, label='Spécifique à un sexe')
plt.legend(handles=[navy_patch, gray_patch], bbox_to_anchor=(1.23, -0.12),
               fontsize=12,  frameon=True, ncol=1, fancybox=False, framealpha=1, shadow=False)
plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(122)
x = mh['p_id']
y = mh['p_prix']
ax2.barh(x, y, color=color_bar, alpha=0.8)
for i, (p_prix, p_id) in enumerate(zip(mh['p_prix'], mh['p_id'])):
    plt.text(p_prix-500, p_id, "{:.2f}€".format(p_prix), fontsize=14, color='white', va='center', ha='right')
plt.xlabel('Hommes', color='navy', labelpad=25, fontdict={'size': 18, 'weight':500})
plt.ylabel('')
plt.grid(linestyle='--', alpha=0.4)
plt.show();


h_cat2 = mh[mh['p_categ'] == 2.]['p_categ'].count()/10;
f_cat2 = mf[mf['p_categ'] == 2.]['p_categ'].count()/10;

print('La proportion de produits de catégorie 2 parmi les 10 produits les plus rémunérateurs :')
print(f'  - {int(h_cat2*100)}% chez les hommes')
print(f'  - {int(f_cat2*100)}% chez les femmes')
print()
print('Info intéressante dans l\'optique d\'une segmentation clients ou d\'une opération marketing ciblée')

La proportion de produits de catégorie 2 parmi les 10 produits les plus rémunérateurs :
  - 90% chez les hommes
  - 50% chez les femmes

Info intéressante dans l'optique d'une segmentation clients ou d'une opération marketing ciblée


buyers = resultat.groupby(['c_id', 'c_sex']).count()
buyers.reset_index(inplace=True)
a = buyers.loc[buyers.c_sex == 'm']['c_sex'].count()
b = buyers.loc[buyers.c_sex == 'f']['c_sex'].count()
_ = plt.figure(figsize=(12, 5))
x = ['Hommes', 'Femmes']
y = [int(a), int(b)]
_ = plt.bar(x, y, width=0.6, color='teal')
_ = plt.title('Répartition Hommes-Femmes des clients actifs', size=20, y=1.03)
_ = plt.ylabel('Effectif', color='gray', labelpad=12, fontsize=16)
_ = plt.xticks(color='darkred', fontsize=16)
for i, v in enumerate(y):
    _ = plt.text(i-.04, v-500, v, fontsize=18, color='white')
_ = plt.show()


buyers = resultat.groupby(['c_id', 'c_age', 'c_sex']).size().to_frame('size')
buyers = buyers.unstack().reset_index()
buyers.columns = ['c_id', 'c_age', 'f', 'm']
buy = buyers.copy()
buyers = buyers.groupby('c_age').count()
buyers = buyers.reset_index()
buyers.columns = ['c_age', 'total', 'f', 'm']
buyers


data = buy['c_age']
_ = plt.xlim(10, 100)
_ = plt.hist(data, bins=40)
_ = plt.title('Distribution de l\'âge des clients', size=20, y=1.03)
_ = plt.ylabel('Effectif', color='gray', labelpad=12, fontsize=16)
_ = plt.xlabel('Age', color='gray', labelpad=12, fontsize=16)
_ = plt.xticks(color='darkred', fontsize=16)
plt.show()
buy


buyers_s = resultat.groupby(['c_id', 'c_age', 'c_sex']).size().to_frame('size')
buyers_s.reset_index(inplace=True)
buyers_s.columns = ['c_id', 'c_age', 'c_sex', 't']
buyers_s = buyers_s[['c_age', 'c_sex']]


data_t = buyers_s['c_age']
data_h = buyers_s[buyers_s['c_sex'] == 'm']['c_age']
data_f = buyers_s[buyers_s['c_sex'] == 'f']['c_age']
fig = plt.figure(figsize=(16, 4))


ax2 = fig.add_subplot(1, 2, 1)
_ = plt.xlim(10, 100)
_ = plt.hist(data_h, bins=40, label='femmes')
_ = plt.title('Distribution de l\'âge des clients hommes', size=20, y=1.03)
_ = plt.xlabel('Age', color='gray', labelpad=12, fontsize=16)
_ = plt.xticks(color='darkred', fontsize=16)

ax3 = fig.add_subplot(1, 2, 2)
_ = plt.xlim(10, 100)
_ = plt.hist(data_f, bins=40, label='hommes')
_ = plt.title('Distribution de l\'âge des clients femmes', size=20, y=1.03)
_ = plt.xlabel('Age', color='gray', labelpad=12, fontsize=16)
_ = plt.xticks(color='darkred', fontsize=16)

plt.show()


res_clients = resultat.copy()
res_clients = res_clients.groupby(['c_id', 'c_sex', 'c_age', 's_panier_id', 's_id_date', 'p_categ']).agg({
    'p_prix': ['count', 'sum']
})
res_clients = res_clients.unstack()
res_clients = res_clients.reset_index()

res_clients.columns = ['c_id', 'c_sex', 'c_age', 's_panier_id', 's_id_date', 'nb_art_cat_0',
                       'nb_art_cat_1', 'nb_art_cat_2', 'montant_cat_0', 'montant_cat_1', 'montant_cat_2']
res_clients = res_clients.fillna(0)

res_clients.insert(6, 'nb_art_panier', res_clients['nb_art_cat_0'] + res_clients['nb_art_cat_1'] + res_clients['nb_art_cat_2'])
res_clients['montant_panier'] = res_clients['montant_cat_0'] + res_clients['montant_cat_1'] + res_clients['montant_cat_2']
res_clients_all_orders = res_clients.groupby(['c_id', 'c_sex', 'c_age', 's_panier_id']).agg({
    's_id_date': 'min',
    'nb_art_panier': 'sum',
    'montant_panier': 'sum',
    'nb_art_cat_0': 'sum',
    'nb_art_cat_1': 'sum',
    'nb_art_cat_2': 'sum',
    'montant_cat_0': 'sum',
    'montant_cat_1': 'sum',
    'montant_cat_2': 'sum'
})
res_clients_all_orders = res_clients_all_orders.reset_index()
res_clients_all_orders.columns = ['c_id', 'c_sex', 'c_age', 's_panier_id', 's_id_date', 'nb_art_panier',
                                  'montant_panier', 'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2', 'montant_cat_0', 'montant_cat_1', 'montant_cat_2']
#res_clients_all_orders.groupby(['c_id', 'c_sex', 'c_age', 's_panier_id']).max().loc[:, ['s_id_date', 'nb_art_panier',
#                                                                                        'montant_panier', 'nb_art_cat_0', 'nb_art_cat_1', 
#                                                                                        'nb_art_cat_2', 'montant_cat_0', 'montant_cat_1', 
#                                                                                        'montant_cat_2']]
res_clients_year_temp = res_clients_all_orders.groupby(['c_id', 'c_sex', 'c_age']).agg({
    's_panier_id': 'nunique',
    'nb_art_panier': 'sum',
    'montant_panier': 'sum',
    'nb_art_cat_0': 'sum',
    'nb_art_cat_1': 'sum',
    'nb_art_cat_2': 'sum',
    'montant_cat_0': 'sum',
    'montant_cat_1': 'sum',
    'montant_cat_2': 'sum'
})
res_clients_year = res_clients_year_temp.copy()
res_clients_year = res_clients_year.reset_index()
res_clients_year.columns = ['c_id', 'c_sex', 'c_age', 'nb_commandes', 'nb_articles', 'montant_total',
                            'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2', 'montant_cat_0', 'montant_cat_1', 'montant_cat_2']


print("Df des commandes ('res_clients_all_orders') :")
res_clients_all_orders.sample(2)
print()
print("Df récapitulatif des clients ('res_clients_year') :")
res_clients_year.sample(2)

Df des commandes ('res_clients_all_orders') :

Df récapitulatif des clients ('res_clients_year') :


plt.figure(figsize=(14,8))
plt.hist(resultat['p_prix'], bins = range(0,300,5), alpha=0.7)
plt.axis([0, 300, 0, 85000])
plt.xlabel('prix')
plt.ylabel('Nombre de produits')
plt.title('Répartition des produits achetés selon leur prix',fontsize=20)
plt.show();


res_clients_year.describe().T


res_clients_year['montant_total'].nlargest(7)

677     150729.070000
4387    137151.480000
6336     69405.635589
2723     52744.145589
7790      2436.232795
7119      2406.170000
7005      2366.200000
Name: montant_total, dtype: float64


clients_outliers = ['c_1609', 'c_4958', 'c_6714', 'c_3454']
df_outliers = []
for i in range(4):
    x = str(clients_outliers[i])
    df = resultat.loc[resultat['c_id'] == x]
    df = df.groupby(['s_panier_id', 's_id_date', 'p_categ']).agg({
        'p_prix': ['min', 'count']
    })
    df = df.reset_index()
    df.columns = ['s_panier_id', 'date', 'p_categ', 'p_prix', 'p_id']
    df = df.groupby(['s_panier_id', 'p_categ']).agg(**{
        'qte': pd.NamedAgg(column='p_categ', aggfunc='count'),
        'montant': pd.NamedAgg(column='p_prix', aggfunc='sum'),
        'date': pd.NamedAgg(column='date', aggfunc='min')
    })
    df = df.reset_index()
    df.columns = ['id_panier', 'categ', 'quantite', 'montant', 'date_session']
    df.insert(0, 'client_id', x)
    df_outliers.append(df)
    df.head(2)


colors_pers1 = ['#5893C2', '#EDC964', '#819A52', '#CBA364']
figure = plt.figure(figsize=(20, 10))
plt.gcf().subplots_adjust(left=0.125, bottom=0.1, right=0.9,
                          top=0.9, wspace=0.2, hspace=0.3)

for i in range(4):
    _ = plt.subplot(2, 2, i+1)
    x = str(clients_outliers[i])
    df = df_outliers[i]
    df0 = df[df['categ'] == 0.0]
    df1 = df[df['categ'] == 1.0]
    df2 = df[df['categ'] == 2.0]
    df_cat = [df0, df1, df2]
    r0 = plt.scatter(df_cat[0]['date_session'], df_cat[0]
                     ['montant'], s=15, c=colors_pers1[0])
    r1 = plt.scatter(df_cat[1]['date_session'],
                     df_cat[1]['montant'], s=15, c='tomato')
    r2 = plt.scatter(df_cat[2]['date_session'], df_cat[2]
                     ['montant'], s=15, c=colors_pers1[2])
    _ = plt.yticks(position=(0.01,0))
    _ = plt.title('Achats du client id='+x+' par catégories',
                  color='navy', fontdict={'size': 16, 'weight': 500})
    if i % 2 == 0:
        _ = plt.ylabel('Montant de la commande', labelpad=15,
                       color='gray', fontdict={'size': 16})
    _ = plt.grid(linestyle='--', alpha=0.3)
    legend = ['Cat 0', 'Cat 1', 'Cat 2']
    if i == 2:
        _ = plt.legend([r0, r1, r2], ['Cat 0', 'Cat 1', 'Cat 2'], bbox_to_anchor=(
            1.01, 1.2), loc='upper left', frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=False)

_ = plt.show()


resultat.insert(0, 'c_nature', 'particulier')
res_clients_all_orders.insert(0, 'c_nature', 'particulier')
res_clients_year.insert(0, 'c_nature', 'particulier')

resultat.loc[resultat['c_id'].isin(
    clients_outliers), 'c_nature'] = 'professionnel'
res_clients_all_orders.loc[res_clients_all_orders['c_id'].isin(clients_outliers), 'c_nature'] = 'professionnel'
res_clients_all_orders.groupby(['c_nature', 'c_id', 'c_sex', 'c_age', 's_panier_id']).max().loc[:, ['s_id_date', 'nb_art_panier', 'montant_panier', 
                                                                                                    'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2', 
                                                                                                    'montant_cat_0', 'montant_cat_1', 'montant_cat_2']]
res_clients_year.loc[res_clients_year['c_id'].isin(clients_outliers), 'c_nature'] = 'professionnel'


ca_annuel = res_clients_year['montant_total'].sum()
ca_clients_pro = res_clients_year[res_clients_year['c_nature']
                                  == 'professionnel']['montant_total'].sum()
tx_ca_premiums = round((ca_clients_pro/ca_annuel)*100, 2)

print(f'Ces 4 clients représentent {tx_ca_premiums}% du CA total')

Ces 4 clients représentent 7.48% du CA total


def gini(arr):
    count = arr.size
    coefficient = 2 / count
    indexes = np.arange(1, count + 1)
    weighted_sum = (indexes * arr).sum()
    total = arr.sum()
    constant = (count + 1) / count
    return coefficient * weighted_sum / total - constant


arr = res_clients_year['montant_total'].sort_values()
gini_value = gini(arr)
n = len(arr)
lorenz = np.cumsum(np.sort(arr)) / arr.sum()
lorenz = np.append([0], lorenz)

x = [0, 1]
y = [0, 1]
fig = plt.figure(figsize=(8, 8.15))
ax = fig.add_subplot(111)
ax.plot(x, y)
plt.axis('equal')
plt.vlines(0.8, ymin=0, ymax=0.51, ls='--', color='orange', alpha=0.5)
plt.hlines(0.51, xmin=0, xmax=0.8, ls='--', color='orange', alpha=0.5)
plt.vlines(0.5, ymin=0, ymax=0.2, ls='--', color='orange', alpha=0.5)
plt.hlines(0.2, xmin=0, xmax=0.5, ls='--', color='orange', alpha=0.5)
plt.tick_params(axis='both', which='major', labelsize=14)
xaxis = np.linspace(0, 1, n+1)
ax.plot(xaxis, lorenz, drawstyle='steps-post', color='red', label='Courbe de Lorenz')
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.fill_between(xaxis, xaxis, lorenz, alpha=0.6, hatch="X", color='teal', label='Surface de concentration')
plt.fill_between(np.linspace(0, 1, len(lorenz)), lorenz, color='gray', alpha=0.3)
plt.xlabel('Proportions cumulées des clients', color='gray', labelpad=10, fontdict={'size': 16})
plt.ylabel('Proportions cumulées du CA en valeur', color='gray', labelpad=15, fontdict={'size': 16})
plt.title('Concentration du CA en valeur par clients', y=1.02, fontdict={'size': 20, 'weight': 500})
plt.text(0.2, 0.6, "Gini = " + str(round(gini_value, 2)), fontsize=18, color='darkred', weight='semibold')
plt.legend(loc='upper left', bbox_to_anchor=(0.02, 0.98), fontsize=12, frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
plt.grid(linestyle='--', alpha=0.4)
plt.show();


res_clients_year_part = res_clients_year.loc[res_clients_year['c_nature'] == 'particulier']
res_clients_year_pro = res_clients_year[res_clients_year['c_nature'] == 'professionnel']

res_clients_all_orders_part = res_clients_all_orders.loc[res_clients_all_orders['c_nature'] == 'particulier']
res_clients_all_orders_pro = res_clients_all_orders.loc[res_clients_all_orders['c_nature'] == 'professionnel']


lorenz_f(arrg=res_clients_year_part['montant_total'],
         title='Concentration du CA des clients non professionnels',
         xlabel='Proportions cumulées des clients',
         ylabel='Proportions cumulées du CA en valeur')


lorenz_f(arrg=res_clients_year['montant_cat_0'],
         title='Concentration du CA par catégorie',
         xlabel='Proportions cumulées des clients',
         ylabel='Proportions cumulées du CA par catégorie en valeur',
         nb=3,
         arrg2=res_clients_year['montant_cat_1'],
         arrg3=res_clients_year['montant_cat_2'],
         title_lorenz=['catégorie 0', 'catégorie 1', 'catégorie 2'])


res_clients_all_orders.sample(1)


z = resultat.copy()
z.sample(1)
z=z.groupby(['p_id', 'p_categ', 'c_sex'])[['c_sex', 'p_id', 'p_prix']].agg(**{
    'nb_sex':pd.NamedAgg(column='c_sex', aggfunc='count'),
    'qte_total':pd.NamedAgg(column='p_id', aggfunc='count'),
    'prix_total':pd.NamedAgg(column='p_prix', aggfunc='sum'),
})
z=z.unstack().reset_index()
z.columns = ['p_id', 'p_categ', 'nb_f', 'nb_h', 'qte_f', 'qte_h', 'montant_f', 'montant_h']
z['montant_total'] = z['montant_f']+z['montant_h']
z['qte_totale'] = z['qte_f']+z['qte_h']

z=z[['p_id', 'p_categ', 'qte_f', 'qte_h', 'montant_f', 'montant_h', 'montant_total', 'qte_totale']]
z=z.fillna(0)

z_montant = z[['p_id', 'p_categ', 'montant_f', 'montant_h', 'montant_total']]
z_montant=z_montant.sort_values('montant_total', ascending=False)
z_qte = z[['p_id', 'p_categ', 'qte_f', 'qte_h', 'qte_totale']]
z_qte = z_qte.sort_values('qte_totale', ascending=False)


lorenz_f(arrg=z_qte['qte_totale'],
         title='Concentration du CA en volume par produits',
         xlabel='Proportions cumulées des produits',
         ylabel='Proportions cumulées du CA en volume')
plt.vlines(0.8, ymin=0, ymax=0.26, ls='--', color='orange', alpha=0.5)
plt.hlines(0.26, xmin=0, xmax=0.8, ls='--', color='orange', alpha=0.5)
plt.show();


z_montant


lorenz_f(arrg=z_montant['montant_total'],
         title='Concentration du CA en valeur par produits',
         xlabel='Proportion cumulées des produits',
         ylabel='Proportion cumulées du CA en valeur')
plt.vlines(0.8, ymin=0, ymax=0.21, ls='--', color='orange', alpha=0.5)
plt.hlines(0.21, xmin=0, xmax=0.8, ls='--', color='orange', alpha=0.5)
plt.show();


df = res_clients_all_orders.copy()

fig = plt.figure(figsize=(18, 6))
ax = fig.add_subplot(111)
df['hour'] = (df['s_id_date'].dt.hour)
hours = [hour for hour, z in df.groupby(['hour'])]
_ = ax.plot(hours, df.groupby('hour')[
            'nb_art_cat_0'].sum(), color='navy', label='Catégorie 0')
_ = ax.plot(hours, df.groupby('hour')[
            'nb_art_cat_1'].sum(), color='red', label='Catégorie 1')
_ = ax.plot(hours, df.groupby('hour')[
            'nb_art_cat_2'].sum(), color='teal', label='Catégorie 2')
_ = plt.xlabel('Heures', color='gray', labelpad=10, fontdict={'size': 16})
_ = plt.ylabel('Nombre d\'articles vendus dans l\'année',
               color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.title('Répartition horaire des ventes (ventes en volume cumulées sur l\'année)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.legend(loc='upper right', bbox_to_anchor=(0.98, 0.85), fontsize=12,
               frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
_ = plt.xticks(hours)
_ = plt.grid(linestyle='--', alpha=0.4)
_ = plt.show()


jour = ['Lundi', 'Mardi', 'Mercredi',
        'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
index = np.arange(7)

fig = plt.figure(figsize=(18, 6))
_ = plt.xticks(index, jour)
width = 0.3

df['day'] = (df['s_id_date'].dt.dayofweek)

days = [day for day, z in df.groupby(['day'])]
_ = plt.bar(index - width, df.groupby('day')
            ['nb_art_cat_0'].sum(), width, color='teal', label='Catégorie 0', alpha=0.8)
_ = plt.bar(index, df.groupby('day')['nb_art_cat_1'].sum(
), width, color='darkred', label='Catégorie 1', alpha=0.8)
_ = plt.bar(index + width, df.groupby('day')
            ['nb_art_cat_2'].sum(), width, color='darkorange', label='Catégorie 2', alpha=0.8)

_ = ax.set_ylabel('Nombre d\'articles vendus dans l\'année',
                  color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.title('Répartition quotidienne des ventes (ventes en volume cumulées sur l\'année)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.legend(loc='upper right', bbox_to_anchor=(1, 1), fontsize=12,
               frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
_ = plt.grid(linestyle='--', alpha=0.4)
_ = plt.show()


res_clients_all_orders.shape

(157648, 14)


res_clients_all_orders[res_clients_all_orders['montant_panier']>200].count()

c_nature          917
c_id              917
c_sex             917
c_age             917
s_panier_id       917
s_id_date         917
nb_art_panier     917
montant_panier    917
nb_art_cat_0      917
nb_art_cat_1      917
nb_art_cat_2      917
montant_cat_0     917
montant_cat_1     917
montant_cat_2     917
dtype: int64


plt.figure(figsize=(14,8))
plt.hist(res_clients_all_orders['montant_panier'], bins = range(0,550,10), alpha=0.7)
plt.axis([0, 550, 0, 45000])
plt.xlabel('Montant du panier')
plt.ylabel('Nombre de paniers')
plt.title('Répartition des paniers de 200€ en fonction de leur montant',fontsize=20)
plt.show();


r2 = res_clients_all_orders[res_clients_all_orders['montant_panier']<200]

plt.figure(figsize=(14,8))
plt.hist(r2['montant_panier'], bins = range(0,200,5), alpha=0.7)
plt.axis([0, 200, 0, 25000])
plt.xlabel('Montant du panier')
plt.ylabel('Nombre de paniers')
plt.title('Répartition des paniers de moins de 200€ en fonction de leur montant',fontsize=20)
plt.show();


res_clients_all_orders.describe()

res_clients_year_moy = res_clients_year.copy()
res_clients_year_moy['panier_moyen'] = res_clients_year_moy['montant_total'] / res_clients_year_moy['nb_commandes']
res_clients_year_moy_part = res_clients_year_moy[res_clients_year_moy['c_nature'] == 'particulier']
res_clients_year_moy_pro = res_clients_year_moy[res_clients_year_moy['c_nature'] == 'professionnel']
res_clients_all_orders_part.describe()
res_clients_all_orders_part.sample(1)


fig = plt.figure(figsize=(24, 4))
ax = plt.axes()
sns.boxplot(x='montant_panier', width=0.8, showmeans=True, data=res_clients_all_orders, palette='Set2')
plt.xlim([-10, 570])
plt.xlabel('Montant', color='gray', labelpad=5, fontdict={'size': 16})
plt.title('Répartition du panier moyen (paniers de session)', fontdict={'size': 20, 'weight': 500})
ax.xaxis.set_ticks(np.arange(0, 560, 20))
plt.grid(linestyle='--', alpha=0.4)
plt.show();


x_60 =res_clients_all_orders_part[res_clients_all_orders_part['montant_panier'] < 60.]['s_panier_id'].count()
x_60_part = (x_60/res_clients_all_orders_part.shape[0])*100
x = res_clients_all_orders_part[res_clients_all_orders_part['montant_panier'] > 85.]['s_panier_id'].count()
part_x = (x/res_clients_all_orders_part.shape[0])*100

print(f"Ainsi, les paniers inférieurs à 60€ représentent {x_60_part:.2f}% des commandes alors que ceux de plus de 85€ ne représentent que {part_x:.2f}% de celles-ci.")

Ainsi, les paniers inférieurs à 60€ représentent 87.13% des commandes alors que ceux de plus de 85€ ne représentent que 5.41% de celles-ci.


fig = plt.figure(figsize=(6, 12))
_ = sns.boxplot(
    x=res_clients_all_orders['c_sex'], y=res_clients_all_orders['montant_panier'])


mois4 = ['Mars-21', 'Avril-21', 'Mai-21', 'Juin-21', 'Juilet-21',
         'Août-21', 'Sept-21', 'Nov-21', 'Dec-21', 'Janv-22', 'Fev-22']
res_clients_month = res_clients_all_orders.copy()
res_clients_month['mois'] = res_clients_month['s_id_date'].dt.month
res_clients_month['annee'] = res_clients_month['s_id_date'].dt.year
res_clients_month = res_clients_month.groupby(['annee', 'mois']).agg({
    'montant_panier':'mean',
    'nb_art_panier':'mean'
})
res_clients_month.reset_index(inplace=True)
res_clients_month


mois4 = ['Mars-21', 'Avril-21', 'Mai-21', 'Juin-21', 'Juilet-21',
         'Août-21', 'Sept-21', 'Nov-21', 'Dec-21', 'Janv-22', 'Fev-22']
res_clients_month = res_clients_all_orders.copy()
res_clients_month['mois'] = res_clients_month['s_id_date'].dt.month
res_clients_month['annee'] = res_clients_month['s_id_date'].dt.year
res_clients_month = res_clients_month.groupby(['annee', 'mois']).agg({
    'montant_panier':'mean',
    'nb_art_panier':'mean'
}).reset_index()

fig, ax1 = plt.subplots(figsize=(18, 6))

ax1.tick_params(axis='y', labelcolor='orangered')
plt.ylim(20,40)
ax1.set_ylabel('Montant du panier moyen', color='orangered', labelpad=20, fontdict={'size': 16})
a1 = res_clients_month.plot(kind='line', y='montant_panier', color='orangered', marker='o', linewidth=2, label='', ax=ax1)
plt.legend(["Panier moyen (en €)"], bbox_to_anchor=(0.7, 0.15), loc='upper left', fontsize=13)
plt.grid(linestyle='--', alpha=0.3)

ax2 = ax1.twinx()
ax2.tick_params(axis='y', labelcolor='teal')
plt.ylim(0, 4)
ax2.set_ylabel('Quantité panier moyen', color='teal', labelpad=20, fontdict={'size': 16})
a2 = res_clients_month.plot(kind='line', y='nb_art_panier', color='teal',  marker='o', linewidth=2, label='', ax=ax2)
plt.title('Evolution mensuelle du panier moyen en volume et en valeur', y=1.02, color='MidnightBlue', fontdict={'size': 20})
plt.legend(["Nombre d'articles du panier moyen"], bbox_to_anchor=(0.7, 0.1), loc='upper left', fontsize=13)
plt.grid(linestyle='--', alpha=0.3)
plt.show();


z=res_clients_all_orders.copy()
z['meme_cat'] = z[['nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2']].max(axis=1) / z['nb_art_panier']
x=round((z['meme_cat'].mean())*100,2)
print(f'  - {x}% des produits d\'une commande sont de la même catégorie')


z_select=z[z['meme_cat'] == 1.]
print(f"  - Part des commandes qui n'ont que des produits de la même catégorie : {round((z_select.shape[0]/z.shape[0])*100,2)} %.")
print()
print("  Ces commandes 'mono-catégorie' se répartissent de la façon suivante :")
for i in range(3):
    z_cat = z_select[z_select['nb_art_cat_'+str(i)] != 0.]
    moy_z_cat = z_cat['nb_art_panier'].mean()
    x=round((z_cat.shape[0]/z_select.shape[0])*100, 2)
    print(f'       - {x}% sont de catégorie {i}, avec un nombre moyen d\'articles commandés de {moy_z_cat:.2f}')

  - 89.05% des produits d'une commande sont de la même catégorie
  - Part des commandes qui n'ont que des produits de la même catégorie : 72.52 %.

  Ces commandes 'mono-catégorie' se répartissent de la façon suivante :
       - 54.7% sont de catégorie 0, avec un nombre moyen d'articles commandés de 1.84
       - 36.9% sont de catégorie 1, avec un nombre moyen d'articles commandés de 1.23
       - 8.4% sont de catégorie 2, avec un nombre moyen d'articles commandés de 1.23


plt.gcf().subplots_adjust(left=0.05, bottom=0.2, right=0.2,
                              top=0.3, wspace=0.1, hspace=0)
labels = []
for i in range(0, 3):
    labels.append('Catégorie ' + str(i))

fig = plt.figure(figsize=(18, 6));

draw_bar_pie(
    z=ax1, a=1, b=2, c=1,
    titre='Composition du panier : 1 ou plusieurs catégories',
    labels_in=labels,
    size_in=[72.52, 100-72.52],
    explode_in=(0, 0.1),
    titre_legend='Répartition',
    legend_available=True,
    g_color=[color_seq_p[0], 'tomato']
)
plt.legend(["Seulement des produits d'une seule catégorie", "Des produits de plusieurs catégories"], bbox_to_anchor=(0.65, 0.12), frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
draw_bar_pie(
    z=ax2, a=1, b=2, c=2, def_angle=60,
    titre="Répartition des paniers d'une seule catégorie",
    labels_in=labels,
    size_in=[54.7, 36.9, 8.4],
    explode_in=(0, 0.1, 0),
    size_perc=14,
    titre_legend='',
    legend_available=True,
    g_color=["#0092a8", "#23bcdb", "#59e1ff"]
)
plt.legend(["Catégorie 0", "Catégorie 1", "Catégorie 2"], bbox_to_anchor=(0.2, 0.17), frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=0.5)
plt.show();

<Figure size 432x432 with 0 Axes>


c = res_clients_year.copy()
c['meme_cat'] = c[['nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2']].max(axis=1) / c['nb_articles']
x1 = round((c['meme_cat'].mean())*100,2)
print(f"Sur l'année étudiée, en moyenne pour chaque client, {x1}% des produits qu'il commande sont de la même catégorie.")
c_select=c[c['meme_cat'] == 1.];
x2 = round((c_select.shape[0]/c.shape[0])*100,2)
print(f"Mais seulement {x2}% des clients n'ont acheté que des produits d'une seule catégorie.")

Sur l'année étudiée, en moyenne pour chaque client, 65.35% des produits qu'il commande sont de la même catégorie.
Mais seulement 2.37% des clients n'ont acheté que des produits d'une seule catégorie.


ry = res_clients_year.copy()
ry['panier_moyen']=ry['montant_total']/ry['nb_commandes']
ry.sample(1)
x = ry['c_age']
y = ry['panier_moyen']
fig = plt.figure(figsize=(16, 6))
ax = fig.add_subplot(111)
_= plt.scatter(x, y, color='darkred', alpha=0.5, s=15)
_= plt.xlabel('Age', color='gray', labelpad=10, fontdict={'size': 16})
_=ax.set_xlim(10, 100);
_= plt.ylabel('Montant du panier moyen (en €)', color='gray',
               labelpad=15, fontdict={'size': 16})
_=ax.set_ylim(0, 250);
_= plt.title('Dispersion du montant du panier en fonction de l\'âge (tous les clients)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.show()


ry_part=ry[ry['c_nature'] == 'particulier']
cov = np.cov(ry_part['c_age'], ry_part['panier_moyen'], ddof=0)[1, 0]
pearson = st.pearsonr(ry_part['c_age'], ry_part['panier_moyen'])[0]

print(f'Le coefficient de corrélation est égal à {pearson}')

Le coefficient de corrélation est égal à -0.6273551383097524


r = res_clients_all_orders.copy()
r.sample(1)
x = r['c_age']
y = r['montant_panier']
fig = plt.figure(figsize=(16, 6))
ax = fig.add_subplot(111)
_= plt.scatter(x, y, color='darkred', alpha=0.5, s=30,)
_= plt.xlabel('Age', color='gray', labelpad=10, fontdict={'size': 16})
_=ax.set_xlim(10, 100);
_= plt.ylabel('Montant du panier moyen (en €)', color='gray',
               labelpad=15, fontdict={'size': 16})
_=ax.set_ylim(0, 600);
_= plt.title('Dispersion montant du panier en fonction de l\'âge (tous les clients)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.show()


r = res_clients_all_orders_part
cov = np.cov(r['c_age'], r['montant_panier'], ddof=0)[1, 0]
pearson = st.pearsonr(r['c_age'], r['montant_panier'])[0]

Y = r['montant_panier']
X = r[['c_age']]
X = X.copy() 
X['intercept'] = 1.
result = sm.OLS(Y, X).fit()
p_value = result.f_pvalue
if p_value < 0.05:
    p_text = 'le résultat est significatif'
else:
    p_text = 'le résultat est non significatif'

print(f'Le coefficient de corrélation est égal à {pearson} (avec une p_value de {p_value}, donc {p_text})')
print('Ainsi, il existe une relation négative (faible) entre l\'âge des clients et le montant qu\'ils dépensent par commande')

Le coefficient de corrélation est égal à -0.33534422157615773 (avec une p_value de 0.0, donc le résultat est significatif)
Ainsi, il existe une relation négative (faible) entre l'âge des clients et le montant qu'ils dépensent par commande


r = r.groupby('c_age')['montant_panier'].mean().reset_index()
fig = plt.figure(figsize=(14, 6))
_ = ax = fig.add_subplot(111)
_ = sns.scatterplot(data=r, x='c_age', y='montant_panier', marker="o")
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.xlim(15, 100)
_ = plt.ylabel('Montant du panier (en €)', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.ylim(0, 80)
_ = plt.title('Montant du panier moyen en fonction de l\'âge des clients',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.show()


def classe(age):
    if age < 30:
        return 'A'
    elif age > 49:
        return 'C'
    else:
        return 'B'
    
rc = res_clients_year_part.copy()

rc['classe'] = rc['c_age'].apply(lambda x: classe(x))
rc['montant_panier']=rc['montant_total'] / rc['nb_commandes']
rc
# On crée également un df par classe
rc_17_29 = rc.loc[rc['classe'] == 'A']
rc_30_49 = rc.loc[rc['classe'] == 'B']
rc_50_92 = rc.loc[rc['classe'] == 'C']


fig = plt.figure(figsize=(18, 6))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0.1, hspace=0)
ax1 = fig.add_subplot(121)

meanpointprops = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')

# affichage des boxplots
ax1 = sns.boxplot(data=rc, x='classe', order=['A', 'B', 'C'], y='montant_panier', meanprops=meanpointprops,
                  width=0.6, palette='Set2', showfliers=False, showmeans=True)
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Montant du panier (en €)', color='gray', labelpad=15, fontdict={'size': 14})
ax1.yaxis.set_ticks_position('left')
plt.title('Montant du panier moyen par classe d\'âges', y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=rc, x='classe',  order=['A', 'B', 'C'], y='montant_panier', meanprops=meanpointprops,
                  width=0.6, hue='c_sex', showfliers=False, showmeans=True, )
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Montant du panier (en €)', color='grey',  labelpad=15, fontdict={'size': 14})
ax2.yaxis.set_ticks_position('right')
ax2.yaxis.set_label_position('right')
plt.title('Montant du panier moyen par classe d\'âges et par sexe', y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
for ax in [ax1, ax2]:
    ax.xaxis.set_ticklabels(['17-29 ans', '30-49 ans', '50-92 ans'])
    nobs = rc['classe'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_xticklabels()):
        ax.text(pos[tick], -4, nobs[tick], horizontalalignment='center', size=14, color='gray', weight='semibold')
    ax.tick_params(axis='x', which='major', pad=20)
    
plt.show();


# Ici :
# X = classe d'âge
# Y = Montant du panier

eta_age_panier = eta_squared(rc["classe"], rc["montant_panier"])
eta = '{:.5f}'.format(float(eta_age_panier))
print(f'Le eta² est égal à {eta}')
print('   -> Il existe donc une corrélation entre les différentes classes d\'âges et le montant du panier moyen')

Le eta² est égal à 0.61538
   -> Il existe donc une corrélation entre les différentes classes d'âges et le montant du panier moyen


df = rc
aov = pg.anova(dv='montant_panier', between='classe', data=df, detailed=True, effsize='n2')
aov
SCT = aov['SS'][0]+aov['SS'][1]


rcg = rc.copy()
#rcg = rc.groupby(['classe', 'c_age'])['montant_panier']
lm = ols('montant_panier ~ classe', data=rcg).fit()
table = sm.stats.anova_lm(lm)
print(table)
print(lm.summary())
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
print(table)
table['sum_sq'][0]+table['sum_sq'][1]

              df        sum_sq       mean_sq            F  PR(>F)
classe       2.0  2.760389e+06  1.380194e+06  6872.676357     0.0
Residual  8591.0  1.725274e+06  2.008234e+02          NaN     NaN
                            OLS Regression Results                            
==============================================================================
Dep. Variable:         montant_panier   R-squared:                       0.615
Model:                            OLS   Adj. R-squared:                  0.615
Method:                 Least Squares   F-statistic:                     6873.
Date:              lun., 01 mars 2021   Prob (F-statistic):               0.00
Time:                        01:21:35   Log-Likelihood:                -34977.
No. Observations:                8594   AIC:                         6.996e+04
Df Residuals:                    8591   BIC:                         6.998e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      70.0858      0.298    235.425      0.000      69.502      70.669
classe[T.B]   -36.9147      0.385    -95.851      0.000     -37.670     -36.160
classe[T.C]   -43.7689      0.395   -110.682      0.000     -44.544     -42.994
==============================================================================
Omnibus:                     3690.431   Durbin-Watson:                   2.012
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            54776.912
Skew:                           1.661   Prob(JB):                         0.00
Kurtosis:                      14.914   Cond. No.                         4.15
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
              df        sum_sq       mean_sq            F  PR(>F)    EtaSq
classe       2.0  2.760389e+06  1.380194e+06  6872.676357     0.0  0.61538
Residual  8591.0  1.725274e+06  2.008234e+02          NaN     NaN      NaN

4485662.612356715


# Calcul de la moyenne globale
moy_globale = rcg['montant_panier'].mean()
moy_globale

40.54051219696055


# Calcul de SCT (Somme des Carrés Totale) : Sum of Squares -> Variation totale
rcg['moy_globale'] = moy_globale
ss_total = sum((rcg['montant_panier'] - rcg['moy_globale'])**2)
ss_total

4485662.612356712


## Calcul de SCR (Somme des Carrés Résiduelle) : Sum of Squares of the Error -> Variation interclasse : variance au sein de la classe i

# Calcul des moyennes des classes
moy_classes = rcg.groupby('classe').mean();
moy_classes = moy_classes.rename(columns={'montant_panier': 'moy_classe'});
moy_classes = moy_classes[['moy_classe', 'moy_globale']];

# Merge des moyennes de classe dans le df initial
rcg = rcg.merge(moy_classes, left_on='classe', right_index=True);
rcg['moy_globale'] = moy_globale;
rcg.drop(['moy_globale_x', 'moy_globale_y'], axis=1);


# Calcul de la SCR
ss_residual = sum((rcg['montant_panier'] - rcg['moy_classe'])**2);
ss_residual

1725273.951240069


# Calcul de SCE (Somme des Carrés Expliquée) : Sum of Squares of the Model -> Variation interclasse : variance entre les classes
ss_explained = sum((rcg['moy_globale'] - rcg['moy_classe'])**2)
ss_explained

2760388.6611165768


eta2 = ss_explained / ss_total
eta2='{:.5f}'.format(float(eta2))
print(f'Le eta² est égal à {eta2}')
if eta2 == eta:
    print()
    print('  -> On retrouve par le calcul la même valeur pour eta2 que celle obtenue par le module python statsmodels')

Le eta² est égal à 0.61538

  -> On retrouve par le calcul la même valeur pour eta2 que celle obtenue par le module python statsmodels


# Calcul du carré des moyennes de la variation inexpliquée
n_classes = len(set(rcg['classe']))
n_obs = rcg.shape[0]
df_residual = n_obs - n_classes
ms_residual = ss_residual / df_residual
ms_residual

200.82341418229183


# Calcul du carré des moyennes de la variation du modèle
df_explained = n_classes - 1
ms_explained = ss_explained / df_explained
ms_explained

1380194.3305582884


# Calcul de la Statistique F
f = ms_explained / ms_residual
f

6872.676356878664


# Calcul de la p_value
import scipy.stats
p_value = 1 - scipy.stats.f.cdf(f, df_explained, df_residual)
"{:.8f}".format(float(p_value))

print('La valeur p_value est < à 0.05')
print('   -> On peut donc rejeter l\'hypothèse HO et accepter l\'alternative H1 : au moins une moyenne de groupe est significativement différente.')

'0.00000000'

La valeur p_value est < à 0.05
   -> On peut donc rejeter l'hypothèse HO et accepter l'alternative H1 : au moins une moyenne de groupe est significativement différente.


r_session=res_clients_all_orders.copy()
r_session['classe'] = r_session['c_age'].apply(lambda x: classe(x))
r_session=r_session.loc[r_session['c_nature']=='particulier']
r_session.sample(2)


fig = plt.figure(figsize=(18, 6))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0.1, hspace=0)
ax1 = fig.add_subplot(121)

meanpointprops = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')

# affichage des boxplots
ax1 = sns.boxplot(data=r_session, x='classe', order=['A', 'B', 'C'], y='montant_panier', meanprops=meanpointprops,
                  width=0.6, palette='Set2', showfliers=False, showmeans=True, )
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Montant du panier (en €)', color='gray',
               labelpad=15, fontdict={'size': 14})
ax1.yaxis.set_ticks_position('left')
plt.title('Montant du panier par classes d\'âges',
              y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=r_session, x='classe',  order=['A', 'B', 'C'],y='montant_panier', meanprops=meanpointprops,
                  width=0.6, hue='c_sex', showfliers=False, showmeans=True, )
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Montant du panier (en €)', color='grey',  labelpad=15, fontdict={'size': 14})
ax2.yaxis.set_ticks_position('right')
ax2.yaxis.set_label_position('right')
plt.title('Montant du panier par classes d\'âges et par sexe',
              y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
for ax in [ax1, ax2]:
    ax.xaxis.set_ticklabels(['17-29 ans', '30-49 ans', '50-92 ans'])
    nobs = rc['classe'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_xticklabels()):
        ax.text(pos[tick], -17, nobs[tick], horizontalalignment='center',
                    size=14, color='gray', weight='semibold')
    ax.tick_params(axis='x', which='major', pad=20)
    
plt.show();


lm = ols('montant_panier ~ classe', data=r_session).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
print(table)
print()
print(f'La valeur de eta² est de {esq_sm} : il y a donc une corrélation entre les classes d\'âges et le montant des paniers de sessions.')
print(f"La p_value est de {table['PR(>F)']['classe']} : la valeur de eta² obtenue est donc significative :")
print('  -> on rejette donc l\'hypothèse nulle au profit de l\'hypothèse alternative H1 : "au moins une moyenne de groupe est significativement différente."')

                df        sum_sq       mean_sq             F  PR(>F)     EtaSq
classe         2.0  3.117490e+07  1.558745e+07  19703.081871     0.0  0.211351
Residual  147043.0  1.163283e+08  7.911174e+02           NaN     NaN       NaN

La valeur de eta² est de 0.21135071325014812 : il y a donc une corrélation entre les classes d'âges et le montant des paniers de sessions.
La p_value est de 0.0 : la valeur de eta² obtenue est donc significative :
  -> on rejette donc l'hypothèse nulle au profit de l'hypothèse alternative H1 : "au moins une moyenne de groupe est significativement différente."


res_clients_year.describe().T


res_clients_year_part.describe().T


fig = plt.figure(figsize=(24, 4))
ax = plt.axes()
_ = sns.boxplot(x='nb_commandes', width=0.3, showmeans=True,
                data=res_clients_year_part, palette='Set2')
_ = plt.xlim([-5, 85])
_ = plt.xlabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Répartition de la fréquence d\'achat', y=1.05, fontdict={'size': 20, 'weight': 500})
_ = ax.xaxis.set_ticks(np.arange(0, 80, 5))
plt.show()


fig = plt.figure(figsize=(6, 12))
_ = sns.boxplot(x=res_clients_year_part['c_sex'],
                y=res_clients_year_part['nb_commandes'], width=0.4, showmeans=True)


res = res_clients_year_part.copy()
_ = sns.lmplot(data=res, x='c_age', y='nb_commandes', scatter_kws={
               "color": 'Coral'},  height=5, x_jitter=0.4, aspect=14/5)
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Dispersion de la fréquence d\'achat en fonction de l\'âge (clients non pro)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)


res = res_clients_year_part.copy()
sns.set(style="white", color_codes=True, font_scale = 1.3)
g = sns.jointplot(data=res, x='c_age', y='nb_commandes', color='teal', kind='reg')
r, p = st.pearsonr(res['c_age'], res['nb_commandes'])
phantom, = g.ax_joint.plot([], [], linestyle="", alpha=0)
_=g.ax_joint.legend([phantom], ['r = {:f} \np = {:f}'.format(r, p)], fontsize=12, facecolor= 'lightgrey', edgecolor='grey', 
                    frameon=True, framealpha=0.2, borderpad=0.6, labelspacing=0.1, handlelength=0, handletextpad=0)
_=g.set_axis_labels('Age', 'Nombre de commandes', color='gray',labelpad=15, fontdict={'size': 16},)
g.fig.set_figwidth(16)
g.fig.set_figheight(6)
_ = plt.title('Dispersion de la fréquence d\'achat en fonction de l\'âge (clients non pro)',x=-3, y=1.22, fontdict={'size': 16, 'weight': 500})


cov = np.cov(res_clients_year_part['c_age'],
             res_clients_year_part['nb_commandes'], ddof=0)[1, 0]
pearson = st.pearsonr(
    res_clients_year_part['c_age'], res_clients_year_part['nb_commandes'])[0]
print(f'Le coefficient de corrélation est égal à {pearson}')
print('Ainsi, le coefficient étant relativement proche de 0, on peut estimer que les 2 variables "âge" et "fréquence d\'achat" sont linéairement indépendantes.')

Le coefficient de corrélation est égal à 0.17468217495371718
Ainsi, le coefficient étant relativement proche de 0, on peut estimer que les 2 variables "âge" et "fréquence d'achat" sont linéairement indépendantes.


res = res_clients_year_part.copy()

res = res.groupby('c_age').agg(**{
    'effectif_age':pd.NamedAgg(column='c_age', aggfunc='count'),
    'nb_commandes_moy':pd.NamedAgg(column='nb_commandes', aggfunc='mean'),
}).reset_index()

res_effectif_age = res[['c_age', 'effectif_age']]

Y = res['nb_commandes_moy']
X = res[['c_age']]
X = X.copy()  # On modifiera X, on en crée donc une copie
X['intercept'] = 1.
# OLS = Ordinary Least Square (Moindres Carrés Ordinaire)
result = sm.OLS(Y, X).fit()
a, b = result.params['c_age'], result.params['intercept']

fig = plt.figure(figsize=(16,6))
ax = plt.axes()
sns.scatterplot(data=res, x='c_age', y='nb_commandes_moy', marker="o", size='effectif_age', sizes=(20,300), ax=ax)
#ax.plot(np.arange(15,95,1), [a*x+b for x in np.arange(15,95,1)], color='red', linewidth=3)
plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
ax.set_xlim(10, 100);
plt.xticks(np.arange(10, 100, step=10)) 
plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
ax.set_ylim(5, 30)
plt.legend(title='Effectif Age', loc='upper left', bbox_to_anchor=(
    0.001, 0.999),frameon=True, ncol=1, fancybox=True, framealpha=1, shadow=True, borderpad=1)
plt.title("Fréquence d'achat en fonction de l'âge",
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)
plt.show();


res = res_clients_year_part.copy()
res=pd.merge(res, res_effectif_age, on='c_age')

res_30_100=res.loc[res['c_age'] > 29.]

Y = res_30_100['nb_commandes']
X = res_30_100[['c_age']]
X = X.copy()  # On modifiera X, on en crée donc une copie
X['intercept'] = 1.
# OLS = Ordinary Least Square (Moindres Carrés Ordinaire)
result = sm.OLS(Y, X).fit()
#print(dir(result))
#print(result.summary())
a, b = result.params['c_age'], result.params['intercept']

fig = plt.figure(figsize=(16,6))
ax = plt.axes()
sns.scatterplot(data=res_30_100, x='c_age', y='nb_commandes', marker="o", ax=ax)
ax.plot(np.arange(25,95,1), [a*x+b for x in np.arange(25,95,1)], color='red', linewidth=3)
plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
ax.set_xlim(10, 100);
plt.xticks(np.arange(10, 100, step=10)) 
plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
#ax.set_ylim(5, 30)
plt.title('Régression linéaire entre l\'âge et la fréquence d\'achat des clients de 30 ans et plus',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.show();


res_a = res_clients_year_part.copy()
res_a=pd.merge(res_a,res_effectif_age, on='c_age')
res_a.sample(1)
r_17_29=res_a.loc[res_a['c_age'] < 30.]
r_30_100=res_a.loc[res_a['c_age'] > 29.]


pearson = st.pearsonr(res_30_100['c_age'], res_30_100['nb_commandes'])[0]
p_value='{:.5f}'.format(float(result.f_pvalue))

print(f'Le coefficient de corrélation est égal à {pearson}, et la p_value est de {p_value} (donc la valeur est significative)')
print('  -> Il existe donc, pour les clients de 30 ans et plus, une corrélation faible négative importante entre leur âge et leur fréquence d\'achat :')

Le coefficient de corrélation est égal à -0.13636836366724303, et la p_value est de 0.00000 (donc la valeur est significative)
  -> Il existe donc, pour les clients de 30 ans et plus, une corrélation faible négative importante entre leur âge et leur fréquence d'achat :


x=res_30_100['c_age']
y=res_30_100['nb_commandes']

### Détermination des paramètres b0 et b1 par la méthode des moindres carrés

# Calcul des moyennes de X et Y
N = len(x)
x_mean = x.mean()
y_mean = y.mean()

# Estimation de B1
B1_num = ((x - x_mean) * (y - y_mean)).sum()
B1_den = ((x - x_mean)**2).sum()
B1 = B1_num / B1_den

# Estimation de B0
B0 = y_mean - (B1*x_mean)

# On obtient la droite de régression d'équation :
reg_line = 'y = {} + {}β'.format(round(B0, 3), round(B1, 3))

B0, B1, reg_line = linear_regression(x, y)
print('Droite de régression : ', reg_line)

Droite de régression :  y = 27.777596321134595 - 0.152β


num = (N * (x*y).sum()) - (x.sum() * y.sum())
den = np.sqrt((N * (x**2).sum() - x.sum()**2) * (N * (y**2).sum() - y.sum()**2))
R = num / den
print(f'Le coefficient de corrélation R entre l\'âge des clients (de 30 ans et plus) et la fréquence d\'achat est de {R} : il existe donc un lien entre ces 2 variables')
print(f'Le coefficient de détermination R² est quand à lui égal à {R**2}')

Le coefficient de corrélation R entre l'âge des clients (de 30 ans et plus) et la fréquence d'achat est de -0.13636836366724306 : il existe donc un lien entre ces 2 variables
Le coefficient de détermination R² est quand à lui égal à 0.018596330609281456


Y = r_17_29['nb_commandes']
X = r_17_29[['c_age']]
X = X.copy()  # On modifiera X, on en crée donc une copie
X['intercept'] = 1.
# OLS = Ordinary Least Square (Moindres Carrés Ordinaire)
result = sm.OLS(Y, X).fit()
#print(dir(result))
#print(result.summary())
a, b = result.params['c_age'], result.params['intercept']

fig = plt.figure(figsize=(16,6))
ax = plt.axes()
sns.scatterplot(data=r_17_29, x='c_age', y='nb_commandes', marker="o", ax=ax)
ax.plot(np.arange(15,35,1), [a*x+b for x in np.arange(15,35,1)], color='red', linewidth=3)
plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
ax.set_xlim(10, 40);
plt.xticks(np.arange(10, 40, step=10)) 
plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
#ax.set_ylim(5, 30)
plt.title('Régression linéaire entre l\'âge et la fréquence d\'achat des clients de moins de 30 ans',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.show();


xj=r_17_29['c_age']
yj=r_17_29['nb_commandes']
pearson_jeunes = pg.corr(xj, yj)
pearson_jeunes
lm = pg.linear_regression(xj, yj)
reg_jeunes = lm.round(5)
reg_jeunes


print(f"Le coefficient de corrélation est égal à {round(pearson_jeunes.r['pearson'], 5)} : il n\'y a donc pas de corrélation")
print(f"La p_value étant de {reg_jeunes['pval'][0]}, le résultat obtenu est donc significatif")

Le coefficient de corrélation est égal à -0.01913 : il n'y a donc pas de corrélation
La p_value étant de 0.0, le résultat obtenu est donc significatif


sns.set(style="white", color_codes=True, font_scale = 1.3)
meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')
sns.set_palette(sns.color_palette("Dark2", 10))
res = res_clients_year_part.copy()
res['ind2'] = res['c_age']//10
res['ind2'] = res['ind2'].astype(int)

fig = plt.figure(figsize=(16, 14))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0, hspace=0.3)
ax1 = fig.add_subplot(211)

# affichage des boxplots
ax1 = sns.boxplot(data=res, x='ind2', y='nb_commandes', showfliers=False, 
                  meanprops=meanpointprops, meanline=False, showmeans=True)
_ = plt.xlabel('', color='white', fontdict={'size': 0})
_ = plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Distribution de la fréquence d\'achat par tranches d\'ages',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(212)
ax2 = sns.boxplot(data=res, x='ind2', y='nb_commandes', hue='c_sex', showfliers=False, 
                  meanprops=meanpointprops2, meanline=False, showmeans=True)
_ = plt.xlabel('Classes d\'âge', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Distribution de la fréquence d\'achat par tranches d\'ages et par sexe',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
for ax in [ax1, ax2]:
    _ = ax.xaxis.set_ticklabels(
        ['18-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-92'])
    nobs = res['ind2'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_xticklabels()):
        _ = ax.text(pos[tick], -6, nobs[tick], horizontalalignment='center',
                    size='14', color='gray', weight='semibold')
    _ = ax.tick_params(axis='x', which='major', pad=25)


eta_age_commandes = eta_squared(res["ind2"], res["nb_commandes"])
eta_sex_commandes = eta_squared(res["c_sex"], res["nb_commandes"])

print('Les rapports de corrélations (cf eta squared) sont les suivants :')
print(
    f'   - entre la classe d\'âge et la fréquence d\'achat : {eta_age_commandes} -> faible corrélation')
print(
    f'   - entre le sexe et et la fréquence d\'achat : {f"{eta_sex_commandes:.9f}"} -> absence de corrélation')

Les rapports de corrélations (cf eta squared) sont les suivants :
   - entre la classe d'âge et la fréquence d'achat : 0.14972180027748663 -> faible corrélation
   - entre le sexe et et la fréquence d'achat : 0.000015671 -> absence de corrélation


ry=res_a.copy()
ry.loc[ry['c_age'] < 30., 'classe'] = 'A'
ry.loc[ry['c_age'] > 29., 'classe'] = 'B'
ry['panier_moyen']=ry['montant_total'] / ry['nb_commandes']
ry.sample(5)
ry[ry['c_age'] < 30]['classe'].count()
ry[ry['c_age'] >= 30]['classe'].count()

2266

6328


fig = plt.figure(figsize=(18, 6))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0.1, hspace=0)
ax1 = fig.add_subplot(121)

meanpointprops = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')

# affichage des boxplots
ax1 = sns.boxplot(data=ry, x='classe', order=['A', 'B'], y='nb_commandes', meanprops=meanpointprops,
                  width=0.6, palette='Set1_r', showfliers=False, showmeans=True, )
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 14})
ax1.yaxis.set_ticks_position('left')
plt.title('Fréquence d\'achat par classes d\'âges',
              y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=ry, x='classe',  order=['A', 'B'],y='nb_commandes', meanprops=meanpointprops,
                  width=0.6, hue='c_sex', showfliers=False, showmeans=True, )
plt.xlabel('', color='white', fontdict={'size': 0})
plt.ylabel('Nombre de commandes)', color='grey',  labelpad=15, fontdict={'size': 14})
ax2.yaxis.set_ticks_position('right')
ax2.yaxis.set_label_position('right')
plt.title('Fréquence d\'achat par classes d\'âges et par sexe',
              y=1.02, fontdict={'size': 16, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
for ax in [ax1, ax2]:
    ax.xaxis.set_ticklabels(['17-29 ans', '30-92 ans'])
    nobs = ry['classe'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_xticklabels()):
        ax.text(pos[tick], -6, nobs[tick], horizontalalignment='center',
                    size='14', color='gray', weight='semibold')
    ax.tick_params(axis='x', which='major', pad=25)
    
plt.show();


lm = ols('nb_commandes ~ classe', data=ry).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
#print(dir(lm))
#print(lm.summary())
#print(lm.tvalues)
p_value='{:.5f}'.format(float(table['PR(>F)']['classe']))
print(f'La valeur de eta² est de {round(esq_sm, 5)}, ce qui correspond à une corrélation quasi nulle')
print(f"La p_value est de {p_value} : la valeur de eta² obtenue est donc significative :")
print('   -> On peut donc rejeter l\'hypothèse HO et accepter l\'alternative H1 : au moins une moyenne de groupe est significativement différente.')
print('mais cette variation n\'est pas liée au modèle, elle n\'est pas causé par des variations entre les groupes (seulement par des variations dans les groupes)')

La valeur de eta² est de 0.13126, ce qui correspond à une corrélation quasi nulle
La p_value est de 0.00000 : la valeur de eta² obtenue est donc significative :
   -> On peut donc rejeter l'hypothèse HO et accepter l'alternative H1 : au moins une moyenne de groupe est significativement différente.
mais cette variation n'est pas liée au modèle, elle n'est pas causé par des variations entre les groupes (seulement par des variations dans les groupes)


cat1 = ry[ry['classe']=='A']
cat2 = ry[ry['classe']=='B']

t, p = ttest_ind(cat1['nb_commandes'], cat2['nb_commandes'])
print("t = " + str(t))
print("p = " + str(p))
print()
print("Comme la valeur p est inférieure à 0,05, cela veut dire que l'on peut rejeter H0 au profit de H1 :")
print("  -> Les 2 groupes d'âges ont des moyennes significativement différentes")

t = -36.03103756957004
p = 6.86630907072447e-265

Comme la valeur p est inférieure à 0,05, cela veut dire que l'on peut rejeter H0 au profit de H1 :
  -> Les 2 groupes d'âges ont des moyennes significativement différentes


resultat
res_part = resultat.loc[resultat['c_nature'] == 'particulier'];


df = res_part
X = 'c_sex'
Y = 'p_categ'
cont = df[[X, Y]].pivot_table(index=X, columns=Y, aggfunc=len, margins=True, margins_name="Total")
cont
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
n = len(df)
indep = tx.dot(ty) / n

c = cont.fillna(0)  # On remplace les valeurs nulles par 0
measure = (c-indep)**2/indep
xi_n = measure.sum().sum()
table = measure/xi_n

plt.style.use('seaborn-deep')
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)

ax = sns.heatmap(table.iloc[:-1, :-1], annot=True, annot_kws={'size': 14})

_ = plt.title("Relation entre sexe et catégories de produits achetés", y=1.03, fontdict={'size': 18, 'weight': 500, 'color': 'darkred'})
_ = plt.ylabel("sexe")
_ = plt.xlabel("Catégorie")
_ = plt.show()


khi2 = chi2_contingency(cont.iloc[:-1, :-1])
khi2

(8.965632630745457,
 0.011301539593042658,
 2,
 array([[92620.107572  , 52624.77707011,  7654.11535789],
        [85235.892428  , 48429.22292989,  7043.88464211]]))


print(f'La valeur du Khi-2 est de {round(khi2[0],6)}')
print('En consultant la table de loi de Khi deux et en définissant notre de seuil de significativité à 5%, on constate que la p_value est inférieure à 5% :')
print('  -> on rejette donc l\'hypothèse HO d\'indépendance des variables au profit de l\'alternative H1 :')
print("  -> Les variables 'sexe' et 'catégories de produits' sont dépendantes")

La valeur du Khi-2 est de 8.965633
En consultant la table de loi de Khi deux et en définissant notre de seuil de significativité à 5%, on constate que la p_value est inférieure à 5% :
  -> on rejette donc l'hypothèse HO d'indépendance des variables au profit de l'alternative H1 :
  -> Les variables 'sexe' et 'catégories de produits' sont dépendantes


contingence = res_clients_year_part.groupby('c_sex').sum()[['nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2']]
contingence


contingence['total_sex'] = contingence['nb_art_cat_0'] + contingence['nb_art_cat_1'] + contingence['nb_art_cat_2']
x_cat_0 = contingence['nb_art_cat_0'].sum()
x_cat_1 = contingence['nb_art_cat_1'].sum()
x_cat_2 = contingence['nb_art_cat_2'].sum()
tt = x_cat_0 + x_cat_1 + x_cat_2
data = [x_cat_0, x_cat_1, x_cat_2, tt]
contingence = contingence.append(pd.DataFrame([data], index=['total_cat'], columns=[
                                 'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2', 'total_sex']))
contingence


eff_theo = contingence.copy()
# eff_theo.iloc[0:2,[0,1,2]]=0

for i in range(2):
    for j in range(3):
        eff_theo.iloc[i, j] = (eff_theo.iloc[i, 3] * eff_theo.iloc[2, j]) / eff_theo.iloc[2, 3]
eff_theo


r = contingence - eff_theo

# On n'a pas besoin des totaux pour le moment
r = r.iloc[0:2, [0, 1, 2]]
r


r2 = r**2
r2


x_theo = eff_theo.iloc[0:2, :]
r2 = r2 / x_theo
r2['total_sex'] = r2['nb_art_cat_0'] + r2['nb_art_cat_1'] + r2['nb_art_cat_2']
x_cat_0 = r2['nb_art_cat_0'].sum()
x_cat_1 = r2['nb_art_cat_1'].sum()
x_cat_2 = r2['nb_art_cat_2'].sum()
tt = x_cat_0 + x_cat_1 + x_cat_2
data = [x_cat_0, x_cat_1, x_cat_2, tt]
r2 = r2.append(pd.DataFrame([data], index=['total_cat'], columns=[
               'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2', 'total_sex']))
r2


v=sqrt(79.17987649877874/315232)
print(f'Le V de Cramer est ici égal à {v}')

Le V de Cramer est ici égal à 0.0158486495153833


r = res_clients_year_part.copy()
r.sample(1)


r_montant = res_clients_year_part.copy()
fig = plt.figure(figsize=(16,6))
_ = sns.regplot(data=r_montant, x='c_age', y='montant_total', x_jitter=.5, color='r',  scatter_kws={"color": 'teal'})
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Montant total annuel', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Montant total d\'achat en fonction de l\'âge (clients non pro)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)


cov = np.cov(r_montant['c_age'], r_montant['montant_total'], ddof=0)[1, 0]
pearson = st.pearsonr(r_montant['c_age'], r_montant['montant_total'])[0]
print(f'Le coefficient de corrélation est égal à {pearson}')
print('Ainsi, le coefficient étant relativement proche de 0, on peut estimer que les 2 variables "âge" et "montant total d\'achat" sont linéairement indépendantes.')

Le coefficient de corrélation est égal à -0.18102019458888824
Ainsi, le coefficient étant relativement proche de 0, on peut estimer que les 2 variables "âge" et "montant total d'achat" sont linéairement indépendantes.


r_montant['classe'] = r_montant['c_age'].apply(lambda x: classe(x))
r_com_1 = r_montant.loc[r_montant['c_age'] < 30]
r_com_2 = r_montant.loc[(r_montant['c_age'] >= 30) & (r_montant['c_age'] < 50)]
r_com_3 = r_montant.loc[r_montant['c_age'] >= 50]
r_tab_com = [r_com_1, r_com_2, r_com_3]
age_tab_com = ["- de 30 ans", "30-49 ans", "50 ans et +"]

for i in range(3):
    df = r_tab_com[i]
    pearson = st.pearsonr(df['c_age'], df['montant_total'])[0]
    print(f'Pour la classe d\'âges "{age_tab_com[i]}", le coefficient de corrélation est égal à {pearson}')

Pour la classe d'âges "- de 30 ans", le coefficient de corrélation est égal à -0.02009973339699622
Pour la classe d'âges "30-49 ans", le coefficient de corrélation est égal à -0.01865886711667209
Pour la classe d'âges "50 ans et +", le coefficient de corrélation est égal à -0.03369750642651832


r_montant['ind2'] = r_montant['c_age']//10
r_montant['ind2'] = r_montant['ind2'].astype(int)

r_montant.sample(1)


fig = plt.figure(figsize=(16, 14))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0, hspace=0.3)
ax1 = fig.add_subplot(211)
meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')
# affichage des boxplots
ax1 = sns.boxplot(data=r_montant, x='ind2', y='montant_total', showfliers=False, 
                  meanprops=meanpointprops, meanline=False, showmeans=True)
_ = plt.xlabel('', color='white', fontdict={'size': 0})
_ = plt.ylabel('Montant total (en €)', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Distribution du montant total d\'achat par tranches d\'ages',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(212)
ax2 = sns.boxplot(data=r_montant, x='ind2', y='montant_total', hue='c_sex' ,showfliers=False, 
                  meanprops=meanpointprops2, meanline=False, showmeans=True)
_ = plt.xlabel('Classes d\'âge', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Montant total (en €)', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Distribution du montant total d\'achat par tranches d\'ages et par sexe',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
for ax in [ax1, ax2]:
    _ = ax.xaxis.set_ticklabels(
        ['18-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-92'])
    nobs = r_montant['ind2'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_xticklabels()):
        _ = ax.text(pos[tick], -200, nobs[tick], horizontalalignment='center',
                    size='14', color='gray', weight='semibold')
    _ = ax.tick_params(axis='x', which='major', pad=25)


lm = ols('montant_total ~ ind2', data=r_montant).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['ind2']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les classes d\'âges de 10 ans et le montant total d\'achat est de {round(esq_sm,6)}, avec une p_value de {p_value}')
print('La valeur de eta² est donc significative : au moins une moyenne de groupe est significativement différente')

Le rapport de corrélations (cf eta squared) entre les classes d'âges de 10 ans et le montant total d'achat est de 0.032016, avec une p_value de 0.000
La valeur de eta² est donc significative : au moins une moyenne de groupe est significativement différente


fig = plt.figure(figsize=(16, 14))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0, hspace=0.3)
ax = fig.add_subplot(211)
meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')
# affichage des boxplots
ax = sns.boxplot(data=r_montant, y='classe', x='montant_total', order=['A', 'B', 'C'], width=0.4, orient='h', showfliers=False, 
                  meanprops=meanpointprops, meanline=False, showmeans=True)
_ = plt.xlabel('Montant (en €)', color='grey', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('', color='white', labelpad=15, fontdict={'size': 0})
_ = plt.title('Distribution du montant total d\'achat par tranches d\'ages',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
_ = ax.yaxis.set_ticklabels(
    ['  17-29 ans', '  30-49 ans', '  50-92 ans'])
nobs = r_montant['classe'].value_counts().values
nobs = [str(x) for x in nobs.tolist()]
nobs = ["(n: " + i + ")" for i in nobs ]
pos = range(len(nobs))
for tick, label in zip(pos, ax.get_yticklabels()):
    _ = ax.text(-205, pos[tick]+0.2, nobs[tick], horizontalalignment='center',
                size='14', color='gray', weight='semibold')
_ = ax.tick_params(axis='y', which='major', pad=5)


lm = ols('montant_total ~ classe', data=r_montant).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['classe']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les 3 groupes d\'âges et le montant total d\'achat est de {round(esq_sm,6)}, avec une p_value de {p_value}')
print('(même conclusions que précédemment avec les groupes de 10 ans)')

Le rapport de corrélations (cf eta squared) entre les 3 groupes d'âges et le montant total d'achat est de 0.077704, avec une p_value de 0.000
(même conclusions que précédemment avec les groupes de 10 ans)


r_freq = res_clients_year_part.copy()
r_freq.sample(1)
r_freq = r_freq[['c_id', 'c_sex', 'c_age', 'nb_commandes', 'nb_articles']]
r_freq['f_com_mois'] = r_freq['nb_commandes']/11
r_freq['f_prod_mois'] = r_freq['nb_articles']/11
r_freq.sample(2)


fig = plt.figure(figsize=(14,5))
_ = sns.regplot(data=r_freq, x='c_age', y='f_com_mois', color='red', x_jitter=0.3, scatter_kws={"color": 'SteelBlue'})
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre de commandes', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Fréquence des commandes en fonction de l\'âge (clients non pro)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)


cov = np.cov(r_freq['c_age'], r_freq['f_com_mois'], ddof=0)[1, 0]
pearson = st.pearsonr(r_freq['c_age'], r_freq['f_com_mois'])[0]
print(f'Le coefficient de corrélation est égal à {pearson}')

Le coefficient de corrélation est égal à 0.17468217495371718


r_freq_com_1 = r_freq.loc[r_freq['c_age'] < 30]
r_freq_com_1.sample(1)
r_freq_com_2 = r_freq.loc[r_freq['c_age'] >= 30]
r_freq_com_2.sample(1)

r_tab_com = [r_freq_com_1, r_freq_com_2]
age_tab_com = ["- de 30 ans", "30 ans et +"]


for i in range(2):
    df = r_tab_com[i]
    pearson = st.pearsonr(df['c_age'], df['f_com_mois'])[0]
    print(f'Pour la classe d\'âges "{age_tab_com[i]}", le coefficient de corrélation est égal à {pearson}')

Pour la classe d'âges "- de 30 ans", le coefficient de corrélation est égal à -0.019130918414251418
Pour la classe d'âges "30 ans et +", le coefficient de corrélation est égal à -0.13636836366724303


fig = plt.figure(figsize=(14,5))
_ = sns.regplot(data=r_freq, x='c_age', y='f_prod_mois', color='red', x_jitter=0.4, scatter_kws={"color": 'teal'})
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre de produits achetés', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Fréquence d\'achat de produits en fonction de l\'âge (clients non pro)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)


cov = np.cov(r_freq['c_age'], r_freq['f_prod_mois'], ddof=0)[1, 0]
pearson = st.pearsonr(r_freq['c_age'], r_freq['f_prod_mois'])[0]
print(f'Le coefficient de corrélation est égal à {pearson}')

Le coefficient de corrélation est égal à 0.03521099015954457


r_freq_prod_1 = r_freq.loc[r_freq['c_age'] < 30]
r_freq_prod_2 = r_freq.loc[(r_freq['c_age'] >= 30) & (r_freq['c_age'] <50)]
r_freq_prod_3 = r_freq.loc[r_freq['c_age'] >= 50]
r_freq_prod_2.sample(2)

r_tab_prod = [r_freq_prod_1, r_freq_prod_2, r_freq_prod_3]
age_tab_prod = ["- de 30 ans", "30-49 ans", "50 ans et +"]


for i in range(3):
    df = r_tab_prod[i]
    pearson = st.pearsonr(df['c_age'], df['f_prod_mois'])[0]
    print(f'Pour la classe d\'âges "{age_tab_prod[i]}", le coefficient de corrélation est égal à {pearson}')

Pour la classe d'âges "- de 30 ans", le coefficient de corrélation est égal à -0.015497177126043449
Pour la classe d'âges "30-49 ans", le coefficient de corrélation est égal à -0.01758815702129382
Pour la classe d'âges "50 ans et +", le coefficient de corrélation est égal à -0.03244828620281044


meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')

rc = r_freq.copy()
rc['classe'] = rc['c_age'].apply(lambda x: classe(x))

fig = plt.figure(figsize=(16, 5))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0.25, hspace=0)
ax1 = fig.add_subplot(121)

# affichage des boxplots
ymax1 = rc['f_com_mois'].max()
ax1 = sns.boxplot(data=rc, y='classe', width=0.6, orient='h', order=['A', 'B', 'C'], x='f_com_mois',
                  meanprops=meanpointprops2, showfliers=False, showmeans=True, palette="deep")
_ = plt.ylabel('', color='white', fontdict={'size': 0})
_ = plt.xlabel('Nombre de commandes mensuelles', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Fréquence des commandes',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

ymax2 = rc['f_prod_mois'].max()
ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=rc, y='classe',width=0.6, orient='h', order=['A', 'B', 'C'], x='f_prod_mois',
                   meanprops=meanpointprops2, showfliers=False, showmeans=True, palette='deep')
_ = plt.ylabel('', color='white',
               labelpad=15, fontdict={'size': 0})
_ = plt.xlabel('Nombre d\'achats de produits mensuels', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Fréquence des produits commandés',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
d = {ax1: ymax1, ax2: ymax2}
z=0
for k, v in d.items():
#for ax in [ax1, ax2]:
    z+=1
    _ = k.yaxis.set_ticklabels(['  17-29 ans', '  30-49 ans', '  50-92 ans'])
    nobs = rc['classe'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["(n: " + i +")" for i in nobs]
    pos = range(len(nobs))
    for tick, label in zip(pos, ax.get_yticklabels()):
        if z == 1:
            _ = k.text(-0.85, pos[tick]+0.25,  nobs[tick], horizontalalignment='center',
                    size='14', color='gray', weight='semibold')
        else:
            _ = k.text(-2.2, pos[tick]+0.25,  nobs[tick], horizontalalignment='center',
                    size='14', color='gray', weight='semibold')
    _ = k.tick_params(axis='y', which='major', pad=5)


lm = ols('f_com_mois ~ classe', data=rc).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['classe']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les 3 groupes d\'âges et la fréquence mensuelle de commande est de {round(esq_sm,6)}, avec une p_value de {p_value}')

Le rapport de corrélations (cf eta squared) entre les 3 groupes d'âges et la fréquence mensuelle de commande est de 0.148889, avec une p_value de 0.000


lm = ols('f_prod_mois ~ classe', data=rc).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['classe']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les 3 groupes d\'âges et la fréquence mensuelle de produits commandés est de {round(esq_sm,6)}, avec une p_value de {p_value}')

Le rapport de corrélations (cf eta squared) entre les 3 groupes d'âges et la fréquence mensuelle de produits commandés est de 0.282101, avec une p_value de 0.000


freq_col = ['f_com_mois', 'f_prod_mois']
f_tab = ['COMMANDES', 'PRODUITS commandés']
eta_age_tab = []

# On crée également un df par classe
rc_17_29 = rc.loc[rc['classe'] == 'A']
rc_30_49 = rc.loc[rc['classe'] == 'B']
rc_50_92 = rc.loc[rc['classe'] == 'C']

df=[rc, rc_17_29, rc_30_49, rc_50_92]
ages=['moins de 30 ans', '30-49 ans','50 ans et +']
pearson_tab=[]

print('############################################################################################')
print('#                             Coefficient de corrélation (Pearson)                         #')
print('############################################################################################')
print()
for j in range(2):
    for i in range(4):
        pearson = st.pearsonr(df[i]['c_age'], df[i][freq_col[j]])[0]
        pearson_tab.append(pearson)
    
    print(f'## Fréquence des {f_tab[j]}')
    print(f'  - Pour le df flobal, R = {pearson_tab[4*j+0]} (R² = {pearson_tab[4*j+0]**2})')
    print('  - Pour les différentes classes d\'ages :')
    for i in range(1, 4, 1):
        print(f'        - Pour les {ages[i-1]} : R = {pearson_tab[4*j+i]} (R² = {pearson_tab[4*j+i]**2})')
    print()
print()       
print('############################################################################################')
print('#                             Rapport de corrélation (eta squared)                         #')
print('############################################################################################')
print()
print('  Le rapport de corrélation (cf eta squared)  entre les classes d\'âges et : ')
for j in range(2):
    eta_age = eta_squared(rc["classe"], rc[freq_col[j]])
    eta_age_tab.append(eta_age)
    print(f'   - la fréquence des {f_tab[j]} : {eta_age_tab[j]}')

############################################################################################
#                             Coefficient de corrélation (Pearson)                         #
############################################################################################

## Fréquence des COMMANDES
  - Pour le df flobal, R = 0.17468217495371718 (R² = 0.03051386224656106)
  - Pour les différentes classes d'ages :
        - Pour les moins de 30 ans : R = -0.019130918414251418 (R² = 0.000365992039372744)
        - Pour les 30-49 ans : R = -0.02124321432611368 (R² = 0.00045127415490520156)
        - Pour les 50 ans et + : R = -0.03675783195523097 (R² = 0.001351138210048999)

## Fréquence des PRODUITS commandés
  - Pour le df flobal, R = 0.03521099015954457 (R² = 0.0012398138280155446)
  - Pour les différentes classes d'ages :
        - Pour les moins de 30 ans : R = -0.015497177126043449 (R² = 0.0002401624988759643)
        - Pour les 30-49 ans : R = -0.01758815702129382 (R² = 0.0003093432674056871)
        - Pour les 50 ans et + : R = -0.03244828620281044 (R² = 0.0010528912774994984)


############################################################################################
#                             Rapport de corrélation (eta squared)                         #
############################################################################################

  Le rapport de corrélation (cf eta squared)  entre les classes d'âges et : 
   - la fréquence des COMMANDES : 0.14888933253499387
   - la fréquence des PRODUITS commandés : 0.2821014819584637


ry = res_clients_year.copy()
ry['panier_moyen']=ry['montant_total']/ry['nb_commandes']
ry['nb_art_moyen']=ry['nb_articles']/ry['nb_commandes']
linear_reg_aff(ry['c_age'],ry['nb_art_moyen'])
fig = plt.figure(figsize=(14,5))
_ = sns.regplot(data=ry, x='c_age', y='nb_art_moyen', color='red', x_jitter=0.4, scatter_kws={"color": 'SteelBlue'})
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre d\'articles', color='gray',
               labelpad=15, fontdict={'size': 16})
_ = plt.title('Taille du panier moyen (base clients - panier moyen annuel)',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

Droite de régression : y = 2.3678915837374648 - 0.009β
Coef. de corrélation R : -0.22131439458781255
Coef. de détermination R² : 0.04898006125176999


pearson = st.pearsonr(ry['c_age'], ry['nb_art_moyen'])[0]
pearson

-0.2213143945878132


ra = res_clients_all_orders.copy()
linear_reg_aff(ra['c_age'],ra['nb_art_panier'])
fig = plt.figure(figsize=(14,5))
_ = sns.regplot(data=ra, x='c_age', y='nb_art_panier', color='red', x_jitter=0.1, y_jitter=0.4, scatter_kws={"color": 'SteelBlue'})
_ = plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.ylabel('Nombre d\'articles', color='gray', labelpad=15, fontdict={'size': 16})
_ = plt.title('Taille du panier (base globale - panier moyen par session) selon âge client',
              y=1.02, fontdict={'size': 20, 'weight': 500})
_ = plt.grid(linestyle='--', alpha=0.4)

Droite de régression : y = 2.685163752848047 - 0.015β
Coef. de corrélation R : -0.17783131608922845
Coef. de détermination R² : 0.03162397698202708


ry=pd.merge(ry, res_effectif_age, on='c_age', how='outer')
ry=ry.loc[ry['c_nature'] == 'particulier']
ry.sample(1)

ry_group = ry.groupby('c_age').agg({
    'effectif_age': 'min',
    'nb_art_moyen': 'mean'
}).reset_index()


ra=pd.merge(ra, res_effectif_age, on='c_age', how='outer')
ra=ra.loc[ra['c_nature'] == 'particulier']
ra.sample(1)

ra_group = ra.groupby('c_age').agg({
    'effectif_age': 'min',
    'nb_art_panier': 'mean'
}).reset_index()


sns.set_style(style='white')
fig=plt.figure(figsize=(15,8))
plt.gcf().subplots_adjust(left=0.1, bottom=0.1,
                          right=0.9, top=0.9, wspace=0.3, hspace=0.2)
ax1 = fig.add_subplot(211)
ax1=graph_droite_regression(ry_group, 'c_age', 'nb_art_moyen', 88, 2.15, 'Taille du panier moyen (base clients) selon âge', '', 'Nombre d\'articles', 
                          'effectif_age', (20,300), titre_leg="Effectif Age",
                         leg_x=0, leg_y=0.01, loc_leg='lower left', axe="ax1", droite_reg=False)
plt.grid(linestyle='--', alpha=0.4)
plt.xticks([])
plt.ylim(1,2.8)

ax2 = fig.add_subplot(212)
graph_droite_regression(ra_group, 'c_age', 'nb_art_panier',88, 2.15, 'Taille du panier (base globale - panier moyen par session) selon âge client', '', 'Nombre d\'articles', 
                          'effectif_age', (20,300), titre_leg="Effectif Age",
                         leg_x=0, leg_y=0.01, loc_leg='lower left', axe="ax2", droite_reg=False)
plt.grid(linestyle='--', alpha=0.4)
plt.ylim(1,2.8)
plt.xticks(np.arange(0, 100, 10))
plt.show();


for df in [ry, ra]:
    df['classe'] = df['c_age'].apply(lambda x: classe(x))


fig = plt.figure(figsize=(16, 5))
fig.suptitle("Nombre d'articles", y=-0.03, size=16, color='gray')
meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')
plt.gcf().subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.05, hspace=0)
ax1 = fig.add_subplot(121)

# affichage des boxplots
ax1 = sns.boxplot(data=ry, y='classe', width=0.6, orient='h', order=['A', 'B', 'C'], x='nb_art_moyen',
                  meanprops=meanpointprops2, showfliers=False, showmeans=True, palette="deep")
plt.ylabel('', color='white', fontdict={'size': 0})
plt.xlabel('')
plt.title('Taille du panier moyen (clients)', y=1.02, fontdict={'size': 20, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=ra, y='classe',width=0.6, orient='h', order=['A', 'B', 'C'], x='nb_art_panier',
                   meanprops=meanpointprops2, showfliers=False, showmeans=True, palette='deep')
plt.ylabel('')
plt.xlabel('')
plt.title('Taille moyen des paniers (sessions)', y=1.02, fontdict={'size': 20, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)
plt.yticks([])


ax1.yaxis.set_ticklabels(['  17-29 ans', '  30-49 ans', '  50-92 ans'])
nobs = ry['classe'].value_counts().values
nobs = [str(x) for x in nobs.tolist()]
nobs = ["(n: " + i +")" for i in nobs]
pos = range(len(nobs))
for tick, label in zip(pos, ax.get_yticklabels()):
    ax1.text(0.65, pos[tick]+0.25,  nobs[tick], horizontalalignment='center',
            size='14', color='gray', weight='semibold')
ax1.tick_params(axis='y', which='major', pad=5)

plt.show();


ry_1 = ry.loc[ry['classe'] == 'A']
ry_2 = ry.loc[ry['classe'] == 'B']
ry_3 = ry.loc[ry['classe'] == 'C']
ry_tab = [ry_1, ry_2, ry_3]
lab = ['- de 30 ans', '30-49 ans', '50 ans et +']

print('Pour chaque groupe d\'âges, le coefficient de Pearson est le suivant :')
for i in range(3):
    R = corr_coef(ry_tab[i]['c_age'], ry_tab[i]['nb_art_moyen'])
    print(f'   - classe des "{lab[i]}" : {R}')

Pour chaque groupe d'âges, le coefficient de Pearson est le suivant :
   - classe des "- de 30 ans" : 0.008719376830199696
   - classe des "30-49 ans" : 0.01895020107299685
   - classe des "50 ans et +" : 0.017889315976885367


lm = ols('nb_art_moyen ~ classe', data=ry).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['classe']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les 3 groupes d\'âges et la taille du panier moyen (panier client) est de  {round(esq_sm,6)}, avec une p_value de {p_value}')

Le rapport de corrélations (cf eta squared) entre les 3 groupes d'âges et la taille du panier moyen (panier client) est de  0.366039, avec une p_value de 0.000


lm = ols('nb_art_panier ~ classe', data=ra).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
p_value='{:.3f}'.format(float(table['PR(>F)']['classe']))
table
print(f'Le rapport de corrélations (cf eta squared) entre les 3 groupes d\'âges et la taille moyenne des paniers de session est de {round(esq_sm,6)}, avec une p_value de {p_value}')

Le rapport de corrélations (cf eta squared) entre les 3 groupes d'âges et la taille moyenne des paniers de session est de 0.128288, avec une p_value de 0.000


f_cat = res_clients_all_orders_part.copy()
f_cat.sample(1)
f_cat = f_cat[['c_id', 's_panier_id', 'c_sex', 'c_age',
               'nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2']]
f_cat_0 = f_cat[['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art_cat_0']]
f_cat_0['cat'] = '0'
f_cat_0.columns = ['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art', 'categ']
f_cat_1 = f_cat[['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art_cat_1']]
f_cat_1['cat'] = '1'
f_cat_1.columns = ['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art', 'categ']
f_cat_2 = f_cat[['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art_cat_2']]
f_cat_2['cat'] = '2'
f_cat_2.columns = ['c_id', 's_panier_id', 'c_sex', 'c_age', 'nb_art', 'categ']
fc = pd.concat([f_cat_0, f_cat_1, f_cat_2], ignore_index=True)
fc = fc.reindex(columns=['c_id', 's_panier_id', 'c_sex', 'c_age', 'categ', 'nb_art'])
fc = fc[fc['nb_art'] != 0]
fc.sample(5)


fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(111)
sns.set_palette('turbo')
meanpointprops = dict(marker='8', markersize=6, markeredgecolor='firebrick', markerfacecolor='firebrick')
meanpointprops2 = dict(marker='8', markersize=4, markeredgecolor='firebrick', markerfacecolor='firebrick')
# affichage des boxplots
ax = sns.boxplot(data=fc, y='categ', x='c_age', width=0.6, orient='h', showfliers=False, 
                  meanprops=meanpointprops, meanline=False, showmeans=True)

plt.xlabel('Age', color='grey', labelpad=15, fontdict={'size': 16})
plt.ylabel('', color='white', labelpad=15, fontdict={'size': 0})
plt.title('Les catégories de produits achetés par âge des clients',
              y=1.02, fontdict={'size': 20, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)

# affichage des effectifs de chaque classe
ax.yaxis.set_ticklabels(['  Catégorie 0', '  Catégorie 1', '  Catégorie 2'])
nobs = r_montant['classe'].value_counts().values
nobs = [str(x) for x in nobs.tolist()]
nobs = ["(n: " + i + ")" for i in nobs ]
pos = range(len(nobs))
for tick, label in zip(pos, ax.get_yticklabels()):
    ax.text(8.5, pos[tick]+0.17, nobs[tick], horizontalalignment='center',
                size=14, color='gray', weight='semibold')
ax.tick_params(axis='y', which='major', pad=5)
plt.show();


lm = ols('c_age ~ categ', data=fc).fit()
table = sm.stats.anova_lm(lm)
esq_sm = table['sum_sq'][0]/(table['sum_sq'][0]+table['sum_sq'][1])
table['EtaSq'] = [esq_sm, 'NaN']
table
print()
p_value='{:.3f}'.format(float(table['PR(>F)']['categ']))
print(f'Le rapport de corrélation (cf eta squared) entre les catégories de produits et l\'âge des clients est de {round(esq_sm,6)}, avec une p_value de {p_value}')

Le rapport de corrélation (cf eta squared) entre les catégories de produits et l'âge des clients est de 0.121612, avec une p_value de 0.000


fc['nb_art_commande']=fc.groupby(['c_age', 'categ'])['nb_art'].transform(lambda x: x.sum())
fc
fcg=fc.groupby(['c_age', 'categ', 'nb_art_commande'])['nb_art'].sum().reset_index()
fcg=pd.merge(fcg, res_effectif_age, on='c_age')
fcg['nb_art_com_moyen'] = fcg['nb_art_commande']/fcg['effectif_age']
fcg.columns = ['c_age', 'Catégorie', 'Nb Total d\'Art par Cat', 'nb_art', 'effectif_age', 'nb_art_com_moyen']
fcg_aff = fcg[['c_age', 'Catégorie', 'nb_art', 'effectif_age', 'nb_art_com_moyen']]
fcg_aff.columns = ['age', 'categ', 'nb_art_cat_total', 'effectif_age', 'nb_art_cat_moyen']
fcg_aff = fcg_aff.reindex(columns=['categ', 'age', 'effectif_age', 'nb_art_cat_total', 'nb_art_cat_moyen'])
fcg_aff.sort_values('age').head(7)


plt.rc('legend', fontsize='small')
sns.relplot(data=fcg, x='c_age', y='nb_art_com_moyen', hue='Catégorie',  style='Catégorie',
    kind='scatter', palette='Set1', alpha=.8, size='Nb Total d\'Art par Cat', sizes=(50, 300), height=5, aspect=14/5)
plt.xlabel('Age', color='gray', labelpad=15, fontdict={'size': 16})
plt.ylabel('Nombre de produits achetés', color='gray',
    labelpad=15, fontdict={'size': 16})
plt.title('Catégories de produits achetés en fonction de l\'âge (clients non pro)',
    y=1.02, fontdict={'size': 20, 'weight': 500})
plt.grid(linestyle='--', alpha=0.4)
plt.show();


r_cat = res_clients_year_part.copy()
r_cat['classe'] = r_cat['c_age'].apply(lambda x: classe(x))
r_cat_0 = r_cat[['c_id', 'c_sex', 'c_age', 'nb_art_cat_0', 'classe']]
r_cat_1 = r_cat[['c_id', 'c_sex', 'c_age', 'nb_art_cat_1', 'classe']]
r_cat_2 = r_cat[['c_id', 'c_sex', 'c_age', 'nb_art_cat_2', 'classe']]
cat_tab = [r_cat_0, r_cat_1, r_cat_2]
nb_art_cat_tab = ['nb_art_cat_0', 'nb_art_cat_1', 'nb_art_cat_2']
r_cat.sample(1)


fig = plt.figure(figsize=(24, 5))
fig.suptitle('Répartition du panier moyen par catégorie (base clients)', y=1.08, color='darkred', size='large')
sns.set_palette('Set2')
meanpointprops = dict(marker='8', markersize=4, markeredgecolor='tomato', markerfacecolor='tomato')
for i in range(3):
    cat = cat_tab[i]
    nb_art_cat = nb_art_cat_tab[i]

    if i == 0:
        ax1 = fig.add_subplot(131)
        ax1 = sns.boxplot(data=cat, x='classe', order=['A', 'B', 'C'], y=nb_art_cat,
                          width=0.6, showfliers=False, showmeans=True, meanprops=meanpointprops)
    if i == 1:
        ax2 = fig.add_subplot(132)
        ax2 = sns.boxplot(data=cat, x='classe', order=['A', 'B', 'C'], y=nb_art_cat,
                          width=0.6, showfliers=False, showmeans=True, meanprops=meanpointprops)
    if i == 2:
        ax3 = fig.add_subplot(133)
        ax3 = sns.boxplot(data=cat, x='classe', order=['A', 'B', 'C'], y=nb_art_cat,
                          width=0.6, showfliers=False, showmeans=True, meanprops=meanpointprops)

    # affichage des boxplots
    if i == 0:
        plt.ylabel('Nombre d\'articles', color='gray', labelpad=10, fontdict={'size': 14})
    else:
        plt.ylabel('', color='white', fontdict={'size': 0})

    plt.xlabel('')
    plt.title('Articles de catégorie '+str(i) +' dans le panier moyen', y=1.02, fontdict={'size': 16, 'weight': 500})
    plt.grid(linestyle='--', alpha=0.4)

    # affichage des effectifs de chaque classe
for ax in [ax1, ax2, ax3]:
    pos = 0
    ax.xaxis.set_ticklabels(['17-29 ans', '30-49 ans', '50-92 ans'])
    nobs = r_cat_0['classe'].value_counts().values
    nobs = [str(x) for x in nobs.tolist()]
    nobs = ["n: " + i for i in nobs]
    pos = range(len(nobs))
    ax.tick_params(axis='x', which='major', pad=25)

for tick, label in zip(pos, ax.get_xticklabels()):
    ax1.text(x=pos[tick], y=-10.0, s=nobs[tick], ha='center',
                 va='top', size=14, color='gray', weight='semibold')
    ax2.text(x=pos[tick], y=-10/3, s=nobs[tick], ha='center',
                 va='top', size=14, color='gray', weight='semibold')
    ax3.text(x=pos[tick], y=-10/3/2.3, s=nobs[tick], ha='center',
                 va='top', size=14, color='gray', weight='semibold')

plt.show();


fc['classe'] = fc['c_age'].apply(lambda x: classe(x))
fc


df = fc
X = 'classe'
Y = 'categ'
cont = df[[X, Y]].pivot_table(
    index=X, columns=Y, aggfunc=len, margins=True, margins_name="Total")
cont
cont_test=cont.iloc[:-1, :-1]
cont_test
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
n = len(df)
indep = tx.dot(ty) / n

c = cont.fillna(0)  # On remplace les valeurs nulles par 0
measure = (c-indep)**2/indep
xi_n = measure.sum().sum()
table = measure/xi_n

plt.style.use('seaborn-deep')
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

ax = sns.heatmap(table.iloc[:-1, :-1], annot=True,
                annot_kws={'size': 14})

plt.title("Relation entre classes d\'âges et catégories de produits achetés", y=1.03, fontdict={'size': 18, 'weight': 500, 'color': 'darkred'})
plt.xlabel("")
plt.ylabel("")
ax.set_yticklabels(["17-29\nans", "30-49\nans", "50-92\nans"], x=-0.05, fontdict={'rotation':'horizontal', 'horizontalalignment':'center', 'verticalalignment': 'center'})
ax.set_xticklabels(["Produits\nCatégorie 0", "Produits\nCatégorie 1", "Produits\nCatégorie 2"], y=-0.02, fontdict={'ha':'center', 'va':'center'})
plt.show();


x=table.iloc[:-1, :-1]


st.chi2_contingency(cont_test)

(80058.50215280065,
 0.0,
 4,
 array([[12523.92667708, 10350.07740468,  1601.99591824],
        [52357.34079531, 43269.37899805,  6697.28020664],
        [31393.73252761, 25944.54359727,  4015.72387512]]))


st_chi2, st_p, st_dof, st_exp = st.chi2_contingency(cont_test)


print(f'La valeur du Khi-2 est de {st_chi2}, avec une p_value de {st_p} : la valeur du Khi-2 obtenue est donc significative :')
print('  -> on rejette donc l\'hypothèse HO d\'indépendance des variables au profit de l\'alternative H1 :')
print("  -> Les variables 'âge' et 'catégories de produits' sont dépendantes")

La valeur du Khi-2 est de 80058.50215280065, avec une p_value de 0.0 : la valeur du Khi-2 obtenue est donc significative :
  -> on rejette donc l'hypothèse HO d'indépendance des variables au profit de l'alternative H1 :
  -> Les variables 'âge' et 'catégories de produits' sont dépendantes


v=sqrt(st_chi2/(188154*2))
print(f'Le V de Cramer est ici égal à {v}')
print("  -> on peut donc qualifier la corrélation entre les variables 'classes d'âges' et 'catégories de produits' comme forte")

Le V de Cramer est ici égal à 0.46124535398575767
  -> on peut donc qualifier la corrélation entre les variables 'classes d'âges' et 'catégories de produits' comme forte

	id_prod	price	categ
0	0_1421	19.99	0
1	0_1368	5.13	0
2	0_731	17.99	0
4	0_1507	3.99	0
5	0_1163	9.99	0
...	...	...	...
3280	0_1314	20.63	0
3281	0_607	14.99	0
3283	0_146	17.14	0
3284	0_802	11.22	0
3286	0_1920	25.16	0

	Nombre	Part	Prix Moyen
Categorie 0	2309	70.25%	11.73€
Categorie 1	739	22.5%	25.53€
Categorie 2	239	7.25%	108.35€
Total	3287	100%	21.86€

	Nb de références	Prix Moyen
Catégorie 0	17	20.080
Catégorie 1	2	35.775
Catégorie 2	3	141.320

	count	mean	std	min	25%	50%	75%	max
categ
0	2309.0	11.732795	7.564116	0.62	5.590	10.32	16.65	40.99
1	739.0	25.531421	15.425162	2.00	13.390	22.99	33.99	80.99
2	239.0	108.354686	49.561431	30.99	71.065	101.99	136.53	300.00


	Mode	Médiane	Moyenne
Catégorie 0	4.99	10.32	11.73
Catégorie 1	22.99	22.99	25.53
Catégorie 2	50.99	101.99	108.35
Total	4.99	13.07	21.86

	count	mean	std	min	25%	50%	75%	max
price	3287.0	21.860515	29.845766	0.62	6.99	13.06	22.99	300.0
categ	3287.0	0.370246	0.615387	0.00	0.00	0.00	1.00	2.0

	annee	mois	quantite	CA
0	2021.0	3.0	28568	479417.625152
1	2021.0	4.0	28415	473144.979125
2	2021.0	5.0	28245	489461.812357
3	2021.0	6.0	26819	481340.069562
4	2021.0	7.0	24716	480945.591178
5	2021.0	8.0	25619	479410.785152
6	2021.0	9.0	33266	502912.073535
7	2021.0	10.0	21584	319242.089562
8	2021.0	11.0	28277	513007.497946
9	2021.0	12.0	32424	523019.809562
10	2022.0	1.0	29316	523013.973973
11	2022.0	2.0	29567	532869.940741

categ	0	1	2
semaine
36	52412.621179	43000.57	19244.25
37	58069.885589	45176.99	13890.91
38	63233.178384	45377.30	12751.62
39	59548.660000	33918.16	13643.64
40	47225.795589	NaN	17161.00
41	46985.742795	NaN	19863.59
42	42328.738384	NaN	21137.58
43	41890.682795	26758.53	21103.85
44	38753.285589	52547.98	24181.45

	s_date	p_prix	mois	m
0	2021-03-01	16427.972795	3	Mars-21
1	2021-03-02	15349.202795	3	Mars-21
2	2021-03-03	14751.750000	3	Mars-21
3	2021-03-04	15047.090000	3	Mars-21
4	2021-03-05	17173.410000	3	Mars-21
...	...	...	...	...
360	2022-02-24	20201.452795	2	Fev-22
361	2022-02-25	18205.425589	2	Fev-22
362	2022-02-26	19759.580000	2	Fev-22
363	2022-02-27	19021.830000	2	Fev-22
364	2022-02-28	18721.022795	2	Fev-22

	date	day
215	2021-10-02	Samedi
216	2021-10-03	Dimanche
217	2021-10-04	Lundi
218	2021-10-05	Mardi
219	2021-10-06	Mercredi
220	2021-10-07	Jeudi
221	2021-10-08	Vendredi
222	2021-10-09	Samedi
223	2021-10-10	Dimanche
224	2021-10-11	Lundi
225	2021-10-12	Mardi
226	2021-10-13	Mercredi
227	2021-10-14	Jeudi
228	2021-10-15	Vendredi
229	2021-10-16	Samedi
230	2021-10-17	Dimanche
231	2021-10-18	Lundi
232	2021-10-19	Mardi
233	2021-10-20	Mercredi
234	2021-10-21	Jeudi
235	2021-10-22	Vendredi
236	2021-10-23	Samedi
237	2021-10-24	Dimanche
238	2021-10-25	Lundi
239	2021-10-26	Mardi
240	2021-10-27	Mercredi

	date	nb_p0	nb_p1	nb_p2	ca_p0	ca_p1	ca_p2	nb_total	ca_total
220	2021-10-07 00:00:00	597.00	337.78	26.00	6404.01	6851.39	1787.07	960.78	15042.47
221	2021-10-08 00:00:00	669.00	336.74	44.00	7069.53	6825.99	3137.82	1049.74	17033.34
222	2021-10-09 00:00:00	640.00	335.70	35.00	6808.69	6800.59	2616.67	1010.70	16225.95

	date	p_prix	mois	m
214	2021-10-01	16909.360000	10	Oct-21
215	2021-10-02	9033.150000	10	Oct-21
216	2021-10-03	8844.070000	10	Oct-21
217	2021-10-04	9154.165589	10	Oct-21
218	2021-10-05	9390.460000	10	Oct-21
219	2021-10-06	9341.710000	10	Oct-21
220	2021-10-07	8191.080000	10	Oct-21
221	2021-10-08	10207.350000	10	Oct-21
222	2021-10-09	9425.360000	10	Oct-21
223	2021-10-10	8676.670000	10	Oct-21
224	2021-10-11	10230.560000	10	Oct-21
225	2021-10-12	8822.170000	10	Oct-21
226	2021-10-13	9427.220000	10	Oct-21
227	2021-10-14	9320.800000	10	Oct-21
228	2021-10-15	9261.582795	10	Oct-21
229	2021-10-16	10229.020000	10	Oct-21
230	2021-10-17	9557.980000	10	Oct-21
231	2021-10-18	9083.720000	10	Oct-21
232	2021-10-19	8625.175589	10	Oct-21
233	2021-10-20	9032.922795	10	Oct-21
234	2021-10-21	9571.790000	10	Oct-21
235	2021-10-22	8318.880000	10	Oct-21
236	2021-10-23	8910.640000	10	Oct-21
237	2021-10-24	9923.190000	10	Oct-21
238	2021-10-25	8076.040000	10	Oct-21
239	2021-10-26	8883.542795	10	Oct-21
240	2021-10-27	9789.530000	10	Oct-21
241	2021-10-28	14758.420000	10	Oct-21

	c_id	c_sex	c_age	c_birth	p_id	p_categ	p_prix	s_panier_id	s_id_date	s_year	s_month	s_week	s_day_n	s_day	s_hour	s_minute	s_date
334191	c_7527	m	56.0	1965.0	1_162	1.0	19.58	s_161799	2022-02-08 07:00:31.685907	2022.0	2.0	6.0	Mardi	8.0	7.0	0.0	2022-02-08
267861	c_7889	f	39.0	1982.0	1_462	1.0	16.66	s_2211	2021-03-05 19:07:51.165399	2021.0	3.0	9.0	Vendredi	5.0	19.0	7.0	2021-03-05
13419	c_8213	f	43.0	1978.0	0_1383	0.0	12.99	s_56375	2021-07-01 10:59:05.720043	2021.0	7.0	26.0	Jeudi	1.0	10.0	59.0	2021-07-01

	c_age	total	f	m
0	17.0	437	231	206
1	18.0	145	65	80
2	19.0	145	65	80
3	20.0	125	70	55
4	21.0	136	78	58
...	...	...	...	...
71	88.0	8	5	3
72	89.0	6	5	1
73	90.0	4	2	2
74	91.0	4	3	1
75	92.0	3	1	2

	c_id	c_age	f	m
0	c_1	66.0	NaN	19.0
1	c_10	65.0	NaN	28.0
2	c_100	29.0	NaN	6.0
3	c_1000	55.0	53.0	NaN
4	c_1001	39.0	NaN	52.0
...	...	...	...	...
8593	c_995	66.0	NaN	8.0
8594	c_996	51.0	41.0	NaN
8595	c_997	27.0	22.0	NaN
8596	c_998	20.0	NaN	28.0
8597	c_999	57.0	NaN	20.0

	count	mean	std	min	25%	50%	75%	max
c_age	8598.0	42.739591	16.909801	17.00	29.0000	42.000	55.0000	92.00
nb_commandes	8598.0	18.335427	66.536835	1.00	7.0000	12.000	23.0000	5042.00
nb_articles	8598.0	36.663410	144.875160	1.00	13.0000	24.000	44.0000	11839.00
montant_total	8598.0	637.188202	2417.248311	4.15	260.9825	475.675	822.5025	150729.07
nb_art_cat_0	8598.0	22.187834	107.144057	0.00	4.0000	11.000	26.0000	9303.00
nb_art_cat_1	8598.0	12.569086	40.788855	0.00	5.0000	9.000	16.0000	2535.00
nb_art_cat_2	8598.0	1.906490	17.196863	0.00	0.0000	0.000	1.0000	1558.00
montant_cat_0	8598.0	236.291121	1141.198347	0.00	41.9250	120.815	276.2775	99015.95
montant_cat_1	8598.0	257.457791	835.691078	0.00	93.7200	182.340	334.5750	51671.81
montant_cat_2	8598.0	143.439289	1306.427120	0.00	0.0000	0.000	110.0550	118353.71

	client_id	id_panier	categ	quantite	montant	date_session
0	c_1609	s_10008	0.0	1	9.04	2021-03-22 18:16:59.411694
1	c_1609	s_1003	0.0	1	12.99	2021-03-03 02:51:47.619607

	client_id	id_panier	categ	quantite	montant	date_session
0	c_4958	s_1008	2.0	1	48.99	2021-03-03 02:55:54.945116
1	c_4958	s_10132	2.0	1	68.99	2021-03-23 00:29:40.871562

	client_id	id_panier	categ	quantite	montant	date_session
0	c_6714	s_10014	0.0	2	24.91	2021-03-22 18:37:16.261354
1	c_6714	s_10062	0.0	3	31.11	2021-03-22 20:56:21.017969

	c_id	c_sex	c_age	s_panier_id	s_id_date	nb_art_panier	montant_panier	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2
157029	c_963	f	33.0	s_28038	2021-04-30 17:04:12.673892	2.0	41.78	1.0	1.0	0.0	17.79	23.99	0.00
123536	c_6972	f	17.0	s_172051	2022-02-28 06:20:12.369988	1.0	62.83	0.0	0.0	1.0	0.00	0.00	62.83

	c_id	c_sex	c_age	nb_commandes	nb_articles	montant_total	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2
5593	c_6045	m	54.0	11	13.0	241.29	4.0	9.0	0.0	30.42	210.87	0.00
464	c_1417	m	19.0	3	4.0	163.30	0.0	2.0	2.0	0.00	38.52	124.78

	client_id	id_panier	categ	quantite	montant	date_session
0	c_3454	s_10055	1.0	1	12.19	2021-03-22 20:32:28.136961
1	c_3454	s_10065	1.0	2	49.27	2021-03-22 21:03:54.822285

					s_id_date	nb_art_panier	montant_panier	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2
c_nature	c_id	c_sex	c_age	s_panier_id
particulier	c_1	m	66.0	s_114737	2021-11-04 17:28:13.934070	5.0	92.62	4.0	0.0	1.0	37.75	0.00	54.87
				s_120172	2021-11-15 20:40:00.586010	2.0	44.29	0.0	2.0	0.0	0.00	44.29	0.00
				s_134971	2021-12-15 23:32:41.632729	1.0	10.30	0.0	1.0	0.0	0.00	10.30	0.00
				s_136532	2021-12-19 02:44:12.827475	1.0	13.78	1.0	0.0	0.0	13.78	0.00	0.00
				s_139610	2021-12-25 03:53:43.623598	1.0	16.99	0.0	1.0	0.0	0.00	16.99	0.00
...	...	...	...	...	...	...	...	...	...	...	...	...	...
professionnel	c_6714	f	53.0	s_97533	2021-09-29 18:05:03.844078	7.0	93.37	6.0	1.0	0.0	74.38	18.99	0.00
				s_97726	2021-09-30 03:27:58.500427	6.0	72.52	5.0	1.0	0.0	50.53	21.99	0.00
				s_97791	2021-09-30 06:06:51.038156	8.0	121.43	4.0	4.0	0.0	39.42	82.01	0.00
				s_97978	2021-09-30 15:05:55.449774	3.0	32.12	2.0	1.0	0.0	14.01	18.11	0.00
				s_97988	2021-09-30 15:36:43.001780	5.0	73.19	3.0	2.0	0.0	22.21	50.98	0.00

	p_id	p_categ	montant_f	montant_h	montant_total
3068	2_135	2.0	16350.63	15591.74	31942.37
3043	2_112	2.0	13311.29	16081.66	29392.95
3032	2_102	2.0	12241.98	14903.28	27145.26
3150	2_209	2.0	12388.23	13298.10	25686.33
2589	1_369	1.0	12978.59	12594.75	25573.34
...	...	...	...	...	...
1446	0_234	0.0	76.29	0.00	0.00
2014	0_750	0.0	71.22	0.00	0.00
1498	0_281	0.0	43.98	0.00	0.00
517	0_147	0.0	11.96	0.00	0.00
1808	0_565	0.0	0.00	5.97	0.00

	c_age	nb_art_panier	montant_panier	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2
count	157648.000000	157648.000000	157648.000000	157648.000000	157648.000000	157648.000000	157648.000000	157648.000000	157648.000000
mean	44.951994	1.999594	34.751752	1.210107	0.685508	0.103978	12.887135	14.041549	7.823068
std	14.946958	1.281880	32.065671	1.257211	0.770708	0.362481	14.424640	16.999662	30.082457
min	17.000000	1.000000	0.620000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	35.000000	1.000000	15.870000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	43.000000	2.000000	25.990000	1.000000	1.000000	0.000000	9.280000	11.760000	0.000000
75%	54.000000	3.000000	43.510000	2.000000	1.000000	0.000000	19.630000	23.830000	0.000000
max	92.000000	14.000000	539.230000	10.000000	8.000000	5.000000	113.010000	181.550000	539.230000

	c_age	nb_art_panier	montant_panier	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2
count	147046.000000	147046.000000	147046.000000	147046.000000	147046.000000	147046.000000	147046.000000	147046.000000	147046.000000
mean	45.180440	1.996709	34.468900	1.209526	0.687227	0.099955	12.879701	14.075142	7.514057
std	15.201614	1.280324	31.672006	1.254715	0.770591	0.355512	14.400977	17.006705	29.463633
min	17.000000	1.000000	0.620000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	34.000000	1.000000	15.810000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	43.000000	2.000000	25.990000	1.000000	1.000000	0.000000	9.280000	11.760000	0.000000
75%	55.000000	3.000000	43.250000	2.000000	1.000000	0.000000	19.570000	23.830000	0.000000
max	92.000000	14.000000	539.230000	10.000000	8.000000	5.000000	113.010000	181.550000	539.230000

	annee	mois	montant_panier	nb_art_panier
0	2021	3	33.759629	2.011828
1	2021	4	33.875618	2.034364
2	2021	5	34.555025	1.993999
3	2021	6	34.799428	1.938706
4	2021	7	35.457510	1.822068
5	2021	8	35.268555	1.885029
6	2021	9	33.870291	2.240385
7	2021	11	35.125693	1.936405
8	2021	12	33.610240	2.083483
9	2022	1	34.924300	1.957599
10	2022	2	37.169907	2.062299

	Source	SS	DF	MS	F	p-unc	n2
0	classe	2.760389e+06	2	1.380194e+06	6872.676357	0.0	0.61538
1	Within	1.725274e+06	8591	2.008234e+02	NaN	NaN	NaN

	c_nature	c_id	c_sex	c_age	s_panier_id	s_id_date	nb_art_panier	montant_panier	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2	classe
39331	particulier	c_2811	f	34.0	s_55902	2021-06-30 09:37:26.556751	2.0	14.31	2.0	0.0	0.0	14.31	0.00	0.0	B
118812	particulier	c_6720	m	57.0	s_152458	2022-01-20 06:04:47.086987	1.0	27.99	0.0	1.0	0.0	0.00	27.99	0.0	C

Table des matières

Paramètres config¶

Import des données¶

Mission 2 : Analyse des données¶

1. Etude de l'offre : produits et CA¶

1.1 Analyse des produits proposés¶

1.1.1 Données générales¶

1.1.2 Répartition des prix¶

1.1.2.1 Mesures de tendance centrale¶

1.1.2.1 Mesures de dispersion¶

1.2 Evolution du CA global¶

1.3 Analyse des ventes pour le mois d'octobre 2021¶

1.3.1 Constat : forte baisse du CA¶

1.3.2 Analyse par catégories¶

1.3.3 Estimation des données manquantes¶

1.3.4 Suppression du mois d'octobre dans le df global¶

1.4 Analyse du CA par catégories de produits¶

1.4.1 Répartition du CA par catégories¶

1.4.2 Evolution du CA par catégories¶

1.5 Les "meilleurs" produits¶

1.5.1 Produits les plus vendus, en volume et en valeur, pour tous les clients¶

1.5.2 Les meilleurs produits, en volume et en valeur, en fonction du sexe¶

2 Etude de la demande : clients et achats¶

2.1 Données générales¶

2.2 Analyse des achats¶

2.2.1 Création des df clients¶

2.2.2 Total des achats au cours de l'année¶

2.2.2.1 Les achats en fonction du prix de vente¶

2.2.2.2 Les clients 'premiums'¶

2.2.2.3 Analyse de la concentration du CA¶

2.2.2 Répartition des achats au cours de l'année¶

2.3 Le panier moyen¶

2.3.1 Données générales¶

2.3.2 Relation entre âge des clients et le montant du panier moyen¶

2.3.2.1 Base clients : panier moyen des clients¶

2.3.2.2 Base globale : panier moyen de session¶

2.3.3 Relation entre la classe d'âge des clients et le montant du panier¶

2.3.3.1 Base clients (panier moyen des clients)¶

2.3.3.2 Base globale (panier moyen de session)¶

2.4 La fréquence d'achat (en terme de commandes par client)¶

2.4.1 Relation entre l'âge des clients et la fréquence d'achat¶

2.4.1 Relation entre classes d'âge des clients et la fréquence d'achat¶

Mission 3 : Demandes spécifiques¶

1 Etude de la corrélation entre le sexe des clients et les catégories de produits achetés¶

2 Etude de la corrélation entre l'âge des clients et diverses variables :¶

2.1 Le montant total des achats¶

2.1.1 Relation entre âge des clients et montant total des achats¶

2.1.2 Relation entre classe d'âge des clients et montant total des achats¶

2.2 La fréquence d’achat¶

2.2.1 Relation entre âge des clients et la fréquence d'achat¶

2.2.1.1 Fréquence des commandes¶

2.2.1.2 Fréquence des produits¶

2.2.2 Relation entre classes d'âges des clients et la fréquence d'achat¶

2.3 La taille du panier moyen (en nombre d'articles)¶

2.3.1 Relation entre âge des clients et la taille du panier moyen¶

2.3.1.1 Base clients : panier moyen des clients¶

2.3.1.2 Base globale : panier moyen de session¶

2.3.1.3 Détermination de 3 groupes d'âges aux profils différents¶

2.3.2 Relation entre la classe d'âge des clients et le montant du panier¶

2.4 Les catégories de produits achetés¶

2.4.1 Relation entre âge des clients et catégorie de produits achetés¶

2.4.2 Relation entre classe d'âges des clients et catégories de produits achetés¶

	count	mean	std	min	25%	50%	75%	max
c_age	8594.0	42.739935	16.911589	17.00	29.0000	42.000	55.0000	92.000000
nb_commandes	8594.0	17.110310	14.136932	1.00	7.0000	12.000	23.0000	73.000000
nb_articles	8594.0	34.164301	31.205225	1.00	13.0000	24.000	44.0000	184.000000
montant_total	8594.0	589.773543	432.282718	4.15	260.9475	475.475	822.1775	2436.232795
nb_art_cat_0	8594.0	20.695369	25.245492	0.00	4.0000	11.000	26.0000	144.000000
nb_art_cat_1	8594.0	11.758669	9.618279	0.00	5.0000	9.000	16.0000	62.000000
nb_art_cat_2	8594.0	1.710263	3.491016	0.00	0.0000	0.000	1.0000	27.000000
montant_cat_0	8594.0	220.375666	269.658382	0.00	41.9250	120.800	276.1075	1606.682795
montant_cat_1	8594.0	240.830036	199.117796	0.00	93.6650	182.175	334.2450	1299.070000
montant_cat_2	8594.0	128.567840	266.187479	0.00	0.0000	0.000	109.3800	1934.540000

	names	coef	se	T	pval	r2	adj_r2	CI[2.5%]	CI[97.5%]
0	Intercept	9.09864	0.61069	14.89890	0.00000	0.00037	-0.00008	7.90107	10.29621
1	c_age	-0.02438	0.02678	-0.91044	0.36269	0.00037	-0.00008	-0.07689	0.02813

	c_nature	c_id	c_sex	c_age	nb_commandes	nb_articles	montant_total	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	montant_cat_0	montant_cat_1	montant_cat_2	effectif_age	classe	panier_moyen
1279	particulier	c_8123	f	27.0	3	4.0	203.88	0.0	1.0	3.0	0.00	17.99	185.89	156	A	67.960000
825	particulier	c_5046	m	48.0	16	50.0	695.30	33.0	17.0	0.0	354.55	340.75	0.00	151	B	43.456250
2461	particulier	c_1874	m	25.0	6	19.0	471.11	6.0	9.0	4.0	56.26	195.42	219.43	179	A	78.518333
7160	particulier	c_6503	m	47.0	21	50.0	609.37	41.0	9.0	0.0	432.69	176.68	0.00	160	B	29.017619
3823	particulier	c_1352	f	45.0	10	26.0	384.45	15.0	10.0	1.0	134.32	200.52	49.61	161	B	38.445000

p_categ	0.0	1.0	2.0	Total
c_sex
f	92331	52993	7575	152899
m	85525	48061	7123	140709
Total	177856	101054	14698	293608

	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	total_sex
f	92331.0	52993.0	7575.0	152899.0
m	85525.0	48061.0	7123.0	140709.0
total_cat	177856.0	101054.0	14698.0	293608.0

	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	total_sex
f	92620.107572	52624.77707	7654.115358	152899.0
m	85235.892428	48429.22293	7043.884642	140709.0
total_cat	177856.000000	101054.00000	14698.000000	293608.0

	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2
f	-289.107572	368.22293	-79.115358
m	289.107572	-368.22293	79.115358

	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2
f	83583.188188	135588.126099	6259.239854
m	83583.188188	135588.126099	6259.239854

	nb_art_cat_0	nb_art_cat_1	nb_art_cat_2	total_sex
f	0.90243	2.576507	0.817761	4.296699
m	0.98061	2.799717	0.888606	4.668934
total_cat	1.88304	5.376225	1.706368	8.965633

	df	sum_sq	mean_sq	F	PR(>F)	EtaSq
ind2	1.0	5.141044e+07	5.141044e+07	284.182276	9.329320e-63	0.032016
Residual	8592.0	1.554349e+09	1.809066e+05	NaN	NaN	NaN