# importation des différents modules
import statistics
import seaborn as sns
import matplotlib
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
import scipy.stats as st
import scipy as sp
import math
from math import sqrt
import numpy as np
import pandas as pd
from PIL import Image
import os, glob
from IPython.core.interactiveshell import InteractiveShell
from IPython.core.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('seaborn-white')
%matplotlib inline
import matplotlib
matplotlib.rcParams.update(matplotlib.rcParamsDefault)
pd.options.mode.chained_assignment = None # default='warn'
%%html
<style>
.container{
width: 80% !important;
margin-left: 10% !important;
margin-right: 10% !important;
}
.MathJax {
font-size: 1.3em;
}
.rendered_html tr, .rendered_html th, .rendered_html td {
text-align: right !important;
}
a[data-snippet-code]::after {
background: #262931 !important;
}
.titre-pers{
font-family: arial;
font-size: 250% !important;
line-height: 200% !important;
text-align: center !important;
color: #4c8be2 !important;
}
.rendered_html h1,
.text_cell_render h1 {
color: #86bed9 !important;
line-height: 150% !important;
}
.rendered_html h2,
.text_cell_render h2 {
color: #b08c20 !important;
padding-left: .5rem !important;
line-height: 150% !important;
}
.rendered_html h3,
.text_cell_render h3 {
color: #3aa237 !important;
padding-left: 1rem !important;
line-height: 150% !important;
font-size: 120% !important;
}
.rendered_html h4,
.text_cell_render h4 {
color: #29858a !important;
padding-left: 2rem !important;
font-size: 110% !important;
}
.rendered_html h5,
.text_cell_render h5 {
color: #21417d !important;
padding-left: 2.5rem !important;
font-size: 110% !important;
}
.rendered_html h6,
.text_cell_render h6 {
color: #d8a802c2 !important;
padding-left: 1rem !important;
font-family: sans-serif !important;
font-size: 120% !important;
font-weight: normal !important;
font-style: normal !important;
}
.renf{
font-size: 18px !important;
font-family: Arial !important;
color: #14db9a !important;
}
.renf2{
font-size: 18px !important;
font-family: Arial !important;
color: orangered !important;
}
.alert{
padding: 5px 0 5px 15px;
border-radius: 5px;
margin-left: 10px;
width: auto;
}
.output_subarea.jupyter-widgets-view{
padding: 0 !important;
}
</style>
# importations de modules "local"
import sys
sys.path.append('functions/')
import fonctions_ocr
import acp_perso
import adjust_text
import fonctions_perso
# Load the banknote measurements from the CSV file into the working DataFrame.
data = pd.read_csv("data/inputs/notes.csv")
# Preview three randomly drawn rows.
data.sample(3)
is_genuine | diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|---|
2 | True | 171.83 | 103.76 | 103.76 | 4.40 | 2.88 | 113.84 |
114 | False | 172.10 | 104.22 | 103.99 | 5.26 | 3.24 | 111.94 |
98 | True | 172.10 | 103.98 | 103.86 | 4.47 | 3.06 | 113.00 |
# The six measurement columns, i.e. every column except the `is_genuine` flag.
var_actives = [
    'diagonal',
    'height_left',
    'height_right',
    'margin_low',
    'margin_up',
    'length',
]
# Keep only those columns in a separate frame and preview two random rows.
data_actives = data.loc[:, var_actives]
data_actives.sample(2)
diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|
39 | 171.13 | 104.28 | 103.14 | 4.16 | 2.92 | 113.00 |
160 | 172.50 | 104.07 | 103.71 | 3.82 | 3.63 | 110.74 |
# Exploratory analysis with the ProfileReport module.
# NOTE(review): the pandas_profiling package has since been renamed
# ydata-profiling -- confirm which one the target environment provides.
from pandas_profiling import ProfileReport
# Build the report from the full dataset and export it as an HTML file.
prof = ProfileReport(data)
prof.to_file(output_file='data/exports/rapport_ProfileReport.html')
# Report the dataset dimensions: rows are observations, columns are variables.
nb_observations, nb_variables = data.shape
print(
    "Le dataset comprend :",
    f" - {nb_observations} observations",
    f" - {nb_variables} variables",
    sep="\n",
)
Le dataset comprend : - 170 observations - 7 variables
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
# Print the index range, the per-column non-null counts and dtypes, and the memory usage.
data.info()
diagonal | height_left | height_right | margin_low | margin_up | length | |
---|---|---|---|---|---|---|
count | 170.000000 | 170.000000 | 170.000000 | 170.000000 | 170.000000 | 170.000000 |
mean | 171.940588 | 104.066353 | 103.928118 | 4.612118 | 3.170412 | 112.570412 |
std | 0.305768 | 0.298185 | 0.330980 | 0.702103 | 0.236361 | 0.924448 |
min | 171.040000 | 103.230000 | 103.140000 | 3.540000 | 2.270000 | 109.970000 |
25% | 171.730000 | 103.842500 | 103.690000 | 4.050000 | 3.012500 | 111.855000 |
50% | 171.945000 | 104.055000 | 103.950000 | 4.450000 | 3.170000 | 112.845000 |
75% | 172.137500 | 104.287500 | 104.170000 | 5.127500 | 3.330000 | 113.287500 |
max | 173.010000 | 104.860000 | 104.950000 | 6.280000 | 3.680000 | 113.980000 |
<class 'pandas.core.frame.DataFrame'> RangeIndex: 170 entries, 0 to 169 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 is_genuine 170 non-null bool 1 diagonal 170 non-null float64 2 height_left 170 non-null float64 3 height_right 170 non-null float64 4 margin_low 170 non-null float64 5 margin_up 170 non-null float64 6 length 170 non-null float64 dtypes: bool(1), float64(6) memory usage: 8.3 KB
# Show the dtype of each column.
data.dtypes
is_genuine bool diagonal float64 height_left float64 height_right float64 margin_low float64 margin_up float64 length float64 dtype: object
Types des variables : `is_genuine` est un booléen ; les 6 autres variables sont de type float64.
# Count and share (in %) of genuine and counterfeit notes.
# `is_genuine` is a boolean column, so summing the flags (or their negation)
# counts the matching rows directly; there is no need to filter the frame
# with `== True` / `== False` first.
nb_true = data["is_genuine"].sum()
part_true = nb_true / data.shape[0] * 100
nb_false = (~data["is_genuine"]).sum()
part_false = nb_false / data.shape[0] * 100

print("Vrais billets :")
print(f" - effectif : {nb_true}")
print(f" - proportion : {part_true:.2f}%")
print()
print("Faux billets :")
print(f" - effectif : {nb_false}")
print(f" - proportion : {part_false:.2f}%")
Vrais billets : - effectif : 100 - proportion : 58.82% Faux billets : - effectif : 70 - proportion : 41.18%
# Column-wise mean; the boolean `is_genuine` column is averaged as 0/1,
# so its mean is the share of genuine notes.
data.mean()
is_genuine 0.588235 diagonal 171.940588 height_left 104.066353 height_right 103.928118 margin_low 4.612118 margin_up 3.170412 length 112.570412 dtype: float64
On peut schématiser les 6 longueurs de la façon suivante :
# Number of missing values in each column.
data.isna().sum()
# Number of rows that duplicate an earlier row.
data.duplicated().sum()
is_genuine 0 diagonal 0 height_left 0 height_right 0 margin_low 0 margin_up 0 length 0 dtype: int64
0
Absence de doublons et de valeurs manquantes
# Overall distribution of each active variable: one kernel-density panel per
# variable on a 2 x 3 grid, saved as a PNG and then displayed.
color = sns.color_palette("Set2")  # palette kept in `color`; not used in this cell
plt.figure(figsize=(12, 4))
plt.gcf().subplots_adjust(wspace=0.5)
for position, column in enumerate(var_actives, start=1):
    # Panels are numbered from 1, row by row.
    ax = plt.subplot(2, 3, position)
    sns.kdeplot(data=data, x=column, ax=ax)
plt.tight_layout()
plt.savefig(
    'data/exports/img_charts/1.distribution_globale_var_actives.png',
    dpi=300,
)
plt.show()
# Overall spread of each active variable: one box plot per variable on a
# 2 x 3 grid, saved as a PNG and then displayed.
plt.figure(figsize=(12, 4))
for position, column in enumerate(var_actives, start=1):
    # Panels are numbered from 1, row by row.
    ax = plt.subplot(2, 3, position)
    sns.boxplot(data=data, y=column, ax=ax)
plt.tight_layout()
plt.savefig(
    'data/exports/img_charts/2.repartition_globale_var_actives.png',
    dpi=300,
)
plt.show()
On cherche à différencier les vrais billets des faux billets
--> on peut donc créer 2 groupes (is_genuine = True et is_genuine = False) et étudier leur comportement
----> on recherche les variables pour lesquelles il existe des différences entre les 2 groupes car elles nous
permettront de caractériser le groupe constitué de faux billets
# Compare genuine and counterfeit notes variable by variable: one row per
# active variable, with a box plot on the left and the per-group density
# curves on the right. The figure is saved as a PNG and then displayed.
sns.set_theme(style="white", font_scale=0.8)
# Red for counterfeit notes (False), green for genuine notes (True).
my_pal = {val: "g" if val else "r" for val in data.is_genuine.unique()}
plt.figure(figsize=(10, 16))
# One grid row per variable, so the layout follows the length of var_actives.
n_rows = len(var_actives)
for row, col in enumerate(var_actives):
    # Left panel: distribution of the variable split by authenticity.
    plt.subplot(n_rows, 2, 2 * row + 1)
    sns.boxplot(x='is_genuine', y=col, data=data, palette=my_pal)

    # Right panel: both groups' densities drawn on the same axes.
    g = plt.subplot(n_rows, 2, 2 * row + 2)
    # Counterfeit first, genuine second: the legend labels below rely on
    # this drawing order.
    for genuine, curve_color in ((False, "r"), (True, "g")):
        group_values = data.loc[data["is_genuine"] == genuine, col].dropna()
        # `fill` replaces the deprecated `shade` argument, and the vector is
        # passed explicitly as `x` rather than positionally.
        sns.kdeplot(x=group_values, color=curve_color, fill=True, ax=g)
    g.set_xlabel(col)
    g.set_ylabel("Frequence")
    g.legend(["Faux Billet", "Vrai Billet"], loc="best")
plt.tight_layout()
plt.savefig(
    'data/exports/img_charts/3.repartition_differenciee_var_actives.png',
    dpi=300,
)
plt.show()