Seleksi Sampel
Mari kita buat beberapa sampel
import itertools
import statistics

import pandas as pd
import scipy.spatial.distance as spad
from IPython.display import HTML, display
from tabulate import tabulate


def table(df):
    """Show *df* as an HTML table in the notebook output.

    The data is converted with ``tabulate`` (HTML format, column names used
    as headers, index column hidden) and the resulting markup is handed to
    IPython's ``display``. Nothing is returned.
    """
    markup = tabulate(df, tablefmt='html', headers='keys', showindex=False)
    display(HTML(markup))
# Load only the user_id column and the four *_video columns, limited to the
# first 20 data rows of the file, then render the frame.
df = pd.read_csv(
    'outlier.csv',
    usecols=['user_id', 'pause_video', 'play_video', 'seek_video', 'stop_video'],
    nrows=20,
)
table(df)
user_id | pause_video | play_video | seek_video | stop_video |
---|---|---|---|---|
0 | 1 | 4 | 1 | 1 |
1 | 14 | 14 | 0 | 1 |
2 | 0 | 0 | 0 | 0 |
3 | 2 | 2 | 0 | 1 |
4 | 3 | 22 | 18 | 0 |
5 | 1 | 5 | 9 | 1 |
6 | 5 | 9 | 6 | 1 |
7 | 1 | 18 | 16 | 0 |
8 | 7 | 9 | 2 | 1 |
9 | 1 | 1 | 0 | 0 |
10 | 32 | 33 | 1 | 1 |
11 | 0 | 1 | 0 | 1 |
12 | 0 | 0 | 0 | 0 |
13 | 18 | 23 | 13 | 6 |
14 | 0 | 0 | 1 | 0 |
15 | 0 | 6 | 10 | 1 |
16 | 0 | 0 | 0 | 0 |
17 | 10 | 16 | 4 | 3 |
18 | 1 | 2 | 0 | 1 |
19 | 1 | 106 | 1 | 1 |
Outlier Detection
Outlier adalah sampel janggal yang keluar dari kerumunan. Mereka membuat integritas data tidak sehat.
Suatu sampel A dapat dikatakan sebagai outlier dalam data (D), jika $$ \left(\sum^n_{i=1}\left[\operatorname{dist}(A, D_i) > r\right]\right) > \pi{n} $$
di mana $r$ adalah batas normal jarak dan $\pi$ adalah rasio toleransi (antara 0 dan 1). Baik $r$ maupun $\pi$ dapat diatur secara empiris untuk mendapatkan data yang ideal.
# Tunable parameters: r is the distance radius, pi the required neighbour ratio.
r = 20
pi = 0.5
d = df.values


def is_outlier(i):
    """Return True when row *i* of ``d`` has too few neighbours within ``r``.

    Row *i* is compared with every row of ``d`` using the Euclidean distance
    over all columns except the first. Rows other than *i* itself whose
    distance is at most ``r`` are counted; as soon as that count reaches
    ``pi * len(d)`` the row is considered normal and False is returned.
    Reads the module-level ``d``, ``r`` and ``pi`` at call time.
    """
    threshold = pi * len(d)
    neighbours = 0
    for j, other in enumerate(d):
        distance = spad.euclidean(d[i, 1:], other[1:])
        if j != i and distance <= r:
            neighbours += 1
        # Stop early: enough close neighbours means the row is not an outlier.
        if neighbours >= threshold:
            return False
    return True


print("Deteksi outlier dengan r =", r, 'dan pi =', pi)
# Append a 'Y' / '-' flag to every original row and show the result.
table(pd.DataFrame(
    [[*row, 'Y' if is_outlier(i) else '-'] for i, row in enumerate(d)],
    columns=[*df.columns, "Outliers?"],
))
Deteksi outlier dengan r = 20 dan pi = 0.5
user_id | pause_video | play_video | seek_video | stop_video | Outliers? |
---|---|---|---|---|---|
0 | 1 | 4 | 1 | 1 | - |
1 | 14 | 14 | 0 | 1 | - |
2 | 0 | 0 | 0 | 0 | - |
3 | 2 | 2 | 0 | 1 | - |
4 | 3 | 22 | 18 | 0 | Y |
5 | 1 | 5 | 9 | 1 | - |
6 | 5 | 9 | 6 | 1 | - |
7 | 1 | 18 | 16 | 0 | Y |
8 | 7 | 9 | 2 | 1 | - |
9 | 1 | 1 | 0 | 0 | - |
10 | 32 | 33 | 1 | 1 | Y |
11 | 0 | 1 | 0 | 1 | - |
12 | 0 | 0 | 0 | 0 | - |
13 | 18 | 23 | 13 | 6 | Y |
14 | 0 | 0 | 1 | 0 | - |
15 | 0 | 6 | 10 | 1 | - |
16 | 0 | 0 | 0 | 0 | - |
17 | 10 | 16 | 4 | 3 | - |
18 | 1 | 2 | 0 | 1 | - |
19 | 1 | 106 | 1 | 1 | Y |
Outlier Detection 2
Cara deteksi kedua (lebih efisien) adalah menghitung jarak dari mean setiap fitur (c), sehingga sampel A akan menjadi outlier jika
$$ \left(\sum_{c}\frac{\left(A_c-\overline{c}\right)^2}{\overline{c}}\right) > r $$
# Outliers 2
# Per-feature means; the first column is skipped before averaging because it
# is never used in the score below.
avgs = [df[x].mean() for x in df.columns[1:]]
r = 50
d = df.values


def get_is_outlier(i):
    """Score row *i* of ``d`` against the feature means and flag it.

    The score is the sum over features of ``(value - mean) ** 2 / mean``,
    using every column except the first. Features whose mean is exactly 0
    are skipped: dividing by a zero mean would yield nan/inf and make the
    score of every row meaningless.

    Returns a ``(dist, flag)`` tuple where ``dist`` is the score formatted
    with two decimals and ``flag`` is ``'Y'`` when the score exceeds the
    module-level ``r``, otherwise ``'-'``.
    """
    dist = sum(
        (c - avg) ** 2 / avg
        for c, avg in zip(d[i, 1:], avgs)
        if avg != 0
    )
    return '{:.2f}'.format(dist), 'Y' if dist > r else '-'


print("Deteksi outlier dengan r =", r)
# Append the formatted score and the 'Y' / '-' flag to every original row.
table(pd.DataFrame(
    [[*row, *get_is_outlier(i)] for i, row in enumerate(d)],
    columns=[*df.columns, "Dist", "Outliers?"],
))
Deteksi outlier dengan r = 50
user_id | pause_video | play_video | seek_video | stop_video | Dist | Outliers? |
---|---|---|---|---|---|---|
0 | 1 | 4 | 1 | 1 | 12.13 | - |
1 | 14 | 14 | 0 | 1 | 21.38 | - |
2 | 0 | 0 | 0 | 0 | 23.5 | - |
3 | 2 | 2 | 0 | 1 | 15.62 | - |
4 | 3 | 22 | 18 | 0 | 54.1 | Y |
5 | 1 | 5 | 9 | 1 | 14.31 | - |
6 | 5 | 9 | 6 | 1 | 2.41 | - |
7 | 1 | 18 | 16 | 0 | 40.06 | - |
8 | 7 | 9 | 2 | 1 | 3.56 | - |
9 | 1 | 1 | 0 | 0 | 19.78 | - |
10 | 32 | 33 | 1 | 1 | 182.25 | Y |
11 | 0 | 1 | 0 | 1 | 20.57 | - |
12 | 0 | 0 | 0 | 0 | 23.5 | - |
13 | 18 | 23 | 13 | 6 | 86.56 | Y |
14 | 0 | 0 | 1 | 0 | 21.74 | - |
15 | 0 | 6 | 10 | 1 | 17.55 | - |
16 | 0 | 0 | 0 | 0 | 23.5 | - |
17 | 10 | 16 | 4 | 3 | 9.91 | - |
18 | 1 | 2 | 0 | 1 | 17 | - |
19 | 1 | 106 | 1 | 1 | 636.18 | Y |
Handling Missing Values with KNN
KNN (K-Nearest Neighbors)
from numpy import nan
from sklearn.impute import KNNImputer

# Copy the data into a list of lists and blank out two cells so there is
# something to impute (row 6 / column 2 and row 9 / column 3).
dm = df.values.tolist()
dm[6][2] = nan
dm[9][3] = nan
dfm = pd.DataFrame(data=dm, columns=df.columns)
print("Before")
table(dfm)

# Fill the missing cells with a 5-neighbour KNN imputer.
# NOTE(review): the matrix passed to the imputer still contains the first
# column of df, so it takes part in the neighbour search - confirm that this
# is intended.
imputer = KNNImputer(n_neighbors=5)
dm = imputer.fit_transform(dm)
dfm = pd.DataFrame(data=dm, columns=df.columns)
print("After")
table(dfm)
Before
user_id | pause_video | play_video | seek_video | stop_video |
---|---|---|---|---|
0 | 1 | 4 | 1 | 1 |
1 | 14 | 14 | 0 | 1 |
2 | 0 | 0 | 0 | 0 |
3 | 2 | 2 | 0 | 1 |
4 | 3 | 22 | 18 | 0 |
5 | 1 | 5 | 9 | 1 |
6 | 5 | nan | 6 | 1 |
7 | 1 | 18 | 16 | 0 |
8 | 7 | 9 | 2 | 1 |
9 | 1 | 1 | nan | 0 |
10 | 32 | 33 | 1 | 1 |
11 | 0 | 1 | 0 | 1 |
12 | 0 | 0 | 0 | 0 |
13 | 18 | 23 | 13 | 6 |
14 | 0 | 0 | 1 | 0 |
15 | 0 | 6 | 10 | 1 |
16 | 0 | 0 | 0 | 0 |
17 | 10 | 16 | 4 | 3 |
18 | 1 | 2 | 0 | 1 |
19 | 1 | 106 | 1 | 1 |
After
user_id | pause_video | play_video | seek_video | stop_video |
---|---|---|---|---|
0 | 1 | 4 | 1 | 1 |
1 | 14 | 14 | 0 | 1 |
2 | 0 | 0 | 0 | 0 |
3 | 2 | 2 | 0 | 1 |
4 | 3 | 22 | 18 | 0 |
5 | 1 | 5 | 9 | 1 |
6 | 5 | 4.2 | 6 | 1 |
7 | 1 | 18 | 16 | 0 |
8 | 7 | 9 | 2 | 1 |
9 | 1 | 1 | 3.2 | 0 |
10 | 32 | 33 | 1 | 1 |
11 | 0 | 1 | 0 | 1 |
12 | 0 | 0 | 0 | 0 |
13 | 18 | 23 | 13 | 6 |
14 | 0 | 0 | 1 | 0 |
15 | 0 | 6 | 10 | 1 |
16 | 0 | 0 | 0 | 0 |
17 | 10 | 16 | 4 | 3 |
18 | 1 | 2 | 0 | 1 |
19 | 1 | 106 | 1 | 1 |