import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline
from collections import Counter
from statsmodels.tsa.stattools import adfuller # adf test
from statsmodels.tsa.stattools import kpss #kpss test
from statsmodels.tsa.stattools import acf, pacf #acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf #plot acf, plot pacf
import plotly.express as px #plot
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA #ARIMA
import tensorflow as tf
import torch
from transformers import pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


#Remove warning
import warnings
warnings.filterwarnings('ignore')


# Load the raw dataset: a wide table with a 'Page' column followed by one column per date.
train = pd.read_csv('train_2.csv') 
print("Shape des données:",train.shape)

Shape des données: (145063, 804)


# Peek at the first two rows (rendered only when run as a notebook cell).
train.head(2)


# Count the missing values of every column once, then print the per-column
# counts followed by the grand total.
na_per_column = train.isna().sum()
print("Nombre des données NA par jour:\n", na_per_column, '\n')
print("Nombre des données NA au total:", na_per_column.sum())

Nombre des données NA par jour:
 Page              0
2015-07-01    20740
2015-07-02    20816
2015-07-03    20544
2015-07-04    20654
              ...  
2017-09-06     1775
2017-09-07     3467
2017-09-08     1061
2017-09-09     3332
2017-09-10     5578
Length: 804, dtype: int64 

Nombre des données NA au total: 7027348


# Missing values for each daily column. 'Page' holds the page identifier, not
# a day, so it is left out of the per-day curve (otherwise it shows up as a
# spurious first "day").
na_per_day = train.drop(columns='Page').isna().sum()
jours = list(range(len(na_per_day)))

fig = plt.figure(1, figsize=[10, 10])
plt.ylabel('NaN par jours')
plt.xlabel('Jour')
plt.title('Nombre de NaN par jour')

plt.plot(jours, na_per_day)
plt.show()


# Share of missing cells over the whole table (rows x columns), in percent.
total_na = train.isna().sum().sum()
total_cells = len(train) * len(train.columns)
print("Pourcentage des données NA:", round(total_na * 100 / total_cells, 2), "%.")

Pourcentage des données NA: 6.03 %.


# Keep only the rows that have no missing value in any column.
train =train.dropna()


# Rows whose page name contains "wikimedia" (displayed when run as a notebook cell).
train[train['Page'].str.contains("wikimedia")]


# Rows whose page name contains "mediawiki".
train[train['Page'].str.contains("mediawiki")]


# Split every page name on "_" and keep its last three fields: project domain,
# access type and agent type. The frame reuses train's index on purpose:
# `train` lost rows in dropna(), so a default 0..n-1 index would not line up
# with it and the column assignments below would be aligned on the wrong rows.
page_acc_agent = pd.DataFrame(
    [page.split("_")[-3:] for page in train["Page"]],
    columns=["Media type", "Access", "Agent"],
    index=train.index,
)

# Collapse the project domain to the name of its family.
for family in ("wikipedia", "wikimedia", "mediawiki"):
    is_family = page_acc_agent["Media type"].str.contains(family)
    page_acc_agent.loc[is_family, "Media type"] = family
page_acc_agent


# Add the parsed fields to the dataset
train['Access'] = page_acc_agent['Access']
train['Agent'] = page_acc_agent['Agent']
train['Media type'] = page_acc_agent['Media type']
train.head(2)


# Number of pages per access type, ordered by access name. Names and counts
# come from the same Series, so every bar is labelled with its own category
# and no positional lookup on a label-indexed Series is needed.
access_counts = train["Access"].value_counts().sort_index()
access_names = access_counts.index.tolist()
count = access_counts.tolist()

# Bar chart of the page count for each access type
fig = px.bar(x=access_names, y=count, title="Traffic par rapport de type d'access", labels={'x': 'Type access', 'y':'Count'},color=access_names)
fig.show()


# Number of pages per agent type, ordered by agent name. Names and counts
# come from the same Series so they cannot get out of step.
agent_counts = train["Agent"].value_counts().sort_index()
agent_names = agent_counts.index.tolist()
count = agent_counts.tolist()

# Bar chart of the page count for each agent type
fig = px.bar(x=agent_names, y=count, title="Traffic par rapport de type d'agent", labels={'x': 'Type agent', 'y':'Count'},color=agent_names)
fig.show()


media_types_names = ["wikipedia", "wikimedia", "mediawiki"]
# Look the counts up by name. Sorting the counts alphabetically yields
# mediawiki, wikimedia, wikipedia, which does not match the display order
# above, so taking them by position would attach counts to the wrong bars.
count = (
    train["Media type"]
    .value_counts()
    .reindex(media_types_names, fill_value=0)
    .tolist()
)

# Bar chart of the page count for each media type
fig = px.bar(x=media_types_names, y=count, title="Traffic par rapport au type de media", labels={'x': 'Media type', 'y':'Count'},color=media_types_names)
fig.show()


def trouver_language(page):
    """Return the two-letter language code found in a page name.

    The code is the two lowercase letters directly in front of
    ".wikipedia.org". The dots are matched literally, so a string such as
    "abcwikipediaXorg" is not mistaken for a Wikipedia domain.

    Args:
        page: page name, e.g. "2NE1_zh.wikipedia.org_all-access_spider".

    Returns:
        The two-letter code, or 'na' when no such domain is present.
    """
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res[0][0:2]
    return 'na'

# Tag every page with its language code, then count the pages per code (sorted by code).
train['Language'] = train.Page.map(trouver_language)
train["Language"].value_counts().sort_index()

de    16145
en    16680
es    12773
fr    15760
ja    18379
na     8756
ru    13432
zh    13159
Name: Language, dtype: int64


# Number of pages per language code, ordered by code. Codes and counts come
# from the same Series, so every bar is labelled with its own code and no
# positional lookup on a label-indexed Series is needed.
language_counts = train["Language"].value_counts().sort_index()
lang = language_counts.index.tolist()
count = language_counts.tolist()

# Bar chart of the page count for each language
fig = px.bar(x=lang, y=count, title="Le nombre de pages par rapport de langues utilisés ", 
             labels={'x': 'Language', 'y':'Count'},color=lang)
fig.show()


### Build one dataset per language
# Each subset drops the last column of `train` (the 'Language' column that was
# appended most recently).
lang_data = {}
for code in ('en', 'na', 'fr', 'zh', 'ru', 'de', 'es', 'ja'):
    lang_data[code] = train[train.Language == code].iloc[:, 0:-1]

# Average views per page for every day: the first column ('Page') and the three
# trailing columns are skipped (presumably 'Access', 'Agent' and 'Media type';
# this relies on the column order of `train`).
sums = {}
for key in lang_data:
    sums[key] = lang_data[key].iloc[:,1:-3].sum(axis=0) / lang_data[key].shape[0]

### Plot the average number of views per page for each language

jours = list(range(sums["en"].shape[0]))

fig = plt.figure(1,figsize=[10,10])
plt.ylabel('Views par pages')
plt.xlabel('Jour')
plt.title('Pages sur differents languages')
# 'na' is the code trouver_language returns when a page has no
# "xx.wikipedia.org" domain, so it is not a language and is labelled as such.
labels={'en':'English','ja':'Japanese','de':'German',
        'na':'Media','fr':'French','zh':'Chinese',
        'ru':'Russian','es':'Spanish',
       }

for key in sums:
    plt.plot(jours,sums[key],label = labels[key] )

plt.legend()
plt.show()


# This is how the dataset of English articles labelled with their topic was built.
# Do not run this code: it takes a very long time to execute.
train_en = lang_data["en"]
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

#Obtenir le titre des articles 
def get_title(page):
    """Return the part of `page` that precedes the last "_en." marker.

    Raises:
        ValueError: if `page` does not contain "_en.".
    """
    marker_start = page.rindex("_en.")
    return page[:marker_start]

articles_name = train_en.Page.map(get_title)
candidate_labels = ["Mathematics and abstractions", "Art and culture", "Geography and places", "Health and fitness", "History and events", "Natural sciences and nature", 
                    "People and self", "Philosophy and thinking", "Religion and spirituality", "Social sciences and society", "Technology and applied sciences"]
category = []
for name in articles_name:
    # Run the (slow) model once per title and read both fields from that
    # single result.
    result = classifier(name, candidate_labels)
    scores = result['scores']
    best = scores.index(max(scores))
    category.append(result['labels'][best])

# Keep exactly the rows that received a category. This holds whether the loop
# ran to the end or was stopped early, and the explicit copy avoids assigning
# a column onto a slice of train_en.
train_category = train_en.iloc[:len(category)].copy()
train_category['category'] = category
train_category.to_csv("train_category.csv")


# Load the categorised English articles from 'train_category.csv'.
topics = pd.read_csv('train_category.csv') 

print("Shape des données:",topics.shape)
def get_title(page):
    """Return the part of `page` that precedes the last "_en." marker.

    Raises:
        ValueError: if `page` does not contain "_en.".
    """
    marker_start = page.rindex("_en.")
    return page[:marker_start]

# Replace the full page name with the bare article title, then save the result.
topics_dict = {'en': topics}
articles_name = topics['Page'].map(get_title)

topics['Page'] = articles_name
topics['Page']

topics[['Page', 'category']].tail(5)
topics.to_csv("Wikipedia_topics.csv")

Shape des données: (1705, 806)


topics[topics['category']=='Mathematics and abstractions'][['Page', 'category']]


# One subset per topic, without the last column. The order below is the
# insertion order of the dictionary and therefore the plotting order.
_topic_order = [
    'Mathematics and abstractions',
    'Geography and places',
    'Health and fitness',
    'History and events',
    'Natural sciences and nature',
    'People and self',
    'Philosophy and thinking',
    'Social sciences and society',
    'Religion and spirituality',
    'Technology and applied sciences',
    'Art and culture',
]
topic_sets = {
    topic: topics[topics['category'] == topic].iloc[:, 0:-1]
    for topic in _topic_order
}


# Column-wise sum from the third column on, divided by the number of rows of the subset.
sums = {
    topic: subset.iloc[:, 2:].sum(axis=0) / subset.shape[0]
    for topic, subset in topic_sets.items()
}

jours = list(range(sums["Mathematics and abstractions"].shape[0]))


fig = plt.figure(1, figsize=[10, 10])
plt.ylabel('Views par pages')
plt.xlabel('Jour')
plt.title('Pages en anglais sur differents sujets')
# Short legend label for every topic
labels = {
    "Mathematics and abstractions": "Maths",
    "Art and culture": "Arts",
    "Geography and places": "Geo",
    "Health and fitness": "Health",
    "History and events": "Hist",
    "Natural sciences and nature": "Sciences",
    "People and self": "Psy",
    "Philosophy and thinking": "Philo",
    "Religion and spirituality": "Religion",
    "Social sciences and society": "Society",
    "Technology and applied sciences": "Techno",
}

for key, curve in sums.items():
    plt.plot(jours, curve, label=labels[key])

plt.legend()
plt.show()


# One separate figure per selected topic: mean views per page over time.
for _topic, _legend in (
    ('Religion and spirituality', 'Religion'),
    ('Technology and applied sciences', 'Techno'),
    ('People and self', 'People'),
):
    _subset = topic_sets[_topic]
    plt.plot(jours, _subset.iloc[:, 2:].sum(axis=0) / _subset.shape[0], label=_legend)
    plt.legend()
    plt.show()


# Views per page for each language and each day, from 2015-07-01 to 2017-09-10

total_view = {
    code: subset.iloc[:, 1:-3].sum(axis=0) / len(subset)
    for code, subset in lang_data.items()
}

data_en = total_view['en']  # English pages
data_de = total_view['de']  # German pages
data_fr = total_view['fr']  # French pages
data_ch = total_view['zh']  # Chinese pages


def adf_test(x, name):
    """Run adfuller (autolag='AIC') on each series and tabulate the results.

    Args:
        x: sequence of series to test.
        name: sequence of labels, one per series, stored in the 'Name' column.

    Returns:
        DataFrame with one row per series: test statistic, p-value, lags used,
        number of observations used, the 1%/5%/10% critical values and the name.
    """
    columns = ['Test Statistic','p-value','#Lags Used','Number of Observations Used','Critical Value (1%)', 'Critical Value (5%)','Critical Value (10%)','Name']
    dfoutput = pd.DataFrame(columns=columns)
    for row, series in enumerate(x):
        statistic, pvalue, used_lag, n_obs, critical, *_ = adfuller(series, autolag='AIC')
        dfoutput.loc[row] = [
            statistic, pvalue, used_lag, n_obs,
            critical['1%'], critical['5%'], critical['10%'],
            name[row],
        ]
    return dfoutput


# ADF test on the four per-language series (the table is displayed when run as a notebook cell).
adf_test([data_en, data_de, data_fr, data_ch],['anglais','allemand','français','chinois'])


def test_stationarity(x, mean_window=22, std_window=12):
    """Plot a series with its rolling mean and rolling standard deviation.

    Args:
        x: pandas Series to inspect.
        mean_window: number of observations in the rolling-mean window.
        std_window: number of observations in the rolling-std window.

    The defaults keep the window sizes this function has always used.
    NOTE(review): the two windows differ (22 vs 12); confirm this is intended.
    """
    # Rolling statistics
    rolmean = x.rolling(window=mean_window, center=False).mean()
    rolstd = x.rolling(window=std_window, center=False).std()

    # Plot the series and both rolling statistics on the same axes
    plt.plot(x.values, color='blue', label='Original')
    plt.plot(rolmean.values, color='red', label='Rolling Mean')
    plt.plot(rolstd.values, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
# Rolling-statistics plot for each of the four language series, in turn.
for _series in (data_en, data_de, data_fr, data_ch):
    test_stationarity(_series)


def kpss_test(x, name):
    """Run kpss (regression='c', nlags="auto") on each series and tabulate the results.

    Args:
        x: sequence of series to test.
        name: sequence of labels, one per series, stored in the 'Name' column.

    Returns:
        DataFrame with one row per series: test statistic, p-value, lags used,
        the 1%/5%/10% critical values and the name.
    """
    columns = ['Test Statistic','p-value','#Lags Used','Critical Value (1%)', 'Critical Value (5%)','Critical Value (10%)','Name']
    kpss_output = pd.DataFrame(columns=columns)
    for row, series in enumerate(x):
        statistic, pvalue, used_lag, critical, *_ = kpss(series, regression='c', nlags="auto")
        kpss_output.loc[row] = [
            statistic, pvalue, used_lag,
            critical['1%'], critical['5%'], critical['10%'],
            name[row],
        ]
    return kpss_output


# KPSS test on the four per-language series (the table is displayed when run as a notebook cell).
kpss_test([data_en, data_de, data_fr, data_ch],['anglais','allemand','français','chinois'])


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# For each language: draw the series, its first difference and its second
# difference, each next to the matching autocorrelation plot, then run the ADF
# and KPSS tests on the first difference.
# Each entry holds: the series, the name passed to the tests, the three subplot
# titles, and whether plt.show() follows every figure (True) or only the third.
_acf_sections = (
    (data_en, 'anglais', ('Original', '1ere differenciation', '2eme Differenciation'), True),
    (data_de, "Allemand", ('Original', '1ere Differentiation', '2eme Differentiation'), False),
    (data_fr, 'Français', ('Original', '1ere Differentiation', '2eme Differentiation'), False),
    (data_ch, "Chinois", ('Original', '1ere Differentiation', '2eme Differentiation'), False),
)

for _series, _name, _titles, _show_each in _acf_sections:
    _first_diff = _series.diff()
    _stages = (_series, _first_diff, _first_diff.diff())

    for _step, (_stage, _title) in enumerate(zip(_stages, _titles)):
        fig, (ax1, ax2) = plt.subplots(1, 2)
        ax1.plot(_stage)
        ax1.set_title(_title)
        # The undifferenced series goes to plot_acf as is; differenced series
        # start with NaN values, which are dropped first.
        plot_acf(_stage if _step == 0 else _stage.dropna(), ax=ax2)
        if _show_each or _step == 2:
            plt.show()

    # ADF test, then KPSS test, on the first difference
    adf_test([_first_diff.dropna()], [_name])
    kpss_test([_first_diff.dropna()], [_name])


# For each language, show the first difference twice: once next to its partial
# autocorrelation and once next to its autocorrelation. Both figures plot the
# first difference, so both carry the first-difference title.
for _series in (data_en, data_de, data_fr, data_ch):
    _first_diff = _series.diff()

    for _correlation_plot in (plot_pacf, plot_acf):
        fig, (ax1, ax2) = plt.subplots(1, 2)
        ax1.plot(_first_diff)
        ax1.set_title('1ere Differentiation')
        _correlation_plot(_first_diff.dropna(), ax=ax2)
        plt.show()


# Fit an ARIMA(2,1,2) model on each language series and overlay its
# predictions on the data. After the loop, `model`, `results` and `forecast`
# refer to the last series (Chinese).
for _series in (data_en, data_de, data_fr, data_ch):
    # 2,1,2 ARIMA Model
    model = ARIMA(_series, order=(2,1,2))
    results = model.fit()

    # Predictions for positions 1..n_obs (the end position is inclusive)
    n_obs = len(_series)
    forecast = results.predict(1, n_obs, typ='levels')

    # Plot
    plt.figure(figsize=(12,5), dpi=100)
    plt.plot(np.arange(0, n_obs), _series, label='data')
    # Draw each prediction at the position it refers to (1..n_obs) rather
    # than one step to the left of it.
    plt.plot(np.arange(1, n_obs + 1), forecast.values, label='forecast')
    plt.xlabel('Jours')
    plt.ylabel('Views')
    plt.legend(loc='upper left', fontsize=8)
    plt.show()


# Chronological 80/20 split of the English series.
# NOTE: this rebinds `train`, which previously held the raw dataset.
train_ind = int(len(data_en)*0.8)
train = data_en[:train_ind]
test = data_en[train_ind:]
n_total = len(data_en)


model = ARIMA(train, order=(2, 1, 2))
results = model.fit()


# Predictions for positions 1..n_total: inside the training span first, then
# beyond it.
fc = results.predict(1, n_total, typ='levels')
# 95% confidence interval of the out-of-sample forecast over the test span;
# the level is an argument of conf_int().
forecast_val = results.get_forecast(steps=len(test))
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]


test_positions = np.arange(train_ind, n_total)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, train_ind), train, label='training')
plt.plot(np.arange(1, n_total + 1), fc.values, label='forecast')
plt.plot(test_positions, test.values, label='actual')
plt.fill_between(test_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.legend(loc='upper left', fontsize=8)
plt.show()


#Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Return the MAPE and the min-max error between a forecast and the actual values.

    Both inputs are converted to float arrays first, so lists, NumPy arrays and
    pandas Series are all accepted and are compared position by position
    (a Series' index labels are ignored).

    Args:
        forecast: predicted values.
        actual: observed values, same length as `forecast`.

    Returns:
        dict with 'mape' (mean of |forecast - actual| / |actual|) and
        'minmax' (1 - mean of elementwise min / elementwise max).
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    mape = np.mean(np.abs(forecast - actual) / np.abs(actual))  # MAPE
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)  # min-max error
    return {'mape': mape, 'minmax': minmax}

# Score the model on the held-out test span only: out-of-sample forecasts from
# the model fitted on `train`, compared position by position with `test`.
accuracy_metrics(np.asarray(results.forecast(steps=len(test))), test.values)

{'mape': 0.04149801549499804, 'minmax': 0.03957430362702874}


# Forecast for the next 50 days

# Model fitted on the full English series
model = ARIMA(data_en, order=(2, 1, 2))
results = model.fit()

# Forecast: the point forecast and its interval come from the same result, so
# both cover exactly n_periods steps.
n_periods = 50
n_obs = len(data_en)
forecast_val = results.get_forecast(steps=n_periods)
fc = forecast_val.predicted_mean

# 95% confidence interval
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]

# Plot
future_positions = np.arange(n_obs, n_obs + n_periods)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, n_obs), data_en.values)
plt.plot(future_positions, np.asarray(fc), color='darkgreen')
plt.fill_between(future_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Jours')
plt.ylabel('Views')
plt.title("Prevision le traffic sur les page Wikipedia en Anglais dans 50 jours")
plt.show()


# Chronological 80/20 split of the German series.
# NOTE: this rebinds `train`.
train_ind = int(len(data_de)*0.8)
train = data_de[:train_ind]
test = data_de[train_ind:]
n_total = len(data_de)


model = ARIMA(train, order=(2, 1, 2))
results = model.fit()


# Predictions for positions 1..n_total: inside the training span first, then
# beyond it.
fc = results.predict(1, n_total, typ='levels')
# 95% confidence interval of the out-of-sample forecast over the test span;
# the level is an argument of conf_int().
forecast_val = results.get_forecast(steps=len(test))
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]


test_positions = np.arange(train_ind, n_total)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, train_ind), train, label='training')
plt.plot(np.arange(1, n_total + 1), fc.values, label='forecast')
plt.plot(test_positions, test.values, label='actual')
plt.fill_between(test_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.legend(loc='upper left', fontsize=8)
plt.show()


#Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Return the MAPE and the min-max error between a forecast and the actual values.

    Both inputs are converted to float arrays first, so lists, NumPy arrays and
    pandas Series are all accepted and are compared position by position
    (a Series' index labels are ignored).

    Args:
        forecast: predicted values.
        actual: observed values, same length as `forecast`.

    Returns:
        dict with 'mape' (mean of |forecast - actual| / |actual|) and
        'minmax' (1 - mean of elementwise min / elementwise max).
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    mape = np.mean(np.abs(forecast - actual) / np.abs(actual))  # MAPE
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)  # min-max error
    return {'mape': mape, 'minmax': minmax}

# Score the model on the held-out test span only: out-of-sample forecasts from
# the model fitted on `train`, compared position by position with `test`.
accuracy_metrics(np.asarray(results.forecast(steps=len(test))), test.values)

{'mape': 0.06030298097560872, 'minmax': 0.05329951894114826}


# Forecast for the next 50 days

# Model fitted on the full German series
model = ARIMA(data_de, order=(2, 1, 2))
results = model.fit()

# Forecast: the point forecast and its interval come from the same result, so
# both cover exactly n_periods steps.
n_periods = 50
n_obs = len(data_de)
forecast_val = results.get_forecast(steps=n_periods)
fc = forecast_val.predicted_mean

# 95% confidence interval
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]

# Plot
future_positions = np.arange(n_obs, n_obs + n_periods)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, n_obs), data_de.values)
plt.plot(future_positions, np.asarray(fc), color='darkgreen')
plt.fill_between(future_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Jours')
plt.ylabel('Views')
plt.title("Prevision le traffic sur les page Wikipedia en Allemagne dans 50 jours")
plt.show()


# Chronological 80/20 split of the French series.
# NOTE: this rebinds `train`.
train_ind = int(len(data_fr)*0.8)
train = data_fr[:train_ind]
test = data_fr[train_ind:]
n_total = len(data_fr)


model = ARIMA(train, order=(2, 1, 2))
results = model.fit()


# Predictions for positions 1..n_total: inside the training span first, then
# beyond it.
fc = results.predict(1, n_total, typ='levels')
# 95% confidence interval of the out-of-sample forecast over the test span;
# the level is an argument of conf_int().
forecast_val = results.get_forecast(steps=len(test))
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]


test_positions = np.arange(train_ind, n_total)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, train_ind), train, label='training')
plt.plot(np.arange(1, n_total + 1), fc.values, label='forecast')
plt.plot(test_positions, test.values, label='actual')
plt.fill_between(test_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.legend(loc='upper left', fontsize=8)
plt.show()


#Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Return the MAPE and the min-max error between a forecast and the actual values.

    Both inputs are converted to float arrays first, so lists, NumPy arrays and
    pandas Series are all accepted and are compared position by position
    (a Series' index labels are ignored).

    Args:
        forecast: predicted values.
        actual: observed values, same length as `forecast`.

    Returns:
        dict with 'mape' (mean of |forecast - actual| / |actual|) and
        'minmax' (1 - mean of elementwise min / elementwise max).
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    mape = np.mean(np.abs(forecast - actual) / np.abs(actual))  # MAPE
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)  # min-max error
    return {'mape': mape, 'minmax': minmax}

# Score the model on the held-out test span only: out-of-sample forecasts from
# the model fitted on `train`, compared position by position with `test`.
accuracy_metrics(np.asarray(results.forecast(steps=len(test))), test.values)

{'mape': 0.0648559431074301, 'minmax': 0.05381242234208283}


# Forecast for the next 50 days

# Model fitted on the full French series
model = ARIMA(data_fr, order=(2, 1,2))
results = model.fit()

# Forecast: the point forecast and its interval come from the same result, so
# both cover exactly n_periods steps.
n_periods = 50
n_obs = len(data_fr)
forecast_val = results.get_forecast(steps=n_periods)
fc = forecast_val.predicted_mean

# 95% confidence interval
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]

# Plot
future_positions = np.arange(n_obs, n_obs + n_periods)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, n_obs), data_fr.values)
plt.plot(future_positions, np.asarray(fc), color='darkgreen')
plt.fill_between(future_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.title("Prevision le traffic sur les page Wikipedia en Francais dans 50 jours")
plt.show()


# Chronological 80/20 split of the Chinese series.
# NOTE: this rebinds `train`.
train_ind = int(len(data_ch)*0.8)
train = data_ch[:train_ind]
test = data_ch[train_ind:]
n_total = len(data_ch)


model = ARIMA(train, order=(2, 1, 2))
results = model.fit()


# Predictions for positions 1..n_total: inside the training span first, then
# beyond it.
fc = results.predict(1, n_total, typ='levels')
# 95% confidence interval of the out-of-sample forecast over the test span;
# the level is an argument of conf_int().
forecast_val = results.get_forecast(steps=len(test))
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]


test_positions = np.arange(train_ind, n_total)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, train_ind), train, label='training')
plt.plot(np.arange(1, n_total + 1), fc.values, label='forecast')
plt.plot(test_positions, test.values, label='actual')
plt.fill_between(test_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.legend(loc='upper left', fontsize=8)
plt.show()


#Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Return the MAPE and the min-max error between a forecast and the actual values.

    Both inputs are converted to float arrays first, so lists, NumPy arrays and
    pandas Series are all accepted and are compared position by position
    (a Series' index labels are ignored).

    Args:
        forecast: predicted values.
        actual: observed values, same length as `forecast`.

    Returns:
        dict with 'mape' (mean of |forecast - actual| / |actual|) and
        'minmax' (1 - mean of elementwise min / elementwise max).
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    mape = np.mean(np.abs(forecast - actual) / np.abs(actual))  # MAPE
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)  # min-max error
    return {'mape': mape, 'minmax': minmax}

# Score the model on the held-out test span only: out-of-sample forecasts from
# the model fitted on `train`, compared position by position with `test`.
accuracy_metrics(np.asarray(results.forecast(steps=len(test))), test.values)

{'mape': 0.054614075437687516, 'minmax': 0.049827832184477616}


# Forecast for the next 50 days

# Model fitted on the full Chinese series
model = ARIMA(data_ch, order=(2, 1,2))
results = model.fit()

# Forecast: the point forecast and its interval come from the same result, so
# both cover exactly n_periods steps.
n_periods = 50
n_obs = len(data_ch)
forecast_val = results.get_forecast(steps=n_periods)
fc = forecast_val.predicted_mean

# 95% confidence interval
confidence_int = forecast_val.conf_int(alpha=0.05)

lower_series = confidence_int.iloc[:, 0]
upper_series = confidence_int.iloc[:, 1]

# Plot
future_positions = np.arange(n_obs, n_obs + n_periods)
plt.figure(figsize=(12,5), dpi=100)
plt.plot(np.arange(0, n_obs), data_ch.values)
plt.plot(future_positions, np.asarray(fc), color='darkgreen')
plt.fill_between(future_positions,
                 lower_series,
                 upper_series,
                 color='r', alpha=.15)
plt.xlabel('Days')
plt.ylabel('Views')
plt.title("Prevision le traffic sur les page Wikipedia en Chinois dans 50 jours")
plt.show()


# Convert index to datetime object
data_en.index = pd.to_datetime(data_en.index)

# Split the data into training and test sets (chronological 80/20)
train_size = int(len(data_en) * 0.8)
train_data = data_en.iloc[:train_size]
test_data = data_en.iloc[train_size:]


# The single feature is the number of days elapsed since the first training
# date. It is built straight from the index: train_data and test_data are
# Series, so storing the feature "as a column" would append one extra element
# to them instead.
start_date = train_data.index.min()


# Define the Ridge Regression model with cross-validation
alphas = [0.1, 1.0, 10.0]
ridge_cv = RidgeCV(alphas=alphas, cv=5)

# Train the model on the training data
X_train = np.asarray((train_data.index - start_date).days).reshape(-1, 1)
y_train = train_data.to_numpy(dtype=float).reshape(-1, 1)
ridge_cv.fit(X_train, y_train)

# Print the best alpha value
print(f"Best alpha: {ridge_cv.alpha_}")

# Make predictions on the test data using the best alpha value
X_test = np.asarray((test_data.index - start_date).days).reshape(-1, 1)
y_test = test_data.to_numpy(dtype=float).reshape(-1, 1)
y_pred = ridge_cv.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


# Print the evaluation metrics
print(f"Ridge Regression - MSE: {mse:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2 score: {r2:.2f}")

Best alpha: 0.1
Ridge Regression - MSE: 4081594.40, RMSE: 2020.30, MAE: 1952.68, R2 score: -22.54


# Plot the predicted values and the actual values. The series holds page views
# and the x axis holds the day offsets stored in X_test, so the labels say so.
plt.figure(figsize=(10, 6))
plt.plot(X_test, y_test, label='Actual')
plt.plot(X_test, y_pred, label='Predicted')
plt.legend()
plt.xlabel('Days since start')
plt.ylabel('Views')
plt.title('Ridge Regression - Predicted vs Actual Views')
plt.show()


# Convert index to datetime object
data_en.index = pd.to_datetime(data_en.index)

# Split the data into training and test sets (chronological 80/20)
train_size = int(len(data_en) * 0.8)
train_data = data_en.iloc[:train_size]
test_data = data_en.iloc[train_size:]


# The single feature is the number of days elapsed since the first training
# date. It is built straight from the index: train_data and test_data are
# Series, so storing the feature "as a column" would append one extra element
# to them instead.
start_date = train_data.index.min()


# Define the Lasso Regression model with cross-validation
alphas = [0.1, 1.0, 10.0]
lasso_cv = LassoCV(alphas=alphas, cv=5)

# Train the model on the training data (1-D target)
X_train = np.asarray((train_data.index - start_date).days).reshape(-1, 1)
y_train = train_data.to_numpy(dtype=float)
lasso_cv.fit(X_train, y_train)

# Print the best alpha value
print(f"Best alpha: {lasso_cv.alpha_}")

# Make predictions on the test data using the best alpha value
X_test = np.asarray((test_data.index - start_date).days).reshape(-1, 1)
y_test = test_data.to_numpy(dtype=float)
y_pred = lasso_cv.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics under the name of the model that produced them
print(f"Lasso Regression - MSE: {mse:.2f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2 score: {r2:.2f}")

Best alpha: 10.0
Ridge Regression - MSE: 4081126.51, RMSE: 2020.18, MAE: 1952.57, R2 score: -22.54


# First 80% of the English series as a one-column frame named 'y'; the rest is kept in `test`.
train_ind = int(len(data_en)*0.8)
test = data_en[train_ind:]
train = data_en[:train_ind].to_frame()
train.columns = ['y']

plt.figure(figsize=(15, 7))
plt.plot(train.y.values, label="actual", linewidth=2.0);


# Add the values observed 1 to 7 steps earlier as extra columns
for i in range(1, 8):
    train[f"lag_{i}"] = train.y.shift(i)
train.tail()


# Splitter with 3 splits, used later for cross-validation
tscv = TimeSeriesSplit(n_splits=3)


def timeseries_train_test_split(X, y, test_size):
    """Split X and y by position, keeping the order of the rows.

    The first int(len(X) * (1 - test_size)) rows form the training part and
    the remaining rows form the test part.

    Returns:
        X_train, X_test, y_train, y_test
    """
    cut = int(len(X)*(1-test_size))
    return X.iloc[:cut], X.iloc[cut:], y.iloc[:cut], y.iloc[cut:]

# Rows with a missing value (the first rows, which lack some lags) are dropped;
# 'y' is the target and the remaining columns are the features.
_complete_rows = train.dropna()
y = _complete_rows.y
X = _complete_rows.drop(['y'], axis=1)


# Keep the last 20% of the rows as the test set
X_train, X_test, y_train, y_test = timeseries_train_test_split(X, y, test_size=0.20)


print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(508, 7) (508,) (127, 7) (127,)


# Add calendar features to `train`: the day of the week and a 0/1 weekend flag.
# NOTE(review): X, X_train and X_test were built from `train` before these
# columns are added, so the scaled matrices below do not contain them.
train.index = pd.to_datetime(train.index)
train["Jour de la semaine"] = train.index.weekday
train['Weekend'] = train.index.weekday.isin([5,6])*1
train.tail(7)


# Standardise the features: the scaler is fitted on the training part only and
# then applied unchanged to the test part.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def plotModelResults(model, X_train=X_train, X_test=X_test, plot_intervals=False, plot_anomalies=False):
    """Plot a fitted model's test-set predictions against the actual values.

    The figure title reports the MAPE and SMAPE of the predictions. The
    function reads the notebook-level globals ``y_train``, ``y_test`` and
    ``tscv`` and calls the notebook's ``mean_absolute_percentage_error`` and
    ``smape`` helpers.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``; when ``plot_intervals`` is
        True it is also passed to ``cross_val_score``.
    X_train : features used for the cross-validation behind the error band
        (defaults to the global ``X_train`` bound when the function is defined).
    X_test : features to predict on (defaults to the global ``X_test`` bound
        when the function is defined).
    plot_intervals : if True, draw a band of ``prediction +/- (mae + 1.96 * std)``
        where ``mae`` and ``std`` come from the cross-validated MAE scores.
    plot_anomalies : if True (and ``plot_intervals`` is True), mark the actual
        values that fall outside the band.
    """
    prediction = model.predict(X_test)
    actual = y_test.values

    plt.figure(figsize=(15, 7))
    plt.plot(prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(actual, label="actual", linewidth=2.0)

    if plot_intervals:
        # Scores are negated MAE values, hence the sign flip on the mean.
        cv = cross_val_score(model, X_train, y_train,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
        mae = cv.mean() * (-1)
        deviation = cv.std()

        scale = 1.96
        margin = mae + scale * deviation
        lower = prediction - margin
        upper = prediction + margin

        plt.plot(lower, "r--", label="upper bond / lower bond", alpha=0.5)
        plt.plot(upper, "r--", alpha=0.5)

        if plot_anomalies:
            # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
            anomalies = np.full(len(actual), np.nan)
            below = actual < lower
            above = actual > upper
            anomalies[below] = actual[below]
            anomalies[above] = actual[above]
            plt.plot(anomalies, "o", markersize=10, label = "Anomalies")

    # Both helpers are declared as (y_true, y_pred). MAPE divides by its first
    # argument, so the actual values must come first; the previous call passed
    # the prediction first and normalised by the prediction instead.
    mape_error = mean_absolute_percentage_error(actual, prediction)
    smape_error = smape(actual, prediction)
    plt.title("MAPE: "+str(mape_error)+"\n"+"SMAPE: "+str(smape_error))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True);


# MAPE error function
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Each error ``y_true - y_pred`` is divided by ``y_true``; the absolute
    values of these ratios are averaged and multiplied by 100. The two
    arguments are therefore not interchangeable.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

# SMAPE error function
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (0 to 200 scale).

    Each absolute error is divided by ``|y_true| + |y_pred|``; positions where
    that sum is zero contribute 0 instead of a NaN. The mean of the ratios is
    multiplied by 200.
    """
    scale = np.abs(y_true) + np.abs(y_pred)
    ratio = np.abs(y_true - y_pred) / scale
    # 0 / 0 positions: both values are zero, count them as a perfect match.
    ratio[scale == 0] = 0.0
    return 200 * np.mean(ratio)


from xgboost import XGBRegressor 

# Gradient-boosted regressor with default settings, trained on the scaled
# training features.
xgb = XGBRegressor().fit(X_train_scaled, y_train)

# Test-set predictions vs. actual values, with the cross-validated error band.
plotModelResults(xgb, X_train_scaled, X_test_scaled, plot_intervals=True)


# Random forest with 1000 trees and a fixed seed, trained on the scaled
# training features.
rf = RandomForestRegressor(n_estimators=1000, random_state=42).fit(X_train_scaled, y_train)


# Test-set predictions vs. actual values, with the cross-validated error band.
plotModelResults(rf, X_train_scaled, X_test_scaled, plot_intervals=True)

	Page	2015-07-01	2015-07-02	2015-07-03	2015-07-04	2015-07-05	2015-07-06	2015-07-07	2015-07-08	2015-07-09	...	2017-09-01	2017-09-02	2017-09-03	2017-09-04	2017-09-05	2017-09-06	2017-09-07	2017-09-08	2017-09-09	2017-09-10
0	2NE1_zh.wikipedia.org_all-access_spider	18.0	11.0	5.0	13.0	14.0	9.0	9.0	22.0	26.0	...	19.0	33.0	33.0	18.0	16.0	27.0	29.0	23.0	54.0	38.0
1	2PM_zh.wikipedia.org_all-access_spider	11.0	14.0	15.0	18.0	11.0	13.0	22.0	11.0	10.0	...	32.0	30.0	11.0	19.0	54.0	25.0	26.0	23.0	13.0	81.0

	Page	2015-07-01	2015-07-02	2015-07-03	2015-07-04	2015-07-05	2015-07-06	2015-07-07	2015-07-08	2015-07-09	...	2017-09-01	2017-09-02	2017-09-03	2017-09-04	2017-09-05	2017-09-06	2017-09-07	2017-09-08	2017-09-09	2017-09-10
13332	Accueil_commons.wikimedia.org_all-access_spider	55.0	48.0	44.0	35.0	46.0	41.0	32.0	43.0	40.0	...	168.0	123.0	58.0	72.0	146.0	3232.0	5853.0	46.0	48.0	52.0
13333	Atlas_of_Asia_commons.wikimedia.org_all-access...	5.0	3.0	4.0	6.0	3.0	1.0	5.0	6.0	6.0	...	22.0	26.0	28.0	25.0	22.0	20.0	23.0	21.0	23.0	21.0
13334	Atlas_of_Europe_commons.wikimedia.org_all-acce...	4.0	6.0	9.0	7.0	6.0	4.0	5.0	6.0	13.0	...	31.0	30.0	27.0	21.0	33.0	40.0	39.0	21.0	33.0	35.0
13335	Atlas_of_World_War_II_commons.wikimedia.org_al...	5.0	2.0	6.0	6.0	8.0	11.0	5.0	5.0	6.0	...	9.0	15.0	9.0	11.0	11.0	7.0	13.0	5.0	9.0	13.0
13336	Atlas_of_colonialism_commons.wikimedia.org_all...	8.0	6.0	15.0	5.0	5.0	9.0	10.0	3.0	9.0	...	27.0	19.0	29.0	48.0	27.0	23.0	20.0	28.0	27.0	30.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
82908	Category:Uploaded_with_VicuñaUploader_commons....	13.0	13.0	14.0	15.0	9.0	11.0	13.0	7.0	13.0	...	399.0	264.0	168.0	140.0	44.0	45.0	101.0	79.0	44.0	131.0
82910	Početna_strana_commons.wikimedia.org_desktop_a...	69.0	86.0	78.0	63.0	58.0	65.0	97.0	95.0	77.0	...	137.0	118.0	121.0	125.0	153.0	1660.0	5512.0	144.0	113.0	122.0
82911	Commons:Portale_Comunità_commons.wikimedia.org...	71.0	61.0	60.0	56.0	50.0	60.0	72.0	73.0	57.0	...	55.0	43.0	32.0	49.0	81.0	62.0	58.0	63.0	52.0	52.0
82912	Commons:Média_du_jour_commons.wikimedia.org_de...	12.0	12.0	17.0	17.0	8.0	16.0	21.0	15.0	12.0	...	15.0	6.0	14.0	14.0	19.0	18.0	12.0	23.0	11.0	15.0
82914	Pàgina_principal_commons.wikimedia.org_desktop...	140.0	137.0	131.0	115.0	83.0	132.0	176.0	162.0	138.0	...	217.0	200.0	173.0	247.0	237.0	1733.0	5638.0	246.0	182.0	181.0

	Page	2015-07-01	2015-07-02	2015-07-03	2015-07-04	2015-07-05	2015-07-06	2015-07-07	2015-07-08	2015-07-09	...	2017-09-01	2017-09-02	2017-09-03	2017-09-04	2017-09-05	2017-09-06	2017-09-07	2017-09-08	2017-09-09	2017-09-10
19614	API_www.mediawiki.org_all-access_all-agents	528.0	587.0	634.0	483.0	510.0	559.0	625.0	594.0	565.0	...	458.0	325.0	330.0	462.0	529.0	439.0	459.0	439.0	302.0	274.0
19615	API:Account_creation_www.mediawiki.org_all-acc...	37.0	38.0	46.0	41.0	79.0	91.0	100.0	54.0	43.0	...	32.0	25.0	24.0	44.0	43.0	35.0	37.0	37.0	33.0	25.0
19616	API:Allimages_www.mediawiki.org_all-access_all...	19.0	31.0	19.0	10.0	32.0	26.0	29.0	23.0	21.0	...	6.0	11.0	10.0	14.0	9.0	12.0	15.0	13.0	14.0	4.0
19617	API:Allpages_www.mediawiki.org_all-access_all-...	32.0	29.0	14.0	15.0	23.0	41.0	39.0	27.0	35.0	...	15.0	12.0	7.0	19.0	22.0	21.0	19.0	20.0	10.0	11.0
19618	API:Backlinks_www.mediawiki.org_all-access_all...	29.0	16.0	15.0	19.0	18.0	20.0	19.0	21.0	23.0	...	13.0	12.0	14.0	19.0	14.0	22.0	13.0	10.0	8.0	10.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
84707	Wikimedia_Apps/Team_www.mediawiki.org_all-acce...	4.0	4.0	3.0	3.0	9.0	8.0	3.0	1.0	1.0	...	2.0	3.0	1.0	3.0	2.0	4.0	8.0	4.0	8.0	1.0
84709	Wikimedia_Apps/iOS_FAQ_www.mediawiki.org_all-a...	0.0	0.0	2.0	0.0	0.0	1.0	0.0	1.0	0.0	...	2.0	2.0	5.0	1.0	1.0	5.0	4.0	2.0	3.0	2.0
84723	Wikimedia_Language_engineering_www.mediawiki.o...	29.0	73.0	32.0	44.0	54.0	33.0	38.0	12.0	10.0	...	6.0	5.0	6.0	6.0	9.0	4.0	7.0	11.0	12.0	5.0
84727	Wikipedia_Zero_www.mediawiki.org_all-access_sp...	11.0	12.0	14.0	7.0	9.0	13.0	9.0	6.0	13.0	...	4.0	2.0	5.0	4.0	5.0	10.0	6.0	7.0	6.0	4.0
84729	Zürich_Hackathon_2014_www.mediawiki.org_all-ac...	3.0	19.0	19.0	30.0	21.0	24.0	17.0	178.0	40.0	...	10.0	4.0	8.0	7.0	5.0	8.0	6.0	6.0	5.0	6.0

	Page	2015-07-01	2015-07-02	2015-07-03	2015-07-04	2015-07-05	2015-07-06	2015-07-07	2015-07-08	2015-07-09	...	2017-09-04	2017-09-05	2017-09-06	2017-09-07	2017-09-08	2017-09-09	2017-09-10	Access	Agent	Media type
0	2NE1_zh.wikipedia.org_all-access_spider	18.0	11.0	5.0	13.0	14.0	9.0	9.0	22.0	26.0	...	18.0	16.0	27.0	29.0	23.0	54.0	38.0	all-access	spider	wikipedia
1	2PM_zh.wikipedia.org_all-access_spider	11.0	14.0	15.0	18.0	11.0	13.0	22.0	11.0	10.0	...	19.0	54.0	25.0	26.0	23.0	13.0	81.0	all-access	spider	wikipedia

	Page	category
136	Algebraic_topology	Mathematics and abstractions
384	Calc	Mathematics and abstractions
385	Calculator	Mathematics and abstractions
710	Ex_Machina_(film)	Mathematics and abstractions
739	Fermat's_Last_Theorem	Mathematics and abstractions
744	Fibonacci_number	Mathematics and abstractions
931	Hetalia:_Axis_Powers	Mathematics and abstractions
1406	MF_Grimm	Mathematics and abstractions

	Test Statistic	p-value	#Lags Used	Number of Observations Used	Critical Value (1%)	Critical Value (5%)	Critical Value (10%)	Name
0	-3.396666	0.011073	21	781	-3.438751	-2.865248	-2.568744	anglais
1	-1.590738	0.488244	21	781	-3.438751	-2.865248	-2.568744	allemand
2	-1.715705	0.423040	20	782	-3.438740	-2.865243	-2.568742	français
3	-2.837007	0.053179	21	781	-3.438751	-2.865248	-2.568744	chinois

	Test Statistic	p-value	#Lags Used	Critical Value (1%)	Critical Value (5%)	Critical Value (10%)	Name
0	0.576517	0.024771	17	0.739	0.463	0.347	anglais
1	1.259470	0.010000	17	0.739	0.463	0.347	allemand
2	1.060942	0.010000	17	0.739	0.463	0.347	français
3	0.784973	0.010000	15	0.739	0.463	0.347	chinois

	y	lag_1	lag_2	lag_3	lag_4	lag_5	lag_6	lag_7
2017-03-29	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420	4717.031175
2017-03-30	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420
2017-03-31	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259
2017-04-01	4472.170324	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331
2017-04-02	4831.191367	4472.170324	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321

	y	lag_1	lag_2	lag_3	lag_4	lag_5	lag_6	lag_7	Jour de la semaine	Weekend
2017-03-27	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420	4717.031175	5041.409233	5176.765588	0	0
2017-03-28	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420	4717.031175	5041.409233	1	0
2017-03-29	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420	4717.031175	2	0
2017-03-30	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4484.150420	3	0
2017-03-31	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	4426.551259	4	0
2017-04-01	4472.170324	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	4767.011331	5	1
2017-04-02	4831.191367	4472.170324	4499.341847	4651.347062	5182.510791	5109.558393	5005.346283	5006.523321	6	1

Projet de Statistique Prévision : Wikipedia Traffic Forecast¶

NGO Khoa Anh, Athéna Brisse, ISUP, M2 ISDS¶

Table des matières¶

Bibliothèques¶

Introduction¶

Visualisation des données¶

Variables NaN :¶

Créer de nouvelles variables explicatives (Access, Agent et Site Web Source)¶

On découpe l'intitulé de chaque page pour obtenir l'accès, l'agent et le site web source¶

View Count (Access)¶

View Count (Agent)¶

View Media type¶

Trouver la langue des pages¶

Création d'une variable explicative supplémentaire : le thème de l'article¶

Exemples¶

ARIMA sur le trafic des pages selon la langue utilisée (anglais, allemand, français, chinois)¶

Modèle AR¶

Modèle MA¶

Modèle ARIMA¶

Résumé des étapes à faire¶

Étape 1 : Vérifier la stationnarité¶

Test ADF¶

Test de stationnarité pour les pages en anglais¶

KPSS Test¶

L'ordre de différenciation (d) dans le modèle ARIMA¶

Pour les pages en Anglais¶

ALLEMAND¶

FRANCAIS¶

CHINOIS¶

Trouver l'ordre du terme AR (p) et le terme MA (q)¶

Anglais¶

Allemand¶

Français¶

Chinois¶

Construire le modèle ARIMA¶

Pour les pages en Anglais (p,d,q) = (2,1,2)¶

Pour les pages en allemand (p,d,q) = (2,1,2)¶

Pour les pages en français (p,d,q) = (2,1,2)¶

Pour les pages en Chinois (p,d,q) = (2,1,2)¶

Construire le modèle ARIMA optimal en utilisant Out-of-Time Cross validation¶

Pour les pages en Anglais¶

Pour les pages en allemand¶

Pour les pages en français¶

Pour les pages en Chinois¶

Conclusion générale sur l'ARIMA¶

À suivre...¶

Prédiction avec Ridge regression¶

Sur les articles en Anglais¶

Prédiction avec Lasso Regression¶

Prédiction avec XGBoost¶

Sur les articles en Anglais¶

Rappel : vues des articles wikipédia en anglais en moyenne¶

Ajout de features : jour de la semaine et week-end¶

Mise à l'échelle (Scaling)¶

Prédiction avec Random Forest¶

CONCLUSION¶