# Package imports.
import math
import time
from math import log, sqrt

import numpy as np
import pandas as pd
from pandas import concat, read_csv, Series
# BUG FIX: `from pandas import datetime` was removed from pandas; the stdlib
# datetime serves the same purpose where needed.
from datetime import datetime

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.models import Sequential, load_model
# BUG FIX: `keras.layers.core` is an old internal path; the public location
# of these layers is `keras.layers`.
from keras.layers import Dense, Dropout, Activation, LSTM

from statsmodels.tsa.arima_model import ARIMA

# Load the dataset, select the rows of one stock (AMZN) and move the close
# price to the last column under the name 'adj_close'.
df = pd.read_csv("prices-split-adjusted.csv", parse_dates=True)
df['date'] = pd.to_datetime(df["date"])
df.set_index(df["date"], inplace=True)
df = df[df['symbol'] == "AMZN"]
# BUG FIX: positional `axis` in DataFrame.drop was removed in pandas 2.x;
# use the explicit `columns=` keyword.
df.drop(columns=['symbol'], inplace=True)
df["adj_close"] = df.close
df.drop(columns=['close'], inplace=True)


def preprocess(df):
    """Scale each feature column into (-1, 1) and drop the 'date' column.

    NOTE(review): fit_transform is called per column, so every column gets
    its own min/max fit — kept as in the original.
    """
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    df['open'] = min_max_scaler.fit_transform(df['open'].values.reshape(-1, 1))
    df['volume'] = min_max_scaler.fit_transform(df['volume'].values.reshape(-1, 1))
    df['high'] = min_max_scaler.fit_transform(df['high'].values.reshape(-1, 1))
    df['low'] = min_max_scaler.fit_transform(df['low'].values.reshape(-1, 1))
    df['adj_close'] = min_max_scaler.fit_transform(df['adj_close'].values.reshape(-1, 1))
    df.drop(columns=['date'], inplace=True)
    return df


def load_data_BKP(df):
    """Split df into train/test sets for the backpropagation (BKP) model.

    The first four columns are the features, the fifth is the target.
    Returns (X_train, X_test, y_train, y_test) with an 80/20 split.
    """
    X = df.iloc[:, [0, 1, 2, 3]]
    y = df.iloc[:, 4]
    split = int(len(df) * 0.8)  # 80% train / 20% test
    X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
    return X_train, X_test, y_train, y_test
# Shape the data for the LSTM: sliding windows plus an 80/20 train/test split.
def load_data_LSTM(stock, seq_len):
    """Build overlapping windows of seq_len rows (plus one target row).

    Returns [x_train, y_train, x_test, y_test] where x_* have shape
    (samples, seq_len, n_features) and y_* is the last column (adj_close)
    of the row following each window.
    """
    amount_of_features = len(stock.columns)  # 5
    # BUG FIX: DataFrame.as_matrix() was removed from pandas; .values is
    # the equivalent replacement.
    data = stock.values
    sequence_length = seq_len + 1  # window rows + one row used as target
    result = []
    for index in range(len(data) - sequence_length):
        result.append(data[index: index + sequence_length])
    result = np.array(result)

    row = round(0.8 * result.shape[0])  # 80% split
    train = result[:int(row), :]
    x_train = train[:, :-1]
    y_train = train[:, -1][:, -1]  # adj_close of the target row
    x_test = result[int(row):, :-1]
    y_test = result[int(row):, -1][:, -1]
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], amount_of_features))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], amount_of_features))
    return [x_train, y_train, x_test, y_test]


def logrets(numbers):
    """Logarithmic returns: first differences of the natural logs."""
    logs = [log(x) for x in numbers]
    return np.diff(logs)


def build_model_lstm(layers):
    """Build and compile the stacked LSTM; layers = [features, window, output]."""
    d = 0.5  # dropout rate
    model = Sequential()
    model.add(LSTM(512, input_shape=(layers[1], layers[0]), return_sequences=True))
    model.add(Dropout(d))
    model.add(LSTM(512, input_shape=(layers[1], layers[0]), return_sequences=True))
    model.add(Dropout(d))
    model.add(LSTM(32, input_shape=(layers[1], layers[0]), return_sequences=False))
    model.add(Dropout(d))
    # BUG FIX: the original line was missing its closing parenthesis.
    model.add(Dense(1, kernel_initializer="uniform", activation='linear'))
    start = time.time()
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    print("Compilation Time : ", time.time() - start)
    return model


# Build the backpropagation (dense feed-forward) model.
def build_model():
    """Four-layer dense regressor over the 4 input features."""
    regressor = Sequential()
    regressor.add(Dense(units=512, kernel_initializer='uniform', activation='relu', input_dim=4))
    regressor.add(Dropout(.2))
    regressor.add(Dense(units=512, kernel_initializer='uniform', activation='relu'))
    regressor.add(Dropout(.2))
    regressor.add(Dense(units=64, kernel_initializer='uniform', activation='relu'))
    regressor.add(Dropout(.2))
    regressor.add(Dense(units=1, kernel_initializer='uniform', activation='linear'))
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor


# Build the ARIMA model: rolling one-step-ahead forecast over the test period.
def model_arima(ts):
    """Walk-forward ARIMA(1,0,0): refit on growing history per test point."""
    X = ts
    size = int(len(X) * 0.80)  # 80/20 split
    train, test = X[0:size], X[size:len(X)]
    history = [x for x in train]
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=(1, 0, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        obs = test[t]
        history.append(obs)  # walk forward: the observed value joins history
        print('predicted=%f, expected=%f' % (yhat, obs))
    return predictions


# Model-accuracy metrics.  NOTE(review): mean_squared_error shadows the
# sklearn import of the same name; kept as in the original.
def mean_absolute_percentage_error(newy_test_ret, newp_ret):
    """MAPE between the true and predicted return series."""
    newy_test_ret, newp_ret = np.array(newy_test_ret), np.array(newp_ret)
    return np.mean(np.abs((newy_test_ret - newp_ret) / newy_test_ret))


def mean_squared_error(newy_test_ret, newp_ret):
    """MSE between the true and predicted return series."""
    newy_test_ret, newp_ret = np.array(newy_test_ret), np.array(newp_ret)
    return np.mean(np.square(newy_test_ret - newp_ret))


def mean_absolute_error(newy_test_ret, newp_ret):
    """MAE between the true and predicted return series."""
    newy_test_ret, newp_ret = np.array(newy_test_ret), np.array(newp_ret)
    return np.mean(np.abs(newy_test_ret - newp_ret))


# Transform normalized values back to the original price scale.
def denormalize(df, normalized_value):
    """Invert the (-1, 1) MinMax scaling using the adj_close column's range."""
    df = df['adj_close'].values.reshape(-1, 1)
    normalized_value = normalized_value.reshape(-1, 1)
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    a = min_max_scaler.fit_transform(df)  # fit only; `a` itself is unused
    new = min_max_scaler.inverse_transform(normalized_value)
    return new


# ===================== MODEL LSTM =====================
# Load the data in the required format and split into train/test sets.
df = preprocess(df)
window = 30
x_train, y_train, x_test, y_test = load_data_LSTM(df, window)

# Build and train the LSTM model.
model = build_model_lstm([5, window, 1])
model.fit(x_train, y_train, batch_size=256, epochs=80, validation_split=0.1, verbose=1)

# Produce predictions.
diff = []
ratio = []
p = model.predict(x_test)
print(p.shape)
# For every observation in the test set record ratio and absolute error.
for u in range(len(y_test)):
    pr = p[u][0]
    ratio.append((y_test[u] / pr) - 1)
    diff.append(abs(y_test[u] - pr))

# Transform the normalized data back to the original scale.
newp = denormalize(df, p)
newy_test = denormalize(df, y_test)

# Logarithmic returns (the first return is set to 0).
newp_ret = logrets(newp)
newp_ret[0] = 0
newy_test_ret = logrets(newy_test)
newy_test_ret[0] = 0

# Model accuracy.
coefficient_of_determination_lstm = r2_score(newy_test_ret, newp_ret)
mse_lstm = mean_squared_error(newy_test_ret, newp_ret)
mae_lstm = mean_absolute_error(newy_test_ret, newp_ret)
mape_lstm = mean_absolute_percentage_error(newy_test_ret, newp_ret)
print(mse_lstm, mae_lstm, mape_lstm)
print(coefficient_of_determination_lstm)

real_ret_avg_lstm = newy_test_ret.sum() / len(newy_test_ret)
real_ret_lstm = newy_test_ret.sum()
pred_ret_avg_lstm = newp_ret.sum() / len(newp_ret)
pred_ret_lstm = newp_ret.sum()
print(real_ret_avg_lstm, pred_ret_lstm, real_ret_lstm, pred_ret_avg_lstm)

# ===================== BKP =====================
# Reload the dataset, select AMZN, move close price to the last column.
df = pd.read_csv("prices-split-adjusted.csv", parse_dates=True)
df['date'] = pd.to_datetime(df["date"])
df.set_index(df["date"], inplace=True)
df = df[df['symbol'] == "AMZN"]
df.drop(columns=['symbol'], inplace=True)
df["adj_close"] = df.close
df.drop(columns=['close'], inplace=True)
df.drop(columns=['date'], inplace=True)
# NOTE(review): unlike the LSTM section, the data is NOT normalized here,
# yet the predictions are later passed through denormalize() — confirm this
# is the intended behavior.

X_train, X_test, y_train, y_test = load_data_BKP(df)
regressor = build_model()
regressor.fit(X_train, y_train, batch_size=512, epochs=100)

# Predict on the test set.
y_pred = regressor.predict(X_test)

# Back to the original scale.
newp = denormalize(df, y_pred)
y_test = np.array(y_test)
newy_test = denormalize(df, y_test)

# Logarithmic returns.
newp_ret = logrets(newp)
newp_ret[0] = 0
newy_test_ret = logrets(newy_test)
newy_test_ret[0] = 0

# Model accuracy.
coefficient_of_determination_bkp = r2_score(newy_test_ret, newp_ret)
mse_bkp = mean_squared_error(newy_test_ret, newp_ret)
mae_bkp = mean_absolute_error(newy_test_ret, newp_ret)
mape_bkp = mean_absolute_percentage_error(newy_test_ret, newp_ret)
print(mse_bkp, mae_bkp, mape_bkp)
# BUG FIX: the original had a stray space inside the variable name
# (`coefficient_of_ determination_bkp`).
print(coefficient_of_determination_bkp)

real_ret_avg_bkp = newy_test_ret.sum() / len(newy_test_ret)
real_ret_bkp = newy_test_ret.sum()
pred_ret_avg_bkp = newp_ret.sum() / len(newp_ret)
pred_ret_bkp = newp_ret.sum()
print(real_ret_avg_bkp, pred_ret_bkp, real_ret_bkp, pred_ret_avg_bkp)

# ===================== MODEL ARIMA =====================
# BUG FIX: `pd.datetime` was removed from pandas — read_csv can parse the
# ISO-formatted dates itself via parse_dates, no date_parser needed.
df = pd.read_csv("prices-split-adjusted.csv", parse_dates=['date'], index_col='date')
df = df[df['symbol'] == "AMZN"]
df.drop(columns=['symbol'], inplace=True)
# NOTE(review): the original called preprocess(df) here, but this frame has
# neither a 'date' column nor an 'adj_close' column, so preprocess() would
# raise a KeyError; the ARIMA model is fit on the raw close-price
# differences instead.
ts = df['close']
ts = np.diff(ts)
ts = np.array(ts)

# BUG FIX: the defined function is model_arima (not model_ARIMA), and the
# top-level `test` series must be recreated with the same 80/20 split that
# model_arima uses internally.
size = int(len(ts) * 0.80)
test = ts[size:]
predictions = model_arima(ts)

coefficient_of_dermination = r2_score(test, predictions)
mse = mean_squared_error(test, predictions)
mae = mean_absolute_error(test, predictions)
mape = mean_absolute_percentage_error(test, predictions)
print(mse, mae, mape)
print(coefficient_of_dermination)
# NOTE(review): the four lines below reuse newy_test_ret / newp_ret from the
# BKP section (they look like copy-paste residue); kept as in the original.
real_ret_avg_bkp = newy_test_ret.sum() / len(newy_test_ret)
real_ret_bkp = newy_test_ret.sum()
pred_ret_avg_bkp = newp_ret.sum() / len(newp_ret)
pred_ret_bkp = newp_ret.sum()

# ===================== PARAMETRY LSTM =====================
# Helpers for tuning the dropout, epoch-count and neuron-count
# hyper-parameters of the LSTM.
def get_stock_data():
    """Reload and preprocess the AMZN data.

    BUG FIX: the original tuning helpers called get_stock_data() without
    ever defining it; this reproduces the data-loading steps used elsewhere
    in the script.
    """
    df = pd.read_csv("prices-split-adjusted.csv", parse_dates=True)
    df['date'] = pd.to_datetime(df["date"])
    df.set_index(df["date"], inplace=True)
    df = df[df['symbol'] == "AMZN"]
    df.drop(columns=['symbol'], inplace=True)
    df["adj_close"] = df.close
    df.drop(columns=['close'], inplace=True)
    return preprocess(df)


def build_model_param(layers, neurons, d):
    """Two LSTM layers + two dense layers with configurable sizes/dropout."""
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(layers[1], layers[0]), return_sequences=True))
    model.add(Dropout(d))
    model.add(LSTM(neurons[1], input_shape=(layers[1], layers[0]), return_sequences=False))
    model.add(Dropout(d))
    model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='linear'))
    # BUG FIX: the model must be compiled before model.evaluate() can run in
    # model_score(); build_model_param2 already compiles, this one did not.
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


# Evaluate the model on the train and test sets; used to compare MSE across
# different hyper-parameter values.
def model_score(model, x_train, y_train, x_test, y_test):
    """Return (train loss, test loss) from keras' evaluate()."""
    trainScore = model.evaluate(x_train, y_train, verbose=0)
    testScore = model.evaluate(x_test, y_test, verbose=0)
    return trainScore[0], testScore[0]


def quick_measure(seq_len, d, shape, neurons, epochs):
    """Train a fresh LSTM with the given hyper-parameters and score it."""
    df = get_stock_data()
    X_train, y_train, X_test, y_test = load_data_LSTM(df, seq_len)
    model = build_model_param(shape, neurons, d)
    model.fit(X_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)
    trainScore, testScore = model_score(model, X_train, y_train, X_test, y_test)
    return trainScore, testScore


# Helpers for tuning the window-size hyper-parameter.
def build_model_param2(layers, neurons, d):
    """Same architecture as build_model_param; used for window-size tuning."""
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(layers[1], layers[0]), return_sequences=True))
    model.add(Dropout(d))
    model.add(LSTM(neurons[1], input_shape=(layers[1], layers[0]), return_sequences=False))
    model.add(Dropout(d))
    model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='linear'))
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


def quick_measure2(seq_len, d, shape, neurons, epochs):
    """Train/score an LSTM for a given window size (seq_len)."""
    df = get_stock_data()
    x_train, y_train, x_test, y_test = load_data_LSTM(df, seq_len)
    model = build_model_param2(shape, neurons, d)
    model.fit(x_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)
    trainScore, testScore = model_score(model, x_train, y_train, x_test, y_test)
    return trainScore, testScore


# Reload the data and set the baseline hyper-parameters.
df = pd.read_csv("prices-split-adjusted.csv", parse_dates=True)
df['date'] = pd.to_datetime(df["date"])
df.set_index(df["date"], inplace=True)
df = df[df['symbol'] == "AMZN"]
df.drop(columns=['symbol'], inplace=True)
df["adj_close"] = df.close
df.drop(columns=['close'], inplace=True)

seq_len = 22
d = 0.2
shape = [5, seq_len, 1]  # features, window, output
neurons = [128, 128, 32, 1]
epochs = 60
window = 22
df = preprocess(df)
x_train, y_train, X_test, y_test = load_data_LSTM(df, window)
# BUG FIX: the original called the undefined build_model2(); the matching
# tuning-model builder defined above is build_model_param2().
model = build_model_param2(shape, neurons, d)
model.fit(x_train, y_train, batch_size=30, epochs=30, validation_split=0.1, verbose=0)

# Pick the dropout value with the smallest MSE on the training set.
dlist = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
dropout_result = {}
for d in dlist:
    trainScore, testScore = quick_measure(seq_len, d, shape, neurons, epochs)
    dropout_result[d] = trainScore
min_val = min(dropout_result.values())
min_val_key = [k for k, v in dropout_result.items() if v == min_val]
print(dropout_result)
print(min_val_key)

# Pick the epoch count with the smallest MSE on the training set.
epochslist = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
epochs_result = {}
for epochs in epochslist:
    trainScore, testScore = quick_measure(seq_len, d, shape, neurons, epochs)
    epochs_result[epochs] = trainScore
min_val = min(epochs_result.values())
min_val_key = [k for k, v in epochs_result.items() if v == min_val]
print(epochs_result)
print(min_val_key)

# Pick the neuron counts with the smallest MSE on the training set.
neuronlist1 = [32, 64, 128, 256, 512]
neuronlist2 = [16, 32, 64]
neurons_result = {}
for neuron_lstm in neuronlist1:
    neurons = [neuron_lstm, neuron_lstm]
    for activation in neuronlist2:
        neurons.append(activation)
        neurons.append(1)
        trainScore, testScore = quick_measure(seq_len, d, shape, neurons, epochs)
        neurons_result[str(neurons)] = trainScore
        neurons = neurons[:2]  # drop the two appended entries for the next run
min_val = min(neurons_result.values())
min_val_key = [k for k, v in neurons_result.items() if v == min_val]
print(neurons_result)
print(min_val_key)

neurons = [256, 256, 32, 1]
epochs = 30
d = 0.4
decay = 0.4  # NOTE(review): assigned but never used

# Pick the window size with the smallest MSE on the training set.
seq_len_list = [10, 22, 30, 45, 60]
seq_len_result = {}
for seq_len in seq_len_list:
    shape = [5, seq_len, 1]
    # BUG FIX: the original called the undefined quick_measure3(); the
    # window-size tuning helper defined above is quick_measure2().
    trainScore, testScore = quick_measure2(seq_len, d, shape, neurons, epochs)
    seq_len_result[seq_len] = trainScore
min_val = min(seq_len_result.values())
min_val_key = [k for k, v in seq_len_result.items() if v == min_val]
print(seq_len_result)
print(min_val_key)

# ===================== PARAMETRY BKP =====================
# Helpers for tuning the dropout and epoch-count hyper-parameters of the
# backpropagation model.  NOTE(review): these redefine the LSTM tuning
# helpers of the same names (notebook-style section), as in the original.
def build_model_param(d):
    """Dense regressor with a configurable dropout rate."""
    model = Sequential()
    model.add(Dense(units=500, kernel_initializer='uniform', activation='relu', input_dim=4))
    model.add(Dropout(d))
    model.add(Dense(units=500, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(d))
    model.add(Dense(units=500, kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(d))
    model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Helper for tuning the neuron counts of the BKP model.
def build_model_param2(neurons, d):
    """Dense regressor with configurable layer sizes."""
    model = Sequential()
    model.add(Dense(neurons[0], kernel_initializer='uniform', activation='relu', input_dim=4))
    model.add(Dropout(d))
    model.add(Dense(neurons[1], kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(d))
    model.add(Dense(neurons[2], kernel_initializer='uniform', activation='relu'))
    model.add(Dropout(d))
    model.add(Dense(neurons[3], kernel_initializer='uniform', activation='sigmoid'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


# Evaluate the model on train and test sets for comparing MSE across
# hyper-parameter values.
def quick_measure2(d, neurons, epochs):
    """Train/score a BKP model; uses the module-level train/test split."""
    model = build_model_param2(neurons, d)
    model.fit(X_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)
    # BUG FIX: the original called the undefined model_score2(); the scoring
    # helper defined in the LSTM-tuning section is model_score().
    trainScore, testScore = model_score(model, X_train, y_train, X_test, y_test)
    return trainScore, testScore


# Reload the data.
df = pd.read_csv("prices-split-adjusted.csv", parse_dates=True)
df['date'] = pd.to_datetime(df["date"])
df.set_index(df["date"], inplace=True)
df = df[df['symbol'] == "AMZN"]
df.drop(columns=['symbol'], inplace=True)
df["adj_close"] = df.close
df.drop(columns=['close'], inplace=True)
df = preprocess(df)
# BUG FIX: load_data_BKP returns (X_train, X_test, y_train, y_test); the
# original unpacked it in the wrong order as (x_train, y_train, X_test, y_test),
# silently swapping the test features and the training targets.
X_train, X_test, y_train, y_test = load_data_BKP(df)
d = 0.2
epochs = 60
# BUG FIX: build_model() takes no arguments; the dropout-parameterised
# builder is build_model_param(d).
model = build_model_param(d)
model.fit(X_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)


# BUG FIX: the tuning loops below call quick_measure(d, epochs), but no
# two-argument variant existed anywhere in the script; this defines it in
# the same shape as quick_measure2.
def quick_measure(d, epochs):
    """Train/score a BKP model for a given dropout and epoch count."""
    model = build_model_param(d)
    model.fit(X_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)
    trainScore, testScore = model_score(model, X_train, y_train, X_test, y_test)
    return trainScore, testScore


# Pick the dropout value with the smallest MSE on the training set.
dlist = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
dropout_result = {}
for d in dlist:
    trainScore, testScore = quick_measure(d, epochs)
    dropout_result[d] = trainScore
min_val = min(dropout_result.values())
min_val_key = [k for k, v in dropout_result.items() if v == min_val]
print(dropout_result)
print(min_val_key)

# Pick the epoch count with the smallest MSE on the training set.
epochslist = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
epochs_result = {}
for epochs in epochslist:
    trainScore, testScore = quick_measure(d, epochs)
    epochs_result[epochs] = trainScore
min_val = min(epochs_result.values())
min_val_key = [k for k, v in epochs_result.items() if v == min_val]
print(epochs_result)
print(min_val_key)

neurons = [128, 128, 32, 1]
# BUG FIX: build_model2 is undefined; the neuron-parameterised builder is
# build_model_param2(neurons, d).
model = build_model_param2(neurons, d)
model.fit(X_train, y_train, batch_size=512, epochs=epochs, validation_split=0.1, verbose=1)

# Pick the neuron counts with the smallest MSE on the training set.
neuronlist1 = [32, 64, 128, 256, 512]
neuronlist2 = [16, 32, 64]
neurons_result = {}
for neuron_lstm in neuronlist1:
    neurons = [neuron_lstm, neuron_lstm]
    for activation in neuronlist2:
        neurons.append(activation)
        neurons.append(1)
        trainScore, testScore = quick_measure2(d, neurons, epochs)
        neurons_result[str(neurons)] = trainScore
        neurons = neurons[:2]  # drop the two appended entries for the next run
min_val = min(neurons_result.values())
min_val_key = [k for k, v in neurons_result.items() if v == min_val]
print(neurons_result)
print(min_val_key)