Commit 8461ca81 authored by kfilo001's avatar kfilo001

added AnomalyDetectorApp

parent 7204eb11
import sys
sys.path.append('/data/financedata/thesis/kfilo001/anomaly-detector/source')
sys.path.append('/data/financedata/thesis/kfilo001/anomaly-detector-production/virtualview/jnotebooks')
from source.RawDataSet import RawDataSet
from source.FeatureSelection import FeatureSelection
from source.TrainingDataSet import TrainingDataSet
from source.ANN import ANN
from source.PredictionDataSet import PredictionDataSet
from source.Config import Config
from source.AnomalyDetector import AnomalyDetector
from datetime import datetime
import pandas as pd
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import math
plt.style.use('seaborn')  # note: matplotlib >= 3.6 renamed this style to 'seaborn-v0_8'
plt.rcParams['figure.figsize'] = [10, 5]
plt.rcParams['figure.dpi'] = 150
def calculate_wcss(data):
    """Compute the within-cluster sum of squares (WCSS) for k = 2..20."""
    wcss = []
    for n in range(2, 21):
        kmeans = KMeans(n_clusters=n)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)
    return wcss
def optimal_number_of_clusters(wcss):
    """Pick the elbow of the WCSS curve: the k in [2, 20] whose point lies
    farthest from the straight line between the first and last WCSS values."""
    x1, y1 = 2, wcss[0]
    x2, y2 = 20, wcss[-1]
    distances = []
    for i in range(len(wcss)):
        x0 = i + 2
        y0 = wcss[i]
        # perpendicular distance from (x0, y0) to the line through (x1, y1) and (x2, y2)
        numerator = abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1)
        denominator = math.sqrt((y2 - y1) ** 2 + (x2 - x1) ** 2)
        distances.append(numerator / denominator)
    return distances.index(max(distances)) + 2
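# Worked example for the elbow rule above (a synthetic curve, verified by hand):
#   wcss = [1000, 400, 150, 140, 135, 130, 125, 120, 115, 110,
#           105, 100, 95, 90, 85, 80, 75, 70, 65]          # k = 2..20
# The point (4, 150) lies farthest from the line through (2, wcss[0]) and
# (20, wcss[-1]), so optimal_number_of_clusters(wcss) returns 4.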
def kmeans(matrix, components):
    """Cluster the PCA components with KMeans, choosing k via the elbow rule."""
    # within-cluster sum of squares for the 19 candidate cluster counts (k = 2..20)
    sum_of_squares = calculate_wcss(matrix)
    n = optimal_number_of_clusters(sum_of_squares)
    km = KMeans(n_clusters=n)
    clusters = km.fit(components)
    y = clusters.labels_
    df = pd.DataFrame()
    df['y'] = y
    df['time'] = df.index
    df['pca-one'] = components[:, 0]
    df['pca-two'] = components[:, 1]
    return df, n
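# Illustrative only (names are hypothetical, and the snippet is commented out so
# nothing runs at import time): with synthetic 2-D data the helper picks a
# cluster count via the elbow rule and returns per-sample labels:
#   import numpy as np
#   pts = np.random.RandomState(0).randn(100, 2)
#   labels_df, n = kmeans(pts, pts)  # same array serves as matrix and components
#   list(labels_df.columns)  # -> ['y', 'time', 'pca-one', 'pca-two']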
if __name__ == '__main__':
    # expects: <command> <config_path>, e.g. -t config.json
    script = sys.argv[0]
    command = sys.argv[1]
    config_path = sys.argv[2]
    config = Config(command, config_path)
    config.check_parameters()
    config_data = config.get_config_data()
    if command == '-t':
        raw_dataset = RawDataSet(config_data)
        raw_dataset.read_tickers_from_config()
        print('Reading data from folder...')
        raw_dataset.read_csv_from_folder()
        # ---- stocks read from folder
        print(len(raw_dataset.get_stocks().keys()), "stocks were read from folder")
        raw_dataset.format_dataset()
        raw_dataset.filter()
        print("After filtering:", len(raw_dataset.get_stocks().keys()), "stocks left")
        if config_data["interpolate"] == 1:
            print('Imputation of missing timestamps started...')
            raw_dataset.interpolate()
        else:
            print('Skipping interpolation...')
        stocks = raw_dataset.get_stocks()
        print("Stocks were successfully read and added to the dict:", stocks)
        print("Starting feature selection process")
        feature_selection = FeatureSelection(stocks, config_data)
        feature_selection.create_matrix()
        feature_selection.calculate_log_returns()
        feature_selection.normalize_stocks()
        feature_selection.pca()
        print("Number of PCA components:", feature_selection.get_number_pca_components())
        feature_selection.calculate_correlations()
        print("Tickers before filtering:", len(feature_selection.get_correlations().columns))
        feature_selection.filter_out_correlated_tickers()
        selected_tickers, left_tickers = feature_selection.get_selected_tickers()
        print("Tickers left after dropping correlated ones:", len(left_tickers))
        print("Tickers selected for training:", selected_tickers)
        feature_selection.save_info_feature_selection()
        stocks_matrix = feature_selection.get_matrix()
        print("Visualizing PCA components...")
        if feature_selection.get_number_pca_components() > 1:
            kmeans_df, n_clusters = kmeans(feature_selection.get_normalized_matrix(),
                                           feature_selection.get_pca_components_values())
            plt.figure(figsize=(16, 10))
            pca_plot = sns.scatterplot(
                x="pca-one", y="pca-two",
                hue='y',
                palette=sns.color_palette("hls", n_clusters),
                data=kmeans_df,
                legend="full",
                alpha=0.3
            )
            pca_plot.figure.savefig(config_data["path_to_model"] +
                                    datetime.now().strftime('%Y%m%d-%H%M%S') + "_pca_plot.png")
        else:
            print('PCA components cannot be visualized as there is only 1 component')
print("Preparing data for training ...")
training_dataset = TrainingDataSet(stocks_matrix, config_data, selected_tickers)
training_dataset.create_matrix()
training_dataset.split_stocks()
training_dataset.create_window_dataset()
train, validation, test = training_dataset.get_window_dataset()
print("Train set size :", train.shape,
"Validation set size :", validation.shape,
"Test set size:", test.shape)
print("Starting training...")
        nn = ANN(train, validation, test, config_data)
        if config_data["model_to_train"] == 'LSTM_AE':
            nn.LSTM_AE()
        elif config_data["model_to_train"] == 'CNN_AE':
            nn.CNN_AE()
        elif config_data["model_to_train"] == 'LSTM_VAE':
            nn.LSTM_VAE()
        elif config_data["model_to_train"] == 'CNN_VAE':
            nn.CNN_VAE()
        else:
            sys.exit('The model name provided in the config is wrong; '
                     'only 4 models are currently available: '
                     'LSTM_AE, LSTM_VAE, CNN_AE, CNN_VAE')
        nn.get_model_summary()
        nn.run_training()
        nn.evaluate_model()
        loss, val_loss, loss_acc, val_acc = nn.get_model_evaluation()
        nn.calculate_prediction()
print("Visualizing training results...")
plt.figure(figsize=(16, 10))
plt.plot(loss, linewidth=2, label='Train')
plt.plot(val_loss, linewidth=2, label='Valid')
plt.legend(loc='upper right')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.savefig(config_data["path_to_model"] + datetime.now().strftime('%Y%m%d-%H%M%S') + "_training_results.png")
print("Saving trained model...")
nn.save_model()
    else:
        raw_dataset_for_prediction = RawDataSet(config_data)
        raw_dataset_for_prediction.read_tickers_from_config()
        print("Reading stocks from folder...")
        raw_dataset_for_prediction.read_csv_from_folder()
        # ---- stocks read from folder
        print(len(raw_dataset_for_prediction.get_stocks().keys()), "stocks were read from folder")
        raw_dataset_for_prediction.format_dataset()
        raw_dataset_for_prediction.filter()
        print("After filtering:", len(raw_dataset_for_prediction.get_stocks().keys()), "stocks left")
        if config_data["interpolate"] == 1:
            print('Imputation of missing timestamps started...')
            raw_dataset_for_prediction.interpolate()
        else:
            print('Skipping interpolation...')
        stocks = raw_dataset_for_prediction.get_stocks()
        print("The following stocks were added to the dict...")
        print(stocks)
        first_entry = True
        scores = pd.DataFrame()
        print("Starting calculation of anomalies for every stock")
        for ticker in stocks.keys():
            print("Calculation for:", ticker)
            single_stock = {}  # renamed from `dict` to avoid shadowing the builtin
            add_scores = pd.DataFrame()
            single_stock[ticker] = stocks[ticker]
            prediction = PredictionDataSet(single_stock, config_data)
            prediction.create_matrix()
            prediction.calculate_log_returns()
            prediction.normalize_stocks()
            prediction.create_window_dataset()
            prediction_stock = prediction.get_tensors_for_prediction()
            anomaly_detector = AnomalyDetector(single_stock, prediction_stock, config_data)
            anomaly_detector.calculate_prediction()
            anomaly_detector.calculate_reconstruction_error()
            anomaly_detector.evaluate_model()
            metrics, val = anomaly_detector.get_validation_score()
            add_scores['ticker'] = [ticker]
            add_scores['metric'] = [metrics[0]]
            add_scores['score'] = [(1 - val[0]) * 100]
            anomaly_detector.calculate_anomalies()
            anomalies = anomaly_detector.get_calculated_anomalies()
            # counts of flagged anomalies, and of those above stricter reconstruction-error thresholds
            add_scores['>0.8'] = [anomalies['close_prices'].loc[anomalies['anomaly_flag'] == True].count()]
            add_scores['>0.9'] = [anomalies['close_prices'].loc[(anomalies['anomaly_flag'] == True) &
                                                                (anomalies['reconstruction_error'] > 0.9)].count()]
            add_scores['>0.99'] = [anomalies['close_prices'].loc[(anomalies['anomaly_flag'] == True) &
                                                                 (anomalies['reconstruction_error'] >= 0.99)].count()]
            add_scores['average_error'] = [
                sum(anomalies['reconstruction_error'].values) / anomalies['anomaly_flag'].count()]
            # share of windows with reconstruction error >= 0.8, as a percentage
            add_scores['coverage'] = [
                anomalies['close_prices'].loc[anomalies['reconstruction_error'] >= 0.8].count() /
                anomalies['anomaly_flag'].count() * 100]
            print("Average reconstruction error:", add_scores['average_error'])
            if not first_entry:
                scores = pd.concat([scores, add_scores])
            else:
                scores = add_scores
                first_entry = False
            print('Visualizing anomalies...')
            plt.figure()
            plt.plot(anomalies.index, anomalies['close_prices'])
            plt.scatter(anomalies['close_prices'].loc[anomalies['anomaly_flag'] == True].index,
                        anomalies['close_prices'].loc[anomalies['anomaly_flag'] == True], color='red')
            plt.savefig(config_data["path_to_anomalies"] + ticker + '_' +
                        datetime.now().strftime('%Y%m%d-%H%M%S') + "_anomalies.png")
            anomaly_detector.save_anomalies_to_file()
        scores.to_csv(config_data["path_to_anomalies"] +
                      datetime.now().strftime('%Y%m%d-%H%M%S') + '_prediction_scores.csv')
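# Usage sketch (invocation and keys inferred from this script; the Config class
# may validate additional fields, and the file name is assumed from the commit message):
#   training:   python AnomalyDetectorApp.py -t /path/to/config.json
#   prediction: any other command falls through to the prediction branch,
#               e.g. python AnomalyDetectorApp.py -p /path/to/config.json
# A minimal config.json covering only the keys read here might look like:
#   {
#     "interpolate": 1,
#     "model_to_train": "LSTM_AE",
#     "path_to_model": "/path/to/models/",
#     "path_to_anomalies": "/path/to/anomalies/"
#   }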
,selected_ticker,pca_components,#important_features,correlated_columns_before,left_columns
0,['ALXN'],2,2,2,1

,training_loss,validation_loss,training_accuracy,validation_accuracy,testing_loss,testing_accuracy
0,0.7230271030217409,0.5735959274228662,0.32729604840278625,0.3272128999233246,0.10706828534603119,0.32741910219192505