Skip to content

Commit 0459264

Browse files
committed
Anomaly detection algorithm and Visualisation
1 parent 403e494 commit 0459264

File tree

5 files changed

+156
-27
lines changed

5 files changed

+156
-27
lines changed

README.md

+23-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Anomaly Detection on Streamed Data 🌐
22

3-
Welcome to the **Anomaly Detection on Streamed Data** project! This repository dives into the fascinating world of identifying anomalies in real-time data streams using Python.
3+
Welcome to the **Efficient Data Stream Anomaly Detection** project! This repository dives into the fascinating world of identifying anomalies in real-time data streams using Python.
44

55
## What is Anomaly Detection?
66

@@ -33,4 +33,25 @@ Anomaly detection can be achieved through various methods, including:
3333

3434
In this project, I leverage Python's powerful libraries to implement these methods, showcasing effective ways to identify anomalies in streamed data.
3535

36-
## Project Features & Processes :
36+
## Project Features :
37+
38+
## How to Run the code on your system:
39+
40+
## Theory and Formula:
41+
42+
TODO: Also include a concise explanation of the chosen algorithm and its effectiveness.
43+
44+
## Inner Structure of Code:
45+
46+
## Code Explanation:
47+
48+
- **Streamed Data Generator**:
49+
- **Anomaly Detector from streamed data**:
50+
- **Visualization**:
51+
52+
#Anomaly: variable
53+
#random.choices(population, weights) chooses items from the population based on the provided weights.
54+
55+
# The weights determine the likelihood of each item being chosen.
56+
57+
#anomaly = random.choices([0, random.uniform(10, 20)], [0.99, 0.01])[0] # The first argument is the population and the second is the weights (99% normal, 1% anomaly); the trailing [0] is the list index that takes the single chosen value out of the returned list.

ZScore_anomaly_detector.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import numpy as np
2+
import random
3+
import math
4+
class ZScoreAnomalyDetector:
    """Streaming Z-score calculator based on Welford's online algorithm.

    The detector keeps only three running quantities (count, mean and the
    sum of squared deviations from the mean), so memory use does not grow
    with the length of the stream.
    """

    def __init__(self) -> None:
        self.n = 0       # Count of data points seen so far
        self.mean = 0.0  # Running mean
        self.M2 = 0.0    # Running sum of squared differences from the mean

    def update(self, x: float) -> None:
        """Fold one new data point ``x`` into the running mean and M2."""
        self.n += 1
        delta = x - self.mean        # Deviation from the mean *before* the update
        self.mean += delta / self.n
        delta2 = x - self.mean       # Deviation from the mean *after* the update
        self.M2 += delta * delta2    # Welford's update for the squared deviations

    def variance(self) -> float:
        """Return the unbiased sample variance, or NaN with fewer than 2 points."""
        if self.n < 2:
            return float('nan')  # Variance is undefined for fewer than 2 data points
        return self.M2 / (self.n - 1)

    def std(self) -> float:
        """Return the sample standard deviation (square root of the variance)."""
        return math.sqrt(self.variance())

    def z_score(self, x: float) -> float:
        """Return the Z-score of ``x`` against the statistics gathered so far.

        Returns 0 when fewer than 2 points have been seen, because the
        Z-score is not meaningful yet. When every point seen so far is
        identical (standard deviation of 0) the result is 0 if ``x`` equals
        the mean and a signed infinity otherwise, instead of raising
        ``ZeroDivisionError``.
        """
        if self.n < 2:
            return 0.0
        deviation = x - self.mean
        sigma = self.std()
        if sigma == 0:
            # Zero spread: dividing would raise ZeroDivisionError.
            if deviation == 0:
                return 0.0
            return math.copysign(math.inf, deviation)
        return deviation / sigma
33+
34+
35+
36+
Binary file not shown.

main.py

+97-25
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,77 @@
1+
import ZScore_anomaly_detector as zScoreClass
12
import numpy as np
23
import random
3-
#Anomaly: variable
4-
#random.choices(population , weights ) choses population based on provided weight,
5-
# The weights determine the likelihood of each item being chosen.
6-
#anomaly = random.choices([0, random.uniform(10, 20)], [0.99, 0.01])[0] last 0 represent arrey index
7-
# population part Weight Part which is 99% normal & 1% anomaly
8-
9-
4+
import matplotlib.pyplot as plt
105

6+
# Function to simulate the continuous data stream
117
def data_stream(*, step=0.1, amplitude=10, anomaly_prob=0.01,
                anomaly_range=(10, 20), noise_level=1):
    """Yield an endless simulated data stream: seasonal wave + noise + rare spikes.

    All parameters are keyword-only and default to the values that were
    previously hard-coded, so ``data_stream()`` behaves as before.

    Args:
        step: Amount the time value advances after each yielded point.
        amplitude: Multiplier applied to the sine wave (seasonal component).
        anomaly_prob: Probability, in [0, 1], that a point carries a spike.
        anomaly_range: ``(low, high)`` bounds of the uniformly drawn spike size.
        noise_level: Noise is drawn uniformly from ``[-noise_level, noise_level]``.

    Yields:
        The sum of the seasonal, noise and anomaly components for each step.

    Raises:
        ValueError: If ``anomaly_prob`` is outside [0, 1] (raised on first
            iteration, because this is a generator).
    """
    if not 0 <= anomaly_prob <= 1:
        raise ValueError("anomaly_prob must be between 0 and 1")

    weights = [1.0 - anomaly_prob, anomaly_prob]
    t = 0  # Time counter
    while True:
        seasonal = amplitude * np.sin(t)  # Cyclic (seasonal) variation
        # The spike size is drawn on every step (as before) so that seeded
        # runs consume random numbers in the same order.
        spike = random.uniform(*anomaly_range)
        anomaly = random.choices([0, spike], weights)[0]
        noise = random.uniform(-noise_level, noise_level)  # Small random noise
        yield seasonal + noise + anomaly
        t += step  # Advance the time value
17+
18+
# Anomaly detection function
19+
def anomaly_detection(stop_threshold=1000, z_threshold=3):
    """Detect anomalies in a continuous data stream using Z-score and visualize results.

    Reads points from ``data_stream()``, folds each one into a
    ``ZScoreAnomalyDetector`` and flags it when the absolute Z-score exceeds
    ``z_threshold``. Flagged points are printed and drawn in red on a live
    matplotlib plot.

    Args:
        stop_threshold: Number of data points to process before stopping
            (the stream itself never ends).
        z_threshold: Absolute Z-score above which a point is reported as an
            anomaly.
    """
    detector = zScoreClass.ZScoreAnomalyDetector()  # Running mean/variance tracker
    stream = data_stream()  # Endless generator of simulated values

    # Data kept for plotting
    data_points = []
    anomaly_points = []

    # Set up the plot
    plt.ion()  # Interactive mode so the figure refreshes while the loop runs
    fig, ax = plt.subplots()
    line, = ax.plot([], [], label='Data Stream')
    scatter_anomaly = ax.scatter([], [], color='red', label='Anomalies')
    ax.set_xlim(0, stop_threshold)
    ax.set_ylim(-20, 30)
    ax.set_xlabel('Time Steps')
    ax.set_ylabel('Data Value')
    ax.legend()

    # zip() with range() bounds the loop to exactly stop_threshold points and
    # stops without pulling an extra value from the stream.
    for i, data_point in zip(range(stop_threshold), stream):
        # NOTE: the point is folded into the statistics before it is scored,
        # so a spike slightly dampens its own Z-score.
        detector.update(data_point)
        z = detector.z_score(data_point)

        data_points.append(data_point)
        line.set_data(range(len(data_points)), data_points)

        if abs(z) > z_threshold:
            print(f"Anomaly detected: {data_point}, Z-score: {z}")
            anomaly_points.append((i, data_point))
            # Update the single existing scatter artist instead of adding a
            # new collection on every iteration.
            scatter_anomaly.set_offsets(anomaly_points)

        # Redraw plot
        ax.set_xlim(0, max(10, len(data_points)))
        fig.canvas.draw()
        fig.canvas.flush_events()

    plt.ioff()  # Leave interactive mode so show() blocks until the window closes
    plt.show()
71+
72+
# Run the anomaly detection function
3073

3174
# Module-level call: runs the detection with its default arguments whenever this file is executed or imported.
anomaly_detection()
32-
# anomaly detection algorithm , from stream data , show flaged data in visiulised way + documentation Update<< Work left tomorrow
3375

3476

3577

@@ -50,18 +92,48 @@ def anomaly_detection(stop_threshold=20): # This stop_hreshold serves as a stopi
5092

5193

5294

95+
96+
97+
98+
99+
"""
100+
small scale Algorithm Test
101+
53102
# data = [1, 2, 2, 2, 3, 1, 1, 15, 2, 2, 2, 3, 1, 1, 2]
54103
# mean = np.mean(data)
55104
# std = np.std(data)
56105
# print('mean of the dataset is', mean)
57106
# print('std. deviation is', std)
58107
108+
threshold = 3
109+
outlier = []
110+
for i in data:
111+
z = (i-mean)/std
112+
if z > threshold:
113+
outlier.append(i)
114+
print('outlier in dataset is', outlier)
115+
116+
"""
117+
118+
119+
# For Z-Score Calculation:
120+
# To calculate the Z-score of a data point in a stream, you'll need:
121+
122+
# The mean of the data stream up to the current point.
123+
# The standard deviation of the data stream up to the current point.
124+
125+
# For a continuous data stream, incremental mean calculation:
126+
#! new_mean= old_mean + (x - old_mean)/n
127+
128+
# Incremental Variance and Standard Deviation:
129+
# The standard deviation can be calculated incrementally by first updating the variance
130+
# (because standard deviation is just the square root of variance).
59131

60-
# threshold = 3
61-
# outlier = []
62-
# for i in data:
63-
# z = (i-mean)/std
64-
# if z > threshold:
65-
# outlier.append(i)
66-
# print('outlier in dataset is', outlier)
132+
#! new_M2 = old_M2 + (x - old_mean) * (x - new_mean);  new_variance = new_M2 / (n - 1)
133+
# where x= new data_point and n is number of data point.
134+
# then
135+
#! new_standard_deviation = sqrt(new_variance)
67136

137+
# !Z-Score:
138+
#! Z = (x - u) / sigma, where x = the new (current) data point, u is the mean of the data stream so far,
139+
# ! sigma is the standard_deviation in data stream so far.
File renamed without changes.

0 commit comments

Comments
 (0)