Redis #92
Open · wants to merge 12 commits into `main`
17 changes: 15 additions & 2 deletions README.md
@@ -56,11 +56,24 @@ pip install -r requirements.txt
 - `openAI_key`: Your OpenAI API key.
 ### **4.** Run:
 
-Chatbot
+**Chatbot**
 ```
 streamlit run frontend/chatbot/app.py
 ```
-Recommender:
+**Recommender:**
+```
+redis-server
+```
+Check that the server is running (outputs `PONG` if connected):
+```
+redis-cli ping
+```
+Inspect the database contents:
+```
+redis-cli
+
+KEYS *
+```
 ```
 streamlit run frontend/recommender/app.py
 ```
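Note that the recommender app now reads precomputed recommendations from Redis rather than generating them on the fly, so the cache has to be populated before the app is launched. A plausible run sequence, assuming per-user recommendation keys are written in a separate step (this PR's `fill_redis.py` caches the raw dataframes, while `app.py` looks up recommendations by username):
```
redis-server &
python frontend/recommender/fill_redis.py
streamlit run frontend/recommender/app.py
```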
139 changes: 139 additions & 0 deletions codecompasslib/models/lr_model.py
@@ -0,0 +1,139 @@
import os
import sys

# Construct the path to the project root (two levels up from this file)
root_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(root_dir)
real_project_dir = os.path.dirname(project_dir)
# Add the project root to the Python path before importing codecompasslib
sys.path.insert(0, real_project_dir)

from typing import Tuple
from pandas import DataFrame, concat
from numpy import ndarray, argsort
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from category_encoders import ordinal
from codecompasslib.models.lightgbm_model import preprocess_data


def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]:
    """
    Encode the categorical columns in a DataFrame using the specified encoder.

    :param df: The DataFrame to be encoded.
    :param encoder: The encoder object used for encoding the categorical columns.
    :param label_col: The name of the label column.
    :param typ: The type of encoding to perform. Defaults to "fit".
    :return: A tuple containing the encoded DataFrame and the label column values.
    """
    if typ == "fit":
        df = encoder.fit_transform(df)
    else:
        df = encoder.transform(df)
    y: ndarray = df[label_col].values
    del df[label_col]
    return df, y


def train_logistic_regression_model(df_merged: DataFrame, label_col: str) -> Tuple[LogisticRegression, ordinal.OrdinalEncoder]:
    """
    Trains a logistic regression model using the provided merged dataframe.

    It performs the following steps:
    1. Splits the merged dataframe into training, validation, and test sets.
    2. Encodes categorical columns using ordinal encoding.
    3. Trains the logistic regression model using the training data.
    4. Returns the trained logistic regression model and the ordinal encoder.

    Note: This function assumes that the merged dataframe has the following columns:
    - 'target': The target variable to be predicted.
    - 'id': An identifier column.
    - 'owner_user': A column representing the owner user.
    - 'embedding_0' to 'embedding_255': Numerical columns representing the embeddings.
    - 'language': A categorical column representing the language.
    - 'stars': A numerical column representing the number of stars.

    :param df_merged: DataFrame containing the training data.
    :param label_col: The name of the target variable column.
    :return: A tuple containing the trained logistic regression model and the ordinal encoder.
    """
    print("Training logistic regression model")

    X: DataFrame = df_merged.drop(columns=['target'])  # features used for training
    y: DataFrame = df_merged[label_col]

    # Dataset is imbalanced -> make sure that the stratify parameter is set
    X_combined, X_test, y_combined, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.1, random_state=42,
                                                      stratify=y_combined)

    # Recombine features and labels so the encoder sees one frame per split
    train_data = concat([X_train, y_train], axis=1)
    valid_data = concat([X_val, y_val], axis=1)
    test_data = concat([X_test, y_test], axis=1)

    cate_cols = ['language']
    ord_encoder: ordinal.OrdinalEncoder = ordinal.OrdinalEncoder(cols=cate_cols)

    train_x, train_y = encode_csv(train_data, ord_encoder, label_col)
    # Validation and test splits are encoded but not yet used for evaluation
    valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, "transform")
    test_x, test_y = encode_csv(test_data, ord_encoder, label_col, "transform")

    logistic_regression_model = LogisticRegression()
    logistic_regression_model.fit(train_x, train_y)

    return logistic_regression_model, ord_encoder


def generate_logistic_regression_recommendations(target_user: str, df_non_embedded: DataFrame,
                                                 df_embedded: DataFrame, number_of_recommendations: int = 10) -> list:
    """
    Generates recommendations using the logistic regression model.

    Args:
        target_user (str): The target user for whom recommendations are generated.
        df_non_embedded (DataFrame): The non-embedded data frame containing the features.
        df_embedded (DataFrame): The embedded data frame containing the features.
        number_of_recommendations (int, optional): The number of recommendations to generate. Defaults to 10.

    Returns:
        list: A list of recommendations, each a tuple of (repo id, owner user, prediction score).
    """
    # Preprocess data
    label_col: str = 'target'
    df_merged, starred_or_owned_by_user = preprocess_data(df_embedded, df_non_embedded, label_col, target_user)

    df_training_ready: DataFrame = df_merged.drop(columns=['id', 'owner_user'])

    # Train logistic regression model
    logistic_regression_model: LogisticRegression
    ord_encoder: ordinal.OrdinalEncoder
    logistic_regression_model, ord_encoder = train_logistic_regression_model(df_training_ready, label_col)

    # Make predictions for all repos
    full_dataset_x, full_dataset_y = encode_csv(df_training_ready, ord_encoder, label_col, "transform")
    all_preds = logistic_regression_model.predict_proba(full_dataset_x)[:, 1]

    # Sort predictions, highest score first
    top_indices = argsort(all_preds)[::-1]

    # Collect the top recommendations, skipping repos already starred or owned by the user
    recommendations: list = []
    counter: int = 0
    for index in top_indices:
        if counter == number_of_recommendations:
            break
        if df_merged.iloc[index]['id'] in starred_or_owned_by_user:
            continue
        counter += 1
        recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index]))

    return recommendations
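A minimal usage sketch tying this model to the Redis cache read by `frontend/recommender/app.py`. The username, and the step of storing results under a per-user key, are assumptions rather than part of this diff; the Drive folder IDs are the ones used in `fill_redis.py`:
```
import json
import redis
from codecompasslib.models.lightgbm_model import load_data
from codecompasslib.models.lr_model import generate_logistic_regression_recommendations

# Assumed example username; any user present in the dataset works
target_user = "some_github_user"

df_non_embedded, df_embedded = load_data('1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd',
                                         '139wi78iRzhwGZwxmI5WALoYocR-Rk9By')
recs = generate_logistic_regression_recommendations(target_user, df_non_embedded, df_embedded,
                                                    number_of_recommendations=10)

# Cast numpy scalars to plain floats so json.dumps accepts them; the result
# matches the [[id, owner, score], ...] shape seen in recommendations.json
payload = json.dumps([[float(repo_id), owner, float(score)] for repo_id, owner, score in recs])
redis.Redis(host='localhost', port=6379, db=0).set(target_user, payload)
```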
62 changes: 27 additions & 35 deletions frontend/recommender/app.py
@@ -1,55 +1,45 @@
 import os
 import sys
 import streamlit as st
+import redis
+import json
+import pandas as pd
 
-# Navigate to root directory
-root_dir = os.path.dirname(os.path.abspath(__file__))
-project_dir = os.path.dirname(root_dir)
-real_project_dir = os.path.dirname(project_dir)
-
-# Add project directory to Python path
-sys.path.insert(0, real_project_dir)
-
-# Import necessary functions from codecompasslib
-from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data
-
-# Function to load cached data
-def load_cached_data():
-    # Check if data is already stored in session state
-    if 'cached_data' not in st.session_state:
-        with st.spinner('Fetching data from the server...'):
-            # Load data
-            full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'
-            full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'
-            st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id)
-    return st.session_state.cached_data
+# Function to retrieve recommendations from Redis
+def retrieve_recommendations_from_redis(target_user):
+    try:
+        # Connect to Redis
+        redis_client = redis.Redis(host='localhost', port=6379, db=0)
+
+        # Retrieve recommendations from Redis
+        recommendations = redis_client.get(target_user)
+
+        if recommendations:
+            return json.loads(recommendations.decode("utf-8"))
+        else:
+            return None
+    except Exception as e:
+        st.error(f"Could not fetch recommendations from Redis: {e}")
+        return None
 
 def main():
-    # Load the data
-    df_non_embedded, df_embedded = load_cached_data()
-
     # Set app title
     st.title('GitHub Repo Recommendation System')
 
     # Input for target user
     target_user = st.text_input("Enter the target user's username:")
 
-    # Button to get recommendations
-    if st.button('Get Recommendations'):
-        # Check if user exists in the dataset
-        if target_user not in df_embedded['owner_user'].values:
-            st.error("User not found in the dataset. Please enter a valid username.")
-        else:
-            # Generate recommendations
-            with st.spinner('Generating recommendations...'):
-                recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
-
-            # Display recommendations
-            st.subheader("Recommendations")
-            for index, repo in enumerate(recommendations):
-                name = df_non_embedded[df_non_embedded['id'] == repo[0]]['name'].values[0]
-                description = df_non_embedded[df_non_embedded['id'] == repo[0]]['description'].values[0]
+    # Button to retrieve and display recommendations
+    if st.button('Retrieve and Display Recommendations'):
+        # Retrieve recommendations from Redis
+        retrieved_recommendations = retrieve_recommendations_from_redis(target_user)
+
+        if retrieved_recommendations:
+            # Display recommendations
+            st.subheader("Recommendations")
+            for index, repo in enumerate(retrieved_recommendations):
+                name = repo[1]  # Assuming the second element in the recommendation tuple is the repo name
+                description = ""  # You may need to fetch description from Redis or another source
                 link = f"https://github.com/{repo[1]}/{name}"
 
                 # Display recommendation details in a card-like format with shadow
@@ -60,6 +50,8 @@ def main():
                     <a href="{link}" target="_blank" style="color: #0366d6; text-decoration: none;">View on GitHub</a>
                 </div>
                 """, unsafe_allow_html=True)
+        else:
+            st.error("No recommendations found for the target user.")
 
 if __name__ == "__main__":
     main()
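As a side note, `retrieve_recommendations_from_redis` could skip the manual byte decoding: redis-py's client accepts `decode_responses=True`, which makes `get()` return `str` instead of `bytes`. A small sketch of the alternative:
```
import json
import redis

# decode_responses=True returns str values, so no .decode("utf-8") is needed
redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

raw = redis_client.get("some_github_user")  # assumed per-user key
recommendations = json.loads(raw) if raw else None
```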
63 changes: 63 additions & 0 deletions frontend/recommender/fill_redis.py
@@ -0,0 +1,63 @@
import os
import sys
import redis
import json
import pandas as pd

# Navigate to root directory
root_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(root_dir)
real_project_dir = os.path.dirname(project_dir)

# Add project directory to Python path
sys.path.insert(0, real_project_dir)

# Import necessary functions from codecompasslib
from codecompasslib.models.lightgbm_model import load_data


def fill_redis_with_data():
    try:
        # Load the data
        full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'
        full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'
        df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)

        print("Checkpoint 1")

        # Serialize the DataFrames to JSON strings (records orientation).
        # Going via to_csv and json.loads would fail: json.loads cannot parse
        # CSV text, and redis-py's set() needs a str/bytes value anyway.
        df_non_embedded_json = df_non_embedded.to_json(orient="records")
        df_embedded_json = df_embedded.to_json(orient="records")

        # Print a short preview of the serialized data
        print("\nPreview of the serialized JSON")
        print(df_non_embedded_json[:200])

        print("Checkpoint 2")

        # Connect to Redis
        redis_client = redis.Redis(host='localhost', port=6379, db=0)

        # Store the JSON strings in Redis
        print("Storing data in Redis...")

        print("Not embedded df saving ...")
        redis_client.set('df_non_embedded', df_non_embedded_json)

        print("Embedded df saving ...")
        redis_client.set('df_embedded', df_embedded_json)

        print("Data stored in Redis successfully.")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    fill_redis_with_data()
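For completeness, a sketch of reading the cached frames back out of Redis, assuming `fill_redis.py` has already run against a local server:
```
import json
import pandas as pd
import redis

r = redis.Redis(host='localhost', port=6379, db=0)

raw = r.get('df_non_embedded')
if raw is not None:
    # The value is a JSON string in records orientation; rebuild the DataFrame
    df_non_embedded = pd.DataFrame(json.loads(raw))
    print(df_non_embedded.head(3))
```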
1 change: 1 addition & 0 deletions recommendations.json
@@ -0,0 +1 @@
[[15500812.0, "matthewbdaly", 1.0000000036274914e-15], [7778661.0, "Skellington-zz", 1.0000000036274914e-15], [41188271.0, "marvin-zhao", 1.0000000036274914e-15], [41188345.0, "marvin-zhao", 1.0000000036274914e-15], [42033762.0, "marvin-zhao", 1.0000000036274914e-15], [160783310.0, "marvin-zhao", 1.0000000036274914e-15], [13188377.0, "marvin-zhao", 1.0000000036274914e-15], [32130169.0, "marvin-zhao", 1.0000000036274914e-15], [191329936.0, "marvin-zhao", 1.0000000036274914e-15], [4689581.0, "marvin-zhao", 1.0000000036274914e-15]]
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,5 +18,6 @@ openai >= 1.14.3
category-encoders==2.6.3
sentence_transformers==2.6.0
lightgbm==4.3.0
redis

