diff --git a/README.md b/README.md
index b253ed1..5e31266 100644
--- a/README.md
+++ b/README.md
@@ -56,11 +56,24 @@ pip install -r requirements.txt
 - `openAI_key`: Your OpenAI API key.
 
 ### **4.** Run:
-Chatbot
+**Chatbot:**
 ```
 streamlit run frontend/chatbot/app.py
 ```
-Recommender:
+**Recommender:**
+```
+redis-server
+```
+Check that the server is running (outputs `PONG` if connected):
+```
+redis-cli ping
+```
+Inspect the stored keys with:
+```
+redis-cli
+
+KEYS *
+```
 ```
 streamlit run frontend/recommender/app.py
 ```
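If you would rather verify the connection from Python than from the CLI, a minimal sketch (assuming the default `localhost:6379` instance used throughout this PR):

```python
import redis

# Connect to the local Redis instance started with `redis-server`
client = redis.Redis(host='localhost', port=6379, db=0)

# Python equivalent of `redis-cli ping`; returns True when the server is reachable
print(client.ping())
```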
+ """ + + print("Training logistic regression model") + + X: DataFrame = df_merged.drop(columns=['target']) # drop columns not used for training + y: DataFrame = df_merged[label_col] + + # Dataset is imbalanced -> make sure that the stratify parameter is set + X_combined, X_test, y_combined, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y) + X_train, X_val, y_train, y_val = train_test_split(X_combined, y_combined, test_size=0.1, random_state=42, + stratify=y_combined) + + # combine X_train and y_train + train_data = concat([X_train, y_train], axis=1) + valid_data = concat([X_val, y_val], axis=1) + test_data = concat([X_test, y_test], axis=1) + + cate_cols = ['language'] + ord_encoder: ordinal.OrdinalEncoder = ordinal.OrdinalEncoder(cols=cate_cols) + + train_x, train_y = encode_csv(train_data, ord_encoder, label_col) + valid_x, valid_y = encode_csv(valid_data, ord_encoder, label_col, "transform") + test_x, test_y = encode_csv(test_data, ord_encoder, label_col, "transform") + + logistic_regression_model = LogisticRegression() + logistic_regression_model.fit(train_x, train_y) + + return logistic_regression_model, ord_encoder + + +def generate_logistic_regression_recommendations(target_user: str, df_non_embedded: DataFrame, + df_embedded: DataFrame, number_of_recommendations: int = 10) -> list: + """ + Generates recommendations using the logistic regression model. + + Args: + target_user (str): The target user for whom recommendations are generated. + df_non_embedded (DataFrame): The non-embedded data frame containing the features. + df_embedded (DataFrame): The embedded data frame containing the features. + number_of_recommendations (int, optional): The number of recommendations to generate. Defaults to 10. + + Returns: + list: A list of recommendations, each containing the repository name, owner user, and prediction score. 
+ """ + # Preprocess data + label_col: str = 'target' + df_merged, starred_or_owned_by_user = preprocess_data(df_embedded, df_non_embedded, label_col, target_user) + + df_training_ready: DataFrame = df_merged.drop(columns=['id', 'owner_user']) + + logistic_regression_model: LogisticRegression + ord_encoder: ordinal.OrdinalEncoder + # Train logistic regression model + logistic_regression_model, ord_encoder = train_logistic_regression_model(df_training_ready, label_col) + + # Make predictions for all repos + full_dataset_x, full_dataset_y = encode_csv(df_training_ready, ord_encoder, label_col, "transform") + all_preds = logistic_regression_model.predict_proba(full_dataset_x)[:, 1] + + # Get sorted predictions with the highest one first + top_indices = argsort(all_preds)[::-1] + + # Get the top recommendations + recommendations: list = [] + counter: int = 0 + for index in top_indices: + if counter == number_of_recommendations: + break + # disregard if the repo is already starred by the user + if df_merged.iloc[index]['id'] in starred_or_owned_by_user: + continue + else: + counter += 1 + recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index])) + + return recommendations diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py index 68b8cfc..22733a2 100644 --- a/frontend/recommender/app.py +++ b/frontend/recommender/app.py @@ -1,55 +1,45 @@ import os import sys import streamlit as st +import redis +import json import pandas as pd -# Navigate to root directory -root_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(root_dir) -real_project_dir = os.path.dirname(project_dir) +# Function to retrieve recommendations from Redis +def retrieve_recommendations_from_redis(target_user): + try: + # Connect to Redis + redis_client = redis.Redis(host='localhost', port=6379, db=0) -# Add project directory to Python path -sys.path.insert(0, real_project_dir) + # Retrieve recommendations from Redis + recommendations = redis_client.get(target_user) -# Import necessary functions from codecompasslib -from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data - -# Function to load cached data -def load_cached_data(): - # Check if data is already stored in session state - if 'cached_data' not in st.session_state: - with st.spinner('Fetching data from the server...'): - # Load data - full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd' - full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By' - st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id) - return st.session_state.cached_data + if recommendations: + return json.loads(recommendations.decode("utf-8")) + else: + return None + except Exception as e: + st.error(f"Could not fetch recommendations from Redis: {e}") + return None def main(): - # Load the data - df_non_embedded, df_embedded = load_cached_data() - # Set app title st.title('GitHub Repo Recommendation System') # Input for target user target_user = st.text_input("Enter the target user's username:") - # Button to get recommendations - if st.button('Get Recommendations'): - # Check if user exists in the dataset - if target_user not in df_embedded['owner_user'].values: - st.error("User not found in the dataset. 
Please enter a valid username.") - else: - # Generate recommendations - with st.spinner('Generating recommendations...'): - recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10) - + # Button to retrieve and display recommendations + if st.button('Retrieve and Display Recommendations'): + # Retrieve recommendations from Redis + retrieved_recommendations = retrieve_recommendations_from_redis(target_user) + + if retrieved_recommendations: # Display recommendations st.subheader("Recommendations") - for index, repo in enumerate(recommendations): - name = df_non_embedded[df_non_embedded['id'] == repo[0]]['name'].values[0] - description = df_non_embedded[df_non_embedded['id'] == repo[0]]['description'].values[0] + for index, repo in enumerate(retrieved_recommendations): + name = repo[1] # Assuming the second element in the recommendation tuple is the repo name + description = "" # You may need to fetch description from Redis or another source link = f"https://github.com/{repo[1]}/{name}" # Display recommendation details in a card-like format with shadow @@ -60,6 +50,8 @@ def main(): View on GitHub """, unsafe_allow_html=True) + else: + st.error("No recommendations found for the target user.") if __name__ == "__main__": main() diff --git a/frontend/recommender/fill_redis.py b/frontend/recommender/fill_redis.py new file mode 100644 index 0000000..8c759d7 --- /dev/null +++ b/frontend/recommender/fill_redis.py @@ -0,0 +1,63 @@ +import os +import sys +import redis +import json +import pandas as pd + +# Navigate to root directory +root_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(root_dir) +real_project_dir = os.path.dirname(project_dir) + +# Add project directory to Python path +sys.path.insert(0, real_project_dir) + +# Import necessary functions from codecompasslib +from codecompasslib.models.lightgbm_model import load_data + + +def fill_redis_with_data(): + try: + # Load the data + full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd' + full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By' + df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id) + + print("Checkpoint 1") + # Convert DataFrames to CSV + df_non_embedded_csv = df_non_embedded.to_csv(index=False) + df_embedded_csv = df_embedded.to_csv(index=False) + + #print first 3 rows of the csv + print("\nFirst 3 rows of the csv") + print(df_non_embedded_csv[:3]) + + # Convert CSV to JSON + print("Checkpoint 2") + df_non_embedded_json = json.loads(df_non_embedded_csv) + df_embedded_json = json.loads(df_embedded_csv) + + #print first 3 rows of the json + print("\nFirst 3 rows of the json") + print(df_non_embedded_json[:3]) + + print("Checkpoint 3") + + # Connect to Redis + redis_client = redis.Redis(host='localhost', port=6379, db=0) + + # Store the JSON strings in Redis + print("Storing data in Redis...") + + print("Not embedded df saving ...") + redis_client.set('df_non_embedded', df_non_embedded_json) + + print("Embedded df saving ...") + redis_client.set('df_embedded', df_embedded_json) + + print("Data stored in Redis successfully.") + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + fill_redis_with_data() diff --git a/recommendations.json b/recommendations.json new file mode 100644 index 0000000..46e5595 --- /dev/null +++ b/recommendations.json @@ -0,0 +1 @@ +[[15500812.0, "matthewbdaly", 1.0000000036274914e-15], [7778661.0, "Skellington-zz", 
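fill_redis.py only writes the two frames; the consumer side is not shown in this diff. Reading them back might look like this sketch (assuming the records-oriented JSON written above):

```python
import json

import pandas as pd
import redis

redis_client = redis.Redis(host='localhost', port=6379, db=0)

# Fetch the JSON string stored by fill_redis.py and rebuild the DataFrame
raw = redis_client.get('df_non_embedded')
df_non_embedded = pd.DataFrame(json.loads(raw))
```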
diff --git a/recommendations.json b/recommendations.json
new file mode 100644
index 0000000..46e5595
--- /dev/null
+++ b/recommendations.json
@@ -0,0 +1 @@
+[[15500812.0, "matthewbdaly", 1.0000000036274914e-15], [7778661.0, "Skellington-zz", 1.0000000036274914e-15], [41188271.0, "marvin-zhao", 1.0000000036274914e-15], [41188345.0, "marvin-zhao", 1.0000000036274914e-15], [42033762.0, "marvin-zhao", 1.0000000036274914e-15], [160783310.0, "marvin-zhao", 1.0000000036274914e-15], [13188377.0, "marvin-zhao", 1.0000000036274914e-15], [32130169.0, "marvin-zhao", 1.0000000036274914e-15], [191329936.0, "marvin-zhao", 1.0000000036274914e-15], [4689581.0, "marvin-zhao", 1.0000000036274914e-15]]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f2f4b12..94ca7bb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,5 +18,6 @@ openai >= 1.14.3
 category-encoders==2.6.3
 sentence_transformers==2.6.0
 lightgbm==4.3.0
+redis
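Finally, a quick way to inspect the sample output shipped in recommendations.json:

```python
import json

with open('recommendations.json') as f:
    recs = json.load(f)

# Each entry is (repo id, owner login, prediction score)
for repo_id, owner, score in recs:
    print(f"{int(repo_id)}  {owner}  {score:.3e}")
```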