Spaces:

charlieoneill
/

saerch.ai

Running

App Files Files Community

charlieoneill committed on Aug 1, 2024

Commit

d6eab4f

1 Parent(s): 3187d23

feature families

Browse files

Files changed (2) hide show

.gitignore +3 -1
app.py +382 -27

.gitignore CHANGED Viewed

	@@ -1 +1,3 @@
1	- data/

+data/
+__pycache__
+__pycache__/

app.py CHANGED Viewed

@@ -1,3 +1,132 @@
 import gradio as gr
 import numpy as np
 import json
@@ -11,6 +140,10 @@ import plotly.express as px
 from collections import Counter
 from huggingface_hub import hf_hub_download
 import os
 import os
 print(os.getenv('MODEL_REPO_ID'))
@@ -44,7 +177,15 @@ def download_all_files():
         # "csLG_clean_families_64_9216.json",
         # "astroPH_clean_families_64_9216.json",
         "astroPH_family_analysis_64_9216.json",
-        "csLG_family_analysis_64_9216.json"
     ]
     for file in files_to_download:
@@ -74,9 +215,13 @@ def load_subject_data(subject):
     feature_analysis_path = f"data/{subject}_feature_analysis_results_{k}.json"
     metadata_path = f'data/{subject}_paper_metadata.csv'
     topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
     topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
     families_path = f"data/{subject}_clean_families_{k}_{n_dirs}.json"
     family_analysis_path = f"data/{subject}_family_analysis_{k}_{n_dirs}.json"
     abstract_embeddings = np.load(embeddings_path).astype(np.float32)  # Load float16 and convert to float32
     with open(texts_path, 'r') as f:
@@ -86,6 +231,7 @@ def load_subject_data(subject):
     df_metadata = pd.read_csv(metadata_path)
     topk_indices = np.load(topk_indices_path)  # Already in int32, no conversion needed
     topk_values = np.load(topk_values_path).astype(np.float32)
     model_filename = f"{subject}_64_9216.pth"
     model_path = os.path.join("data", model_filename)
@@ -109,6 +255,9 @@ def load_subject_data(subject):
         'df_metadata': df_metadata,
         'topk_indices': topk_indices,
         'topk_values': topk_values,
         'ae': ae,
         'decoder': decoder,
         # 'feature_families': feature_families,
@@ -163,13 +312,15 @@ def get_feature_activations(subject, feature_index, m=5, min_length=100):
 def calculate_co_occurrences(subject, target_index, n_features=9216):
     topk_indices = subject_data[subject]['topk_indices']
     mask = np.any(topk_indices == target_index, axis=1)
     co_occurring_indices = topk_indices[mask].flatten()
     co_occurrences = Counter(co_occurring_indices)
     del co_occurrences[target_index]
-    result = np.zeros(n_features, dtype=int)
     result[list(co_occurrences.keys())] = list(co_occurrences.values())
     return result
 def style_dataframe(df: pd.DataFrame, is_top: bool) -> pd.DataFrame:
@@ -291,10 +442,175 @@ def visualize_feature(subject, index):
         "Co-occurrences": topk_values_co_occurrence
     })
     df_co_occurrences_styled = df_co_occurrences.style.format({
-        "Co-occurrences": "{:.0f}"  # Keep as integer
     })
-    return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2
 # Modify the main interface function
 def create_interface():
@@ -453,7 +769,10 @@ def create_interface():
                             def search_feature_labels(search_text):
                                 if not search_text:
                                     return gr.CheckboxGroup(choices=[])
-                                matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
                                 return gr.CheckboxGroup(choices=matches[:10])
                             feature_search.change(search_feature_labels, inputs=[feature_search], outputs=[feature_matches])
@@ -536,24 +855,24 @@ def create_interface():
                             wrap=True
                         )
-                        gr.Markdown("## Correlated Features")
                         with gr.Row():
                             with gr.Column(scale=1):
-                                gr.Markdown("### Top 5 Correlated Features")
-                                top_correlated = gr.Dataframe(
-                                    headers=["Feature", "Cosine similarity"],
                                     interactive=False
                                 )
                             with gr.Column(scale=1):
-                                gr.Markdown("### Bottom 5 Correlated Features")
-                                bottom_correlated = gr.Dataframe(
-                                    headers=["Feature", "Cosine similarity"],
                                     interactive=False
                                 )
                         with gr.Row():
                             with gr.Column(scale=1):
-                                gr.Markdown("## Top 5 Co-occurring Features")
                                 co_occurring_features = gr.Dataframe(
                                     headers=["Feature", "Co-occurrences"],
                                     interactive=False
@@ -562,10 +881,31 @@ def create_interface():
                                 gr.Markdown(f"## Activation Value Distribution")
                                 activation_dist = gr.Plot()
                         def search_feature_labels(search_text, current_subject):
                             if not search_text:
                                 return gr.CheckboxGroup(choices=[])
-                            matches = [f"{f['label']} ({f['index']})" for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
                             return gr.CheckboxGroup(choices=matches[:10])
                         feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
@@ -576,15 +916,15 @@ def create_interface():
                             # Extract the feature index from the selected feature string
                             feature_index = int(selected_features[0].split('(')[-1].strip(')'))
-                            feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist = visualize_feature(current_subject, feature_index)
                             # Return the visualization results along with empty values for search box and checkbox
-                            return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", []
                         visualize_button.click(
                             on_visualize,
                             inputs=[feature_matches, subject],
-                            outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches]
                         )
                     with gr.Tab("Feature Families"):
@@ -595,19 +935,26 @@ def create_interface():
                             family_matches = gr.CheckboxGroup(label="Matching Feature Families", choices=[])
                             visualize_family_button = gr.Button("Visualize Feature Family")
-                        family_info = gr.Markdown()
                         family_dataframe = gr.Dataframe(
-                            headers=["Feature", "F1 Score", "Pearson Correlation"],
-                            datatype=["markdown", "number", "number"],
                             label="Family and Child Features"
                         )
                         def search_feature_families(search_text, current_subject):
                             family_analysis = subject_data[current_subject]['family_analysis']
                             if not search_text:
                                 return gr.CheckboxGroup(choices=[])
-                            matches = [family['superfeature'] for family in family_analysis if search_text.lower() in family['superfeature'].lower()]
                             return gr.CheckboxGroup(choices=matches[:10])  # Limit to top 10 matches
                         def visualize_feature_family(selected_families, current_subject):
@@ -627,16 +974,20 @@ def create_interface():
                             df_data = [
                                 {
                                     "Feature": f"## {family_data['superfeature']}",
                                     "F1 Score": round(family_data['family_f1'], 2),
-                                    "Pearson Correlation": round(family_data['family_pearson'], 4)
                                 },
                             ]
-                            for name, f1, pearson in zip(family_data['feature_names'], family_data['feature_f1'], family_data['feature_pearson']):
                                 df_data.append({
                                     "Feature": name,
                                     "F1 Score": round(f1, 2),
-                                    "Pearson Correlation": round(pearson, 4)
                                 })
                             df = pd.DataFrame(df_data)
@@ -645,13 +996,17 @@ def create_interface():
                             output += "## Super Reasoning\n"
                             output += f"{family_data['super_reasoning']}\n\n"
-                            return output, df, "", []  # Return empty string for search box and empty list for checkbox
                         family_search.change(search_feature_families, inputs=[family_search, subject], outputs=[family_matches])
                         visualize_family_button.click(
                             visualize_feature_family,
                             inputs=[family_matches, subject],
-                            outputs=[family_info, family_dataframe, family_search, family_matches]
                         )

+# import gradio as gr
+# import numpy as np
+# import json
+# import pandas as pd
+# from openai import OpenAI
+# import yaml
+# from typing import Optional, List, Dict, Tuple, Any
+# from topk_sae import FastAutoencoder
+# import torch
+# import plotly.express as px
+# from collections import Counter
+# from huggingface_hub import hf_hub_download
+# import os
+# import networkx as nx
+# import plotly.graph_objs as go
+# from ast import literal_eval as make_tuple
+# import random
+# import os
+# print(os.getenv('MODEL_REPO_ID'))
+# # Constants
+# EMBEDDING_MODEL = "text-embedding-3-small"
+# d_model = 1536
+# n_dirs = d_model * 6
+# k = 64
+# auxk = 128
+# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# torch.set_grad_enabled(False)
+# # Function to download all necessary files
+# def download_all_files():
+#     files_to_download = [
+#         "astroPH_paper_metadata.csv",
+#         "csLG_feature_analysis_results_64.json",
+#         "astroPH_topk_indices_64_9216_int32.npy",
+#         "astroPH_64_9216.pth",
+#         "astroPH_topk_values_64_9216_float16.npy",
+#         "csLG_abstract_texts.json",
+#         "csLG_topk_values_64_9216_float16.npy",
+#         "csLG_abstract_embeddings_float16.npy",
+#         "csLG_paper_metadata.csv",
+#         "csLG_64_9216.pth",
+#         "astroPH_abstract_texts.json",
+#         "astroPH_feature_analysis_results_64.json",
+#         "csLG_topk_indices_64_9216_int32.npy",
+#         "astroPH_abstract_embeddings_float16.npy",
+#         # "csLG_clean_families_64_9216.json",
+#         # "astroPH_clean_families_64_9216.json",
+#         # "astroPH_family_analysis_64_9216.json",
+#         "csLG_family_analysis_64_9216.json"
+#     ]
+#     for file in files_to_download:
+#         local_path = os.path.join("data", file)
+#         os.makedirs(os.path.dirname(local_path), exist_ok=True)
+#         hf_hub_download(repo_id="charlieoneill/saerch-ai-data", filename=file, local_dir="data")
+#         print(f"Downloaded {file}")
+# # Load configuration and initialize OpenAI client
+# download_all_files()
+# # Load the API key from the environment variable
+# api_key = os.getenv('openai_key')
+# # Ensure the API key is set
+# if not api_key:
+#     raise ValueError("The environment variable 'openai_key' is not set.")
+# # Initialize the OpenAI client with the API key
+# client = OpenAI(api_key=api_key)
+# # Function to load data for a specific subject
+# def load_subject_data(subject):
+#     embeddings_path = f"data/{subject}_abstract_embeddings_float16.npy"
+#     texts_path = f"data/{subject}_abstract_texts.json"
+#     feature_analysis_path = f"data/{subject}_feature_analysis_results_{k}.json"
+#     metadata_path = f'data/{subject}_paper_metadata.csv'
+#     topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
+#     norms_path = f"data/{subject}_norms_{k}_{n_dirs}.npy"
+#     topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
+#     families_path = f"data/{subject}_clean_families_{k}_{n_dirs}.json"
+#     family_analysis_path = f"data/{subject}_family_analysis_{k}_{n_dirs}.json"
+#     nns_32to64 = json.load(open(f"data/{subject}_nns_32to64.json"))
+#     nns_16to32 = json.load(open(f"data/{subject}_nns_16to32.json"))
+#     nns_16to64 = json.load(open(f"data/{subject}_nns_16to64.json"))
+#     abstract_embeddings = np.load(embeddings_path).astype(np.float32)  # Load float16 and convert to float32
+#     with open(texts_path, 'r') as f:
+#         abstract_texts = json.load(f)
+#     with open(feature_analysis_path, 'r') as f:
+#         feature_analysis = json.load(f)
+#     df_metadata = pd.read_csv(metadata_path)
+#     topk_indices = np.load(topk_indices_path)  # Already in int32, no conversion needed
+#     topk_values = np.load(topk_values_path).astype(np.float32)
+#     norms = np.load(norms_path).astype(np.float32)
+#     model_filename = f"{subject}_64_9216.pth"
+#     model_path = os.path.join("data", model_filename)
+#     ae = FastAutoencoder(n_dirs, d_model, k, auxk, multik=0).to(device)
+#     ae.load_state_dict(torch.load(model_path))
+#     ae.eval()
+#     weights = torch.load(model_path)
+#     decoder = weights['decoder.weight'].cpu().numpy()
+#     del weights
+#     with open(family_analysis_path, 'r') as f:
+#         family_analysis = json.load(f)
+#     return {
+#         'abstract_embeddings': abstract_embeddings,
+#         'abstract_texts': abstract_texts,
+#         'feature_analysis': feature_analysis,
+#         'df_metadata': df_metadata,
+#         'topk_indices': topk_indices,
+#         'topk_values': topk_values,
+#         'norms': norms,
+#         'nns_32to64': nns_32to64,
+#         'nns_16to64': nns_16to64,
+#         'ae': ae,
+#         'decoder': decoder,
+#         # 'feature_families': feature_families,
+#         'family_analysis': family_analysis
+#     }
 import gradio as gr
 import numpy as np
 import json
 from collections import Counter
 from huggingface_hub import hf_hub_download
 import os
+import networkx as nx
+import plotly.graph_objs as go
+from ast import literal_eval as make_tuple
+import random
 import os
 print(os.getenv('MODEL_REPO_ID'))
         # "csLG_clean_families_64_9216.json",
         # "astroPH_clean_families_64_9216.json",
         "astroPH_family_analysis_64_9216.json",
+        "csLG_family_analysis_64_9216.json",
+        "csLG_nns_32to64.json",
+        "csLG_nns_16to32.json",
+        "csLG_nns_16to64.json",
+        "astroPH_nns_32to64.json",
+        "astroPH_nns_16to32.json",
+        "astroPH_nns_16to64.json",
+        "csLG_norms_64_9216_float16.npy",
+        "astroPH_norms_64_9216_float16.npy"
     ]
     for file in files_to_download:
     feature_analysis_path = f"data/{subject}_feature_analysis_results_{k}.json"
     metadata_path = f'data/{subject}_paper_metadata.csv'
     topk_indices_path = f"data/{subject}_topk_indices_{k}_{n_dirs}_int32.npy"
+    norms_path = f"data/{subject}_norms_{k}_{n_dirs}_float16.npy"
     topk_values_path = f"data/{subject}_topk_values_{k}_{n_dirs}_float16.npy"
     families_path = f"data/{subject}_clean_families_{k}_{n_dirs}.json"
     family_analysis_path = f"data/{subject}_family_analysis_{k}_{n_dirs}.json"
+    nns_32to64 = json.load(open(f"data/{subject}_nns_32to64.json"))
+    nns_16to32 = json.load(open(f"data/{subject}_nns_16to32.json"))
+    nns_16to64 = json.load(open(f"data/{subject}_nns_16to64.json"))
     abstract_embeddings = np.load(embeddings_path).astype(np.float32)  # Load float16 and convert to float32
     with open(texts_path, 'r') as f:
     df_metadata = pd.read_csv(metadata_path)
     topk_indices = np.load(topk_indices_path)  # Already in int32, no conversion needed
     topk_values = np.load(topk_values_path).astype(np.float32)
+    norms = np.load(norms_path).astype(np.float32)
     model_filename = f"{subject}_64_9216.pth"
     model_path = os.path.join("data", model_filename)
         'df_metadata': df_metadata,
         'topk_indices': topk_indices,
         'topk_values': topk_values,
+        'norms': norms,
+        'nns_32to64': nns_32to64,
+        'nns_16to64': nns_16to64,
         'ae': ae,
         'decoder': decoder,
         # 'feature_families': feature_families,
 def calculate_co_occurrences(subject, target_index, n_features=9216):
     topk_indices = subject_data[subject]['topk_indices']
+    norms = subject_data[subject]['norms']
     mask = np.any(topk_indices == target_index, axis=1)
     co_occurring_indices = topk_indices[mask].flatten()
     co_occurrences = Counter(co_occurring_indices)
     del co_occurrences[target_index]
+    result = np.zeros(n_features, dtype=np.float32)
     result[list(co_occurrences.keys())] = list(co_occurrences.values())
+    result[list(co_occurrences.keys())] /= np.minimum(norms[list(co_occurrences.keys())], norms[target_index])
     return result
 def style_dataframe(df: pd.DataFrame, is_top: bool) -> pd.DataFrame:
         "Co-occurrences": topk_values_co_occurrence
     })
     df_co_occurrences_styled = df_co_occurrences.style.format({
+        "Co-occurrences": "{:.2f}"  # 2 decimal points
     })
+    # Add new code for feature splitting
+    nns_16to64 = subject_data[subject]['nns_16to64']
+    nns_32to64 = subject_data[subject]['nns_32to64']
+    # Get nearest neighbors for 16 and 32
+    #nn_16 = nns_16to64[str(index)]
+    # this is really involved it's a lot easier the other direction
+    nn_16 = []
+    for key in nns_16to64.keys():
+        for match in nns_16to64[key]:
+            if index == match['feature'][0]:
+                nn_16.append([key, float(match['similarity'])])
+    #nn_32 = nns_32to64[str(index)]
+    nn_32 = []
+    for key in nns_32to64.keys():
+        for match in nns_32to64[key]:
+            if index == match['feature'][0]:
+                nn_32.append([key, float(match['similarity'])])
+    # Create dataframes for 16 and 32 nearest neighbors
+    try:
+        df_16 = pd.DataFrame(nn_16, columns=["Feature", "Cosine Similarity"])
+        df_16 = df_16.style.format({"Cosine Similarity": "{:.4f}"})
+    except:
+        df_16 = pd.DataFrame(["No Match"], columns=["Feature"])
+    try:
+        df_32 = pd.DataFrame(nn_32, columns=["Feature", "Cosine Similarity"])
+        df_32 = df_32.style.format({"Cosine Similarity": "{:.4f}"})
+    except:
+        df_32 = pd.DataFrame(["No Match"], columns=["Feature"])
+    return output, styled_top_abstracts, df_top_correlated_styled, df_bottom_correlated_styled, df_co_occurrences_styled, fig2, df_16, df_32
+def create_interactive_directed_graph(family):
+    matrix = np.array(family['matrix'])
+    matrix[matrix < 0.07] = 0
+    densities = family['densities']
+    for i in range(len(densities)):
+        for j in range(len(densities)):
+            if densities[i] < densities[j]:
+                matrix[i][j] = 0
+    G = nx.from_numpy_array(matrix, create_using=nx.DiGraph())
+    num_nodes = len(family['feature_f1'])
+    all_f1s = family['feature_pearson'] + [family['family_pearson']]
+    node_info = {i: {"name": f"{family['feature_names'][i]}", "density": family['densities'][i], "pearson": all_f1s[i]} for i in range(num_nodes)}
+    nx.set_node_attributes(G, node_info)
+    # Create node trace
+    node_x = []
+    node_y = []
+    node_text = []
+    node_size = []
+    node_color = []
+    pos = nx.spring_layout(G, k = np.sqrt(1/num_nodes) * 3)
+    for node in G.nodes():
+        x, y = pos[node]
+        node_x.append(x)
+        node_y.append(y)
+        node_text.append(G.nodes[node]['name'] + "<br>log density: " + str(round(np.log10(G.nodes[node]['density'] + 1e-5), 3)))
+        node_size.append((np.log10(G.nodes[node]['density'] + 1e-5) + 6) * 10)
+        node_color.append(G.nodes[node]['pearson'])
+    node_trace = go.Scatter(
+        x=node_x, y=node_y,
+        mode='markers',
+        hoverinfo='text',
+        marker=dict(
+            showscale=True,
+            colorscale='purples',
+            size=node_size,  # Set node marker size to node['f1']
+            color=node_color,
+            cmin = 0,
+            cmax = 1,
+            colorbar=dict(
+                thickness=15,
+                title='Pearson Correlation',
+                xanchor='left',
+                titleside='right',
+            ),
+            line_width=2,
+            opacity = 1,),
+            opacity = 1)
+    node_trace.text = node_text
+    # Create edge trace
+    edge_traces = []
+    annotations = []
+    for edge in G.edges():
+        x0, y0 = pos[edge[0]]
+        x1, y1 = pos[edge[1]]
+        weight = matrix[edge[0], edge[1]]
+        # Calculate offset (adjust this value to move arrows further from or closer to nodes)
+        offset = 0.00
+        start_x = x0
+        start_y = y0
+        end_x = x1
+        end_y = y1
+        # # Calculate new start and end points
+        # if start_x > end_x:
+        #     start_x = x0 - offset
+        #     end_x = x0 + offset
+        # else:
+        #     start_x = x0 + offset
+        #     end_x = x1 - offset
+        # if start_y > end_y:
+        #     start_y = y0 - offset
+        #     end_y = y1 + offset
+        # else:
+        #     start_y = y0 + offset
+        #     end_y = y1 - offset
+        edge_trace = go.Scatter(
+            x=[start_x, end_x, None],
+            y=[start_y, end_y, None],
+            line=dict(width=weight * 20, color='#888'),  # Multiply weight by 20 for better visibility
+            hovertext="weight: " + str(round(weight, 3)),  # Set the hover text to the edge weight
+            mode='lines',
+            line_shape='spline',
+            opacity = 0.5,
+        )
+        edge_traces.append(edge_trace)
+        annotation = dict(
+            ax=start_x,
+            ay=start_y,
+            x=end_x,
+            y=end_y,
+            xref='x',
+            yref='y',
+            axref='x',
+            ayref='y',
+            showarrow=True,
+            arrowhead=4,
+            arrowsize=4, #max(min(weight * 3, 0.3), 2),  # Reduced from 30 to 10
+            arrowwidth=1,  # Reduced from 30 to 2
+            arrowcolor='#999',
+            opacity = 1,
+        )
+        annotations.append(annotation)
+    annotation_trace = go.Scatter(x=[], y=[], mode='markers', hoverinfo='none', marker=dict(opacity=0))
+    # Create the figure
+    fig = go.Figure(data=[annotation_trace, *edge_traces, node_trace],
+                    layout=go.Layout(
+                        showlegend=False,
+                        hovermode='closest',
+                        margin=dict(b=20,l=5,r=5,t=40),
+                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)),
+                    )
+    fig.update_xaxes(showline=False, linewidth=0, gridcolor='white')
+    fig.update_yaxes(showline=False, linewidth=0, gridcolor='white')
+    fig.update_layout(
+        plot_bgcolor='white',
+        annotations=annotations,
+    )
+    return fig
 # Modify the main interface function
 def create_interface():
                             def search_feature_labels(search_text):
                                 if not search_text:
                                     return gr.CheckboxGroup(choices=[])
+                                matches = [f for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
+                                matches = sorted(matches, key=lambda x: x['pearson_correlation'], reverse=True)
+                                matches = [f"{f['label']} ({f['index']})" for f in matches]
                                 return gr.CheckboxGroup(choices=matches[:10])
                             feature_search.change(search_feature_labels, inputs=[feature_search], outputs=[feature_matches])
                             wrap=True
                         )
+                        gr.Markdown("## Feature Splitting")
                         with gr.Row():
                             with gr.Column(scale=1):
+                                gr.Markdown("### Best Match in SAE16")
+                                nn_16_table = gr.Dataframe(
+                                    headers=["Feature", "Cosine Similarity"],
                                     interactive=False
                                 )
                             with gr.Column(scale=1):
+                                gr.Markdown("### Best Match in SAE32")
+                                nn_32_table = gr.Dataframe(
+                                    headers=["Feature", "Cosine Similarity"],
                                     interactive=False
                                 )
                         with gr.Row():
                             with gr.Column(scale=1):
+                                gr.Markdown("## Top Co-occurring Features")
                                 co_occurring_features = gr.Dataframe(
                                     headers=["Feature", "Co-occurrences"],
                                     interactive=False
                                 gr.Markdown(f"## Activation Value Distribution")
                                 activation_dist = gr.Plot()
+                        gr.Markdown("## Correlated Features")
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                gr.Markdown("### Top Correlated Features")
+                                top_correlated = gr.Dataframe(
+                                    headers=["Feature", "Cosine similarity"],
+                                    interactive=False
+                                )
+                            with gr.Column(scale=1):
+                                gr.Markdown("### Bottom Correlated Features")
+                                bottom_correlated = gr.Dataframe(
+                                    headers=["Feature", "Cosine similarity"],
+                                    interactive=False
+                                )
                         def search_feature_labels(search_text, current_subject):
                             if not search_text:
                                 return gr.CheckboxGroup(choices=[])
+                            matches = [f for f in subject_data[current_subject]['feature_analysis'] if search_text.lower() in f['label'].lower()]
+                            matches = sorted(matches, key=lambda x: x['pearson_correlation'], reverse=True)
+                            matches = [f"{f['label']} ({f['index']})" for f in matches]
                             return gr.CheckboxGroup(choices=matches[:10])
                         feature_search.change(search_feature_labels, inputs=[feature_search, subject], outputs=[feature_matches])
                             # Extract the feature index from the selected feature string
                             feature_index = int(selected_features[0].split('(')[-1].strip(')'))
+                            feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, nn_16, nn_32 = visualize_feature(current_subject, feature_index)
                             # Return the visualization results along with empty values for search box and checkbox
+                            return feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, "", [], nn_16, nn_32
                         visualize_button.click(
                             on_visualize,
                             inputs=[feature_matches, subject],
+                            outputs=[feature_info, top_abstracts, top_correlated, bottom_correlated, co_occurring_features, activation_dist, feature_search, feature_matches, nn_16_table, nn_32_table]
                         )
                     with gr.Tab("Feature Families"):
                             family_matches = gr.CheckboxGroup(label="Matching Feature Families", choices=[])
                             visualize_family_button = gr.Button("Visualize Feature Family")
                         family_dataframe = gr.Dataframe(
+                            headers=["Feature", "Parent Co-Occurrence", "F1 Score", "Pearson"],
+                            datatype=["markdown", "number", "number", "number"],
                             label="Family and Child Features"
                         )
+                        gr.Markdown("# Family Graph")
+                        graph_plot = gr.Plot(label="Directed Graph")
+                        # family_info = gr.Markdown()
                         def search_feature_families(search_text, current_subject):
                             family_analysis = subject_data[current_subject]['family_analysis']
                             if not search_text:
                                 return gr.CheckboxGroup(choices=[])
+                            matches = [family for family in family_analysis if search_text.lower() in family['superfeature'].lower()]
+                            matches = sorted(matches, key=lambda x: x['family_pearson'], reverse=True)
+                            matches = [family['superfeature'] for family in matches]
+                            matches = list(dict.fromkeys(matches))
                             return gr.CheckboxGroup(choices=matches[:10])  # Limit to top 10 matches
                         def visualize_feature_family(selected_families, current_subject):
                             df_data = [
                                 {
                                     "Feature": f"## {family_data['superfeature']}",
+                                    "Parent Co-Occurrence": 1,
                                     "F1 Score": round(family_data['family_f1'], 2),
+                                    "Pearson": round(family_data['family_pearson'], 4)
                                 },
                             ]
+                            coocs = np.array(family_data['matrix'])[:, -1]
+                            # print(coocs)
+                            for name, cooc, f1, pearson in zip(family_data['feature_names'], coocs, family_data['feature_f1'], family_data['feature_pearson']):
                                 df_data.append({
                                     "Feature": name,
+                                    "Parent Co-Occurrence": round(cooc, 2),
                                     "F1 Score": round(f1, 2),
+                                    "Pearson": round(pearson, 4)
                                 })
                             df = pd.DataFrame(df_data)
                             output += "## Super Reasoning\n"
                             output += f"{family_data['super_reasoning']}\n\n"
+                            graph = create_interactive_directed_graph(family_data)
+                            #return output, df, "", [], graph  # Return empty string for search box and empty list for checkbox
+                            return df, "", [], graph
                         family_search.change(search_feature_families, inputs=[family_search, subject], outputs=[family_matches])
                         visualize_family_button.click(
                             visualize_feature_family,
                             inputs=[family_matches, subject],
+                            #outputs=[family_info, family_dataframe, family_search, family_matches, graph_plot]
+                            outputs=[family_dataframe, family_search, family_matches, graph_plot]
                         )