from brainiak.fcma.preprocessing import prepare_fcma_data,RandomType
from brainiak.fcma.classifier import Classifier
from brainiak.fcma.util import compute_correlation
from brainiak import io
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from mpi4py import MPI
from nxviz.plots import CircosPlot
from itertools import product
from nilearn import image, plotting, datasets
import sys
import logging
import numpy as np
import pandas as pd
import nibabel as nib
import os, glob, subprocess
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt


# --------------------
# Set up directories
# --------------------

cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'simulated_data')  # What is the directory containing data?
suffix = '.nii.gz'  # What is the extension of the data you're loading
mask_file = os.path.join(cur_dir, 'sim_info',  'full_mask.nii.gz')  # What is the path to the whole brain mask
epoch_file = os.path.join(cur_dir, 'simulated_data', 'sim_epoch_file.npy')  # What is the path to the epoch file
left_out_subj = 0  # Which participant (as an integer) are you leaving out for this cv, for the current demo, set it to 0
output_dir = os.path.join(cur_dir, 'FCMA_result')  # What is the path to the folder you want to save this data in

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
# ----------------------------
# Load and Preprocess the data
# ----------------------------

print("Load in the volumes, mask, and labels")
images = io.load_images_from_dir(data_dir, suffix=suffix)
mask = io.load_boolean_mask(mask_file)
epoch_list = io.load_labels(epoch_file)

print("Parse the epoch data for useful dimensions")
epochs_per_subj = epoch_list[0].shape[1]
num_subjs = len(epoch_list)

print("Preprocess the data and prepare for FCMA")
raw_data, _, labels = prepare_fcma_data(images, epoch_list, mask)

# ----------------------------------
# Outer loop - take out traning set 
# for feature selection
# ----------------------------------

print("Enforce leave one out")
start_idx = int(int(left_out_subj) * epochs_per_subj)
end_idx = int(start_idx + epochs_per_subj)

print("Take out the idxs corresponding to all participants but this one")
subsampled_idx = list(set(range(len(labels))) - set(range(start_idx, end_idx)))
labels_subsampled = [labels[i] for i in subsampled_idx]
raw_data_subsampled = [raw_data[i] for i in subsampled_idx]

Load in the volumes, mask, and labels
Parse the epoch data for useful dimensions
Preprocess the data and prepare for FCMA
Enforce leave one out
Take out the idxs corresponding to all participants but this one


print(f"Label count: {len(labels_subsampled)}")
print(f"Epoch (block) count: {len(raw_data_subsampled)}")
print(f"Shape of each epoch (block): {raw_data_subsampled[0].shape}")

Label count: 140
Epoch (block) count: 140
Shape of each epoch (block): (15, 432)


# -----------------------------
# Enlarging the feature space 
# -----------------------------
raw_data_subsampled_mat = [np.corrcoef(sample.T) for sample in raw_data_subsampled]


# ------------------------------------------
# Inner loop leave one out cross validation
# ------------------------------------------
vox_score = []
# for every voxel in the brain
for i in range(raw_data_subsampled_mat[0].shape[0]):
    
    # A) extract the corresponding row from each subjects' correlation matrix 
    raw_data_sub_selected_vox = [sample[i,] for sample in raw_data_subsampled_mat]

    # B) Perform the inner loop leave-one-out cross validation
    kf = KFold(n_splits = num_subjs - 1)
    score = []
    for train_ind, test_ind in kf.split(labels_subsampled):

        # For each fold, get training and validation set 
        vox_train, vox_test = [raw_data_sub_selected_vox[ind] for ind in train_ind], [raw_data_sub_selected_vox[ind] for ind in test_ind]
        lab_train, lab_test = [labels_subsampled[ind] for ind in train_ind], [labels_subsampled[ind] for ind in test_ind]

        # set up the model
        svm_clf = SVC(kernel='linear', shrinking=False, C=1)

        # train the model
        svm_clf.fit(vox_train, lab_train)

        # test the model
        predict = svm_clf.predict(vox_test)
        
        # C) A prediction accuracy value for this selected voxel could be computed for each fold
        score.append(svm_clf.score(vox_test, lab_test))
    
    # D) Final prediction accuracy for this voxel
    vox_score.append(np.mean(score))


# rank all voxels based on their prediction accuracies, the higher the acc, the lower the rank 
vox_dic = {vid:score for vid, score in zip(range(len(vox_score)), vox_score)}
vox_rank = sorted(vox_dic, key=vox_dic.get, reverse=True)

print(f"The length of the prediction accuracy list is {len(vox_score)}")
print(f"The top ranked 10 voxel IDs are {vox_rank[0:10]}")

The length of the prediction accuracy list is 432
The top ranked 10 voxel IDs are [234, 324, 382, 10, 8, 291, 17, 39, 78, 252]


# Use subprocess to run a python script with MPI. This should work on a cluster or local computer. 
for left_sid in range(num_subjs):
    subprocess.run(f"mpirun -np 2 python3 ./FCMA_script/fcma_voxel_selection_cv.py {data_dir} {suffix} {mask_file} {epoch_file} {left_sid} {output_dir}", shell = True)
    print(f"Subject {num_subjs} done")


[os.path.basename(x) for x in glob.glob(os.path.join(output_dir, 'fc_no0*'))]

['fc_no0_result_seq.nii.gz',
 'fc_no0_result_score.nii.gz',
 'fc_no0_result_list.txt']


with open(output_dir + '/fc_no0_result_list.txt', 'r') as f:
    line = f.readlines()
[l.split(' ')[0] for l in line[0:10]]

['234', '252', '278', '17', '382', '39', '291', '314', '119', '10']


# make top n masks
for n in [10,20,30]:
    top_n = 'top_' + str(n)
    #subprocess.run("module load fsl", shell = True)
    subprocess.run(f"bash ./FCMA_script/make_top_voxel_mask.sh {output_dir} {n} {os.path.join(output_dir, top_n)}", shell = True)


# ------------------------------------------
# top n feature mask from the training set 
# ------------------------------------------
# path to 3 top feature masks 
top_n_mask_files = {os.path.join(output_dir, "top_10/fc_no0_result_seq_top10.nii.gz") : 'TOP 10 Voxels',
                    os.path.join(output_dir, "top_20/fc_no0_result_seq_top20.nii.gz") : 'TOP 20 Voxels',
                    os.path.join(output_dir, "top_30/fc_no0_result_seq_top30.nii.gz") : 'TOP 30 Voxels'}

# for each top feature mask: 
for top_n_mask_path in top_n_mask_files.keys():
    
    # ----------------
    # preprocess data 
    # ----------------
    # Load in the volumes, mask and labels
    images = io.load_images_from_dir(data_dir, suffix=suffix)
    top_n_mask = io.load_boolean_mask(top_n_mask_path)
    epoch_list = io.load_labels(epoch_file)

    # Parse the epoch data for useful dimensions
    epochs_per_subj = epochs_per_subj = epoch_list[0].shape[1]
    num_subjs = len(epoch_list)

    # Prepare the data
    int_data, _, labels = prepare_fcma_data(images, epoch_list, top_n_mask)

    # -------------------
    # Outer loop testing
    # -------------------

    # What indexes pick out the left out participant?
    start_idx = int(int(left_out_subj) * epochs_per_subj)
    end_idx = int(start_idx + epochs_per_subj)

    # Take out the idxs corresponding to all participants but this one
    training_idx = list(set(range(len(labels))) - set(range(start_idx, end_idx)))
    testing_idx = list(range(start_idx, end_idx))

    # Pull out the data
    int_data_training = [int_data[i] for i in training_idx]
    int_data_testing = [int_data[i] for i in testing_idx]

    # Pull out the labels
    labels_training = [labels[i] for i in training_idx]
    labels_testing = [labels[i] for i in testing_idx]

    # Prepare the data to be processed efficiently (albeit in a less easy to follow way)
    rearranged_int_data = int_data_training + int_data_testing
    rearranged_labels = labels_training + labels_testing
    num_training_samples = epochs_per_subj * (num_subjs - 1)

    # Set up data so that the internal mask is correlated with the internal mask
    corr_obj = list(zip(rearranged_int_data, rearranged_int_data))

    # no shrinking, set C=1
    svm_clf = SVC(kernel='precomputed', shrinking=False, C=1)

    clf = Classifier(svm_clf, epochs_per_subj=epochs_per_subj)

    # Train the model on the training data
    clf.fit(corr_obj, rearranged_labels, num_training_samples)

    # Test on the testing data
    predict = clf.predict()

    # ---------------------
    # Print out the results
    # ---------------------
    
    print(f'--{top_n_mask_files[top_n_mask_path]} RESULTS--')
    print(f"Top feature size: {top_n_mask_files[top_n_mask_path]}")
    print(clf.decision_function())
    print(clf.predict())

    # How often does the prediction match the target
    num_correct = (np.asanyarray(predict) == np.asanyarray(labels_testing)).sum()


    # Report accuracy
    print( 'When leaving subject %d out for testing using the %s mask for an intrinsic correlation, the accuracy is %d / %d = %.2f' %
        (int(left_out_subj), top_n_mask_path,  num_correct, epochs_per_subj, num_correct / epochs_per_subj))
    print('---------------------------------------------------------------------------------------------------')

--TOP 10 Voxels RESULTS--
Top feature size: TOP 10 Voxels
[ 0.38126299 -2.41850781  0.45284516 -1.80357795  0.17842044 -0.77340787
  1.75422849  1.86623616  3.07343915 -3.13254844 -1.45388323 -0.41695132
  0.44155523  0.22353497 -0.83062927  0.7705409   0.99191314 -0.12486879
 -1.44480709 -0.06320189]
[1 0 1 0 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 0]
When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_10/fc_no0_result_seq_top10.nii.gz mask for an intrinsic correlation, the accuracy is 8 / 20 = 0.40
---------------------------------------------------------------------------------------------------
--TOP 20 Voxels RESULTS--
Top feature size: TOP 20 Voxels
[-8.86536867e-01  1.07148090e-01 -8.85313201e-01 -6.78471999e-01
  2.52912073e-01 -1.22895096e+00 -1.69236952e-04 -1.41433092e-01
  3.92896756e-02 -1.17931980e+00  5.94036382e-02  1.80326335e+00
  3.25205485e-01  4.20668033e-01 -2.49353493e-01  9.49864680e-01
 -8.31116802e-02 -5.89936394e-01  1.70349872e+00  1.34718994e-01]
[0 1 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 1]
When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_20/fc_no0_result_seq_top20.nii.gz mask for an intrinsic correlation, the accuracy is 14 / 20 = 0.70
---------------------------------------------------------------------------------------------------
--TOP 30 Voxels RESULTS--
Top feature size: TOP 30 Voxels
[-1.65670387e-01 -3.42620453e-01 -6.07346123e-01 -1.22227886e-01
  7.48107093e-02 -8.58807325e-01 -5.63491134e-01 -1.22717309e-01
 -6.08503929e-01 -3.74452274e-01  3.77464014e-01  5.14801342e-01
  9.31709915e-01  4.56100009e-01 -8.34279388e-01  9.91251666e-01
 -3.33540780e-01 -1.08481783e-03  1.46670936e+00 -6.37847517e-02]
[0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0]
When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_30/fc_no0_result_seq_top30.nii.gz mask for an intrinsic correlation, the accuracy is 15 / 20 = 0.75
---------------------------------------------------------------------------------------------------


# run classification for each left out subject for each top-n-mask size
for n, left_sid in product([10,20,30],range(8)):
    
    top_n_mask_file_path = os.path.join(output_dir, f'top_{n}', f'fc_no{left_sid}_result_seq_top{n}.nii.gz')
    results_path = os.path.join(output_dir, f'top_{n}')
    
    subprocess.run(f"mpirun -np 2 python3 ./FCMA_script/fcma_classify.py {data_dir} {suffix} {top_n_mask_file_path} {epoch_file} {left_sid} {results_path}", shell = True) 
    
    print(f"Left out subject {left_sid}, Top {n} voxel mask, Classification done")

Left out subject 0, Top 10 voxel mask, Classification done
Left out subject 1, Top 10 voxel mask, Classification done
Left out subject 2, Top 10 voxel mask, Classification done
Left out subject 3, Top 10 voxel mask, Classification done
Left out subject 4, Top 10 voxel mask, Classification done
Left out subject 5, Top 10 voxel mask, Classification done
Left out subject 6, Top 10 voxel mask, Classification done
Left out subject 7, Top 10 voxel mask, Classification done
Left out subject 0, Top 20 voxel mask, Classification done
Left out subject 1, Top 20 voxel mask, Classification done
Left out subject 2, Top 20 voxel mask, Classification done
Left out subject 3, Top 20 voxel mask, Classification done
Left out subject 4, Top 20 voxel mask, Classification done
Left out subject 5, Top 20 voxel mask, Classification done
Left out subject 6, Top 20 voxel mask, Classification done
Left out subject 7, Top 20 voxel mask, Classification done
Left out subject 0, Top 30 voxel mask, Classification done
Left out subject 1, Top 30 voxel mask, Classification done
Left out subject 2, Top 30 voxel mask, Classification done
Left out subject 3, Top 30 voxel mask, Classification done
Left out subject 4, Top 30 voxel mask, Classification done
Left out subject 5, Top 30 voxel mask, Classification done
Left out subject 6, Top 30 voxel mask, Classification done
Left out subject 7, Top 30 voxel mask, Classification done


# write a function to extract left-out subject id and the classification acc. 
def extract_clf_acc(base_dir, top_num, num_subjs):
    
    import os
    import numpy as np
    import pandas as pd
    path = os.path.join(base_dir, top_num, 'classify_result.txt')

    # read each line, from which extract the left out subject id and the clf acc
    with open(path, "r") as file:
        lines = file.readlines()
    loo_id = []
    acc = []
    for line in lines: 
        id_start_index = line.index('_no') + len('_no')
        id_end_index = line.index('_result_seq')
        id = int(line[id_start_index:id_end_index])
        loo_id.append(id)

        acc_start_index = line.index(': ') + 2
        acc_end_index = line.index('\n')
        score = float(line[acc_start_index:acc_end_index])
        acc.append(score)

    # write a dataframe   
    colnames = ['loo_id','clf_acc','top_num']
    df = pd.DataFrame(index=range(len(loo_id)), columns=colnames)
    df['loo_id'] = loo_id 
    df['clf_acc'] = acc
    df['top_num'] = np.repeat(top_num, len(loo_id)).tolist()
        
    return df

# generate df and concatenate them together
df_list = [extract_clf_acc(output_dir,'top_10', num_subjs),
           extract_clf_acc(output_dir,'top_20', num_subjs),
           extract_clf_acc(output_dir,'top_30', num_subjs)]
final_df = pd.concat(df_list).reset_index(drop=True)

# a quick look at the df 
ax = sns.pointplot(x="top_num", y="clf_acc", data=final_df)
ax.set_title('Classification accuracy for each top-n-mask')
ax.set_ylabel('Classification accuracy')
ax.set_xlabel('Top n mask')
print("The FCMA classification accuracy when using the functional connectivity pattern of \n voxels within each subject's top n mask to predict the current task condition (A or B) ")

The FCMA classification accuracy when using the functional connectivity pattern of 
 voxels within each subject's top n mask to predict the current task condition (A or B)


print("The cherry-picked 432 voxel brain that consists of two brain regions")
plotting.view_img(os.path.join(cur_dir, 'sim_info', 'full_mask.nii.gz'), title = "The brain mask", colorbar = False)

The cherry-picked 432 voxel brain that consists of two brain regions


print("The top 30 voxels proportion map")
plotting.view_img(os.path.join(output_dir, 'top_30','prop_top30.nii.gz'), title = "Top 30 voxels")

The top 30 voxels proportion map


# Load in the data 
epoch_data = raw_data[0] # Just load a single subject and a single epoch
mask_top_n_file = os.path.join(output_dir, 'top_30/fc_no0_result_seq_top30.nii.gz')
mask_top_n_nii = nib.load(mask_top_n_file)  # Load the mask that leaves that participant out
mask_top_n = mask_top_n_nii.get_fdata()

# Convert the top n mask into a vector with the same number of elements as the whole brain
mask_vec = mask.reshape(np.prod(mask.shape))
mask_top_vec = mask_top_n.reshape(np.prod(mask_top_n.shape))[mask_vec]

# Modify the epoch data, for visulization purpose
epoch_data[:,np.where(mask_top_vec==0)] = 1

# Make the data c continguous 
epoch_data_masked = np.ascontiguousarray(epoch_data.T, dtype=np.float32)

# Create the internal correlation
epoch_corr = compute_correlation(epoch_data_masked, epoch_data_masked)

# Load the two simulated ROIs
roiA = nib.load(os.path.join(cur_dir, 'sim_info/ROI_A_mpfc.nii.gz')).get_fdata()
roiB = nib.load(os.path.join(cur_dir, 'sim_info/ROI_B_precuenus.nii.gz')).get_fdata()

# Pull out the coordinates of the mask (in numpy space)
coord_x, coord_y, coord_z = np.where(mask == 1)

# %matplotlib inline

# What is the (absolute) correlation threshold
threshold = 0.8

# Preset the graph
G = nx.Graph()

# Create the edge list
nodelist = []
edgelist = []
for row_counter in range(epoch_corr.shape[0]):
    nodelist.append(str(row_counter))  # Set up the node names
    
    for col_counter in range(epoch_corr.shape[1]):
        
        # Determine whether to include the edge based on whether it exceeds the threshold
        if (abs(epoch_corr[row_counter, col_counter]) > threshold) and (abs(epoch_corr[row_counter, col_counter]) < 1):
            # Add a tuple specifying the voxel pairs being compared and the weight of the edge
            edgelist.append((str(row_counter), str(col_counter), {'weight': epoch_corr[row_counter, col_counter]}))
        
# Create the nodes in the graph
G.add_nodes_from(nodelist)

# Add the edges
G.add_edges_from(edgelist)

for n, d in G.nodes(data=True):
    
    # Is the voxel in roiA or B
    if roiA[coord_x[int(n)],coord_y[int(n)],coord_z[int(n)]] == 1:
        G.nodes[n]['grouping'] = 'roiA (mPFC)'

    elif roiB[coord_x[int(n)],coord_y[int(n)],coord_z[int(n)]] == 1:
        G.nodes[n]['grouping'] = 'roiB (Precuneus)'


c = CircosPlot(graph=G, node_grouping='grouping', node_color='grouping', group_label_position='middle',figsize=(10,6))
c.draw()
plt.title('Circos plot of an epoch data')
print("The figure plots all 432 voxels across two ROIs; \nand highlighted how the top 30 voxels are functionally connected, both within and across ROIs.")

The figure plots all 432 voxels across two ROIs; 
and highlighted how the top 30 voxels are functionally connected, both within and across ROIs.

Full Correlation Matrix Analysis (FCMA) demo¶

Overview¶

Table of Contents¶

Important details of simulated data ¶

1. FCMA Step1: Feature (voxel) selection ¶

1.1 The goal ¶

1.2 Data preprocessing ¶

1.3 Understanding preprocessed data ¶

1.4 Feature selection with simplified implementation ¶

Steps:¶

1.5 Understanding the outputs of feature selection ¶

1.6 Perform feature selection with actual FCMA implementation ¶

1.7 Select n top-performing features ¶

2. FCMA Step 2: Classification using the selected features (voxels) ¶

2.1 The goal ¶

2.2 Classification steps ¶

2.3 Understanding the outputs of classification ¶

2.4 Perform classification for all outer loop folds using FCMA script ¶

3 Results Visualizations ¶

3.1 Examine classification accuracy for different top-n-mask ¶

3.2 Visualize voxels in top-n-masks ¶

3.3 Visualize functional connectivity pattern with circos plot ¶

4. Summary ¶