Brainiak also includes a tutorial for FCMA, which could be find here.
Annotated Bibliography:
link
Describes successful decoding of cognitive tasks using the pattern of correlation between 90 functional regions of interest across the brain.link
Describes how complex biological systems, including neural processes, can be understood through the interactions of their component variables, such as the full correlation matrix in fMRI data.link
Describes optimizations made for Intel Xeon Phi Coprocessors to greatly enhance FCMA performance in cluster-based computing environments.link
Describes how FCMA can be used to discover regions that are involved in cognitive tasks by way of their functional connectivity.1. FCMA Step1: Feature (voxel) selection
1.1 The goal
1.2 Data preprocessing
1.3 Understanding preprocessed data
1.4 Feature selection with simplified implementation
1.5 Understading the outputs of feature selection
1.6 Perform feature selection with actual FCMA implementation
1.7 Select n top performed features
2. FCMA Step 2: Classification using the selected features (voxels)
2.1 The goal
2.2 Classification steps
2.3 Understanding the outputs of classification
2.4 Perform classification for all outer loop folds using FCMA script
3.1 Examine classification accuracy for different top-n-mask
3.2 Visualize voxels in top-n-masks
3.3 Visualize functional connectivity pattern with circos plot
from brainiak.fcma.preprocessing import prepare_fcma_data,RandomType
from brainiak.fcma.classifier import Classifier
from brainiak.fcma.util import compute_correlation
from brainiak import io
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from mpi4py import MPI
from nxviz.plots import CircosPlot
from itertools import product
from nilearn import image, plotting, datasets
import sys
import logging
import numpy as np
import pandas as pd
import nibabel as nib
import os, glob, subprocess
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
# --------------------
# Set up directories
# --------------------
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'simulated_data') # What is the directory containing data?
suffix = '.nii.gz' # What is the extension of the data you're loading
mask_file = os.path.join(cur_dir, 'sim_info', 'full_mask.nii.gz') # What is the path to the whole brain mask
epoch_file = os.path.join(cur_dir, 'simulated_data', 'sim_epoch_file.npy') # What is the path to the epoch file
left_out_subj = 0 # Which participant (as an integer) are you leaving out for this cv, for the current demo, set it to 0
output_dir = os.path.join(cur_dir, 'FCMA_result') # What is the path to the folder you want to save this data in
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# ----------------------------
# Load and Preprocess the data
# ----------------------------
print("Load in the volumes, mask, and labels")
images = io.load_images_from_dir(data_dir, suffix=suffix)
mask = io.load_boolean_mask(mask_file)
epoch_list = io.load_labels(epoch_file)
print("Parse the epoch data for useful dimensions")
epochs_per_subj = epoch_list[0].shape[1]
num_subjs = len(epoch_list)
print("Preprocess the data and prepare for FCMA")
raw_data, _, labels = prepare_fcma_data(images, epoch_list, mask)
# ----------------------------------
# Outer loop - take out traning set
# for feature selection
# ----------------------------------
print("Enforce leave one out")
start_idx = int(int(left_out_subj) * epochs_per_subj)
end_idx = int(start_idx + epochs_per_subj)
print("Take out the idxs corresponding to all participants but this one")
subsampled_idx = list(set(range(len(labels))) - set(range(start_idx, end_idx)))
labels_subsampled = [labels[i] for i in subsampled_idx]
raw_data_subsampled = [raw_data[i] for i in subsampled_idx]
Load in the volumes, mask, and labels Parse the epoch data for useful dimensions Preprocess the data and prepare for FCMA Enforce leave one out Take out the idxs corresponding to all participants but this one
print(f"Label count: {len(labels_subsampled)}")
print(f"Epoch (block) count: {len(raw_data_subsampled)}")
print(f"Shape of each epoch (block): {raw_data_subsampled[0].shape}")
Label count: 140 Epoch (block) count: 140 Shape of each epoch (block): (15, 432)
# -----------------------------
# Enlarging the feature space
# -----------------------------
raw_data_subsampled_mat = [np.corrcoef(sample.T) for sample in raw_data_subsampled]
# ------------------------------------------
# Inner loop leave one out cross validation
# ------------------------------------------
vox_score = []
# for every voxel in the brain
for i in range(raw_data_subsampled_mat[0].shape[0]):
# A) extract the corresponding row from each subjects' correlation matrix
raw_data_sub_selected_vox = [sample[i,] for sample in raw_data_subsampled_mat]
# B) Perform the inner loop leave-one-out cross validation
kf = KFold(n_splits = num_subjs - 1)
score = []
for train_ind, test_ind in kf.split(labels_subsampled):
# For each fold, get training and validation set
vox_train, vox_test = [raw_data_sub_selected_vox[ind] for ind in train_ind], [raw_data_sub_selected_vox[ind] for ind in test_ind]
lab_train, lab_test = [labels_subsampled[ind] for ind in train_ind], [labels_subsampled[ind] for ind in test_ind]
# set up the model
svm_clf = SVC(kernel='linear', shrinking=False, C=1)
# train the model
svm_clf.fit(vox_train, lab_train)
# test the model
predict = svm_clf.predict(vox_test)
# C) A prediction accuracy value for this selected voxel could be computed for each fold
score.append(svm_clf.score(vox_test, lab_test))
# D) Final prediction accuracy for this voxel
vox_score.append(np.mean(score))
# rank all voxels based on their prediction accuracies, the higher the acc, the lower the rank
vox_dic = {vid:score for vid, score in zip(range(len(vox_score)), vox_score)}
vox_rank = sorted(vox_dic, key=vox_dic.get, reverse=True)
print(f"The length of the prediction accuracy list is {len(vox_score)}")
print(f"The top ranked 10 voxel IDs are {vox_rank[0:10]}")
The length of the prediction accuracy list is 432 The top ranked 10 voxel IDs are [234, 324, 382, 10, 8, 291, 17, 39, 78, 252]
# Use subprocess to run a python script with MPI. This should work on a cluster or local computer.
for left_sid in range(num_subjs):
subprocess.run(f"mpirun -np 2 python3 ./FCMA_script/fcma_voxel_selection_cv.py {data_dir} {suffix} {mask_file} {epoch_file} {left_sid} {output_dir}", shell = True)
print(f"Subject {num_subjs} done")
[os.path.basename(x) for x in glob.glob(os.path.join(output_dir, 'fc_no0*'))]
['fc_no0_result_seq.nii.gz', 'fc_no0_result_score.nii.gz', 'fc_no0_result_list.txt']
with open(output_dir + '/fc_no0_result_list.txt', 'r') as f:
line = f.readlines()
[l.split(' ')[0] for l in line[0:10]]
['234', '252', '278', '17', '382', '39', '291', '314', '119', '10']
# make top n masks
for n in [10,20,30]:
top_n = 'top_' + str(n)
#subprocess.run("module load fsl", shell = True)
subprocess.run(f"bash ./FCMA_script/make_top_voxel_mask.sh {output_dir} {n} {os.path.join(output_dir, top_n)}", shell = True)
We have selected top N features (voxels) from the original brain mask. Now we can ask if we only consider the connectivity pattern within these selected voxels, how well can a classifier do in terms of differentiating two conditions? The rationale is that if a classifier trained on the FC pattern of the selected features is enough to differentiate the two task conditions, then we could say that the selected voxels are the brain regions among which the FC changes characterize certain task differences.
For the demo, we will still focus on Subject-0. During the feature selection step, we have selected top N features (voxels) using the traning set (the data of the rest 7 subjects). Now we are going to use these features to build a predictive model and use subject-0 as a testing data to finish the first fold of our outer loop cross validation.
Preprocess all 8 subjects' data as we have discussed above, yielding a list of labels of the length equals 20 (epochs/subject) * 8 (subjects) = 160, and a list of epoch data of the same length. Within each epoch (i.e., each entry in the variable "int_data" in the code below), there is a 2d array of [timepoint, nVoxel]. Note that nVoxel depends on the size of the top n masks we are using.
Divide the data into training and testing set. This is exactly the same as the feature selection step except that now we have already selected meaningful voxels from the training set. In this demo, the training set is subject 1 - 7 and the testing set is subject-0.
Train a model using the FC pattern within the top voxels. Just like the voxel selection, here we use SVM classifer with precomputed kernel. Other classifiers can be used here, too.
Test the model using the left out subject.
# ------------------------------------------
# top n feature mask from the training set
# ------------------------------------------
# path to 3 top feature masks
top_n_mask_files = {os.path.join(output_dir, "top_10/fc_no0_result_seq_top10.nii.gz") : 'TOP 10 Voxels',
os.path.join(output_dir, "top_20/fc_no0_result_seq_top20.nii.gz") : 'TOP 20 Voxels',
os.path.join(output_dir, "top_30/fc_no0_result_seq_top30.nii.gz") : 'TOP 30 Voxels'}
# for each top feature mask:
for top_n_mask_path in top_n_mask_files.keys():
# ----------------
# preprocess data
# ----------------
# Load in the volumes, mask and labels
images = io.load_images_from_dir(data_dir, suffix=suffix)
top_n_mask = io.load_boolean_mask(top_n_mask_path)
epoch_list = io.load_labels(epoch_file)
# Parse the epoch data for useful dimensions
epochs_per_subj = epochs_per_subj = epoch_list[0].shape[1]
num_subjs = len(epoch_list)
# Prepare the data
int_data, _, labels = prepare_fcma_data(images, epoch_list, top_n_mask)
# -------------------
# Outer loop testing
# -------------------
# What indexes pick out the left out participant?
start_idx = int(int(left_out_subj) * epochs_per_subj)
end_idx = int(start_idx + epochs_per_subj)
# Take out the idxs corresponding to all participants but this one
training_idx = list(set(range(len(labels))) - set(range(start_idx, end_idx)))
testing_idx = list(range(start_idx, end_idx))
# Pull out the data
int_data_training = [int_data[i] for i in training_idx]
int_data_testing = [int_data[i] for i in testing_idx]
# Pull out the labels
labels_training = [labels[i] for i in training_idx]
labels_testing = [labels[i] for i in testing_idx]
# Prepare the data to be processed efficiently (albeit in a less easy to follow way)
rearranged_int_data = int_data_training + int_data_testing
rearranged_labels = labels_training + labels_testing
num_training_samples = epochs_per_subj * (num_subjs - 1)
# Set up data so that the internal mask is correlated with the internal mask
corr_obj = list(zip(rearranged_int_data, rearranged_int_data))
# no shrinking, set C=1
svm_clf = SVC(kernel='precomputed', shrinking=False, C=1)
clf = Classifier(svm_clf, epochs_per_subj=epochs_per_subj)
# Train the model on the training data
clf.fit(corr_obj, rearranged_labels, num_training_samples)
# Test on the testing data
predict = clf.predict()
# ---------------------
# Print out the results
# ---------------------
print(f'--{top_n_mask_files[top_n_mask_path]} RESULTS--')
print(f"Top feature size: {top_n_mask_files[top_n_mask_path]}")
print(clf.decision_function())
print(clf.predict())
# How often does the prediction match the target
num_correct = (np.asanyarray(predict) == np.asanyarray(labels_testing)).sum()
# Report accuracy
print( 'When leaving subject %d out for testing using the %s mask for an intrinsic correlation, the accuracy is %d / %d = %.2f' %
(int(left_out_subj), top_n_mask_path, num_correct, epochs_per_subj, num_correct / epochs_per_subj))
print('---------------------------------------------------------------------------------------------------')
--TOP 10 Voxels RESULTS-- Top feature size: TOP 10 Voxels [ 0.38126299 -2.41850781 0.45284516 -1.80357795 0.17842044 -0.77340787 1.75422849 1.86623616 3.07343915 -3.13254844 -1.45388323 -0.41695132 0.44155523 0.22353497 -0.83062927 0.7705409 0.99191314 -0.12486879 -1.44480709 -0.06320189] [1 0 1 0 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 0] When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_10/fc_no0_result_seq_top10.nii.gz mask for an intrinsic correlation, the accuracy is 8 / 20 = 0.40 --------------------------------------------------------------------------------------------------- --TOP 20 Voxels RESULTS-- Top feature size: TOP 20 Voxels [-8.86536867e-01 1.07148090e-01 -8.85313201e-01 -6.78471999e-01 2.52912073e-01 -1.22895096e+00 -1.69236952e-04 -1.41433092e-01 3.92896756e-02 -1.17931980e+00 5.94036382e-02 1.80326335e+00 3.25205485e-01 4.20668033e-01 -2.49353493e-01 9.49864680e-01 -8.31116802e-02 -5.89936394e-01 1.70349872e+00 1.34718994e-01] [0 1 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 1] When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_20/fc_no0_result_seq_top20.nii.gz mask for an intrinsic correlation, the accuracy is 14 / 20 = 0.70 --------------------------------------------------------------------------------------------------- --TOP 30 Voxels RESULTS-- Top feature size: TOP 30 Voxels [-1.65670387e-01 -3.42620453e-01 -6.07346123e-01 -1.22227886e-01 7.48107093e-02 -8.58807325e-01 -5.63491134e-01 -1.22717309e-01 -6.08503929e-01 -3.74452274e-01 3.77464014e-01 5.14801342e-01 9.31709915e-01 4.56100009e-01 -8.34279388e-01 9.91251666e-01 -3.33540780e-01 -1.08481783e-03 1.46670936e+00 -6.37847517e-02] [0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0] When leaving subject 0 out for testing using the /Users/peetal/Documents/GitHub/FCMA_demo/FCMA_result/top_30/fc_no0_result_seq_top30.nii.gz mask for an intrinsic correlation, the accuracy is 15 / 20 = 0.75 ---------------------------------------------------------------------------------------------------
# run classification for each left out subject for each top-n-mask size
for n, left_sid in product([10,20,30],range(8)):
top_n_mask_file_path = os.path.join(output_dir, f'top_{n}', f'fc_no{left_sid}_result_seq_top{n}.nii.gz')
results_path = os.path.join(output_dir, f'top_{n}')
subprocess.run(f"mpirun -np 2 python3 ./FCMA_script/fcma_classify.py {data_dir} {suffix} {top_n_mask_file_path} {epoch_file} {left_sid} {results_path}", shell = True)
print(f"Left out subject {left_sid}, Top {n} voxel mask, Classification done")
Left out subject 0, Top 10 voxel mask, Classification done Left out subject 1, Top 10 voxel mask, Classification done Left out subject 2, Top 10 voxel mask, Classification done Left out subject 3, Top 10 voxel mask, Classification done Left out subject 4, Top 10 voxel mask, Classification done Left out subject 5, Top 10 voxel mask, Classification done Left out subject 6, Top 10 voxel mask, Classification done Left out subject 7, Top 10 voxel mask, Classification done Left out subject 0, Top 20 voxel mask, Classification done Left out subject 1, Top 20 voxel mask, Classification done Left out subject 2, Top 20 voxel mask, Classification done Left out subject 3, Top 20 voxel mask, Classification done Left out subject 4, Top 20 voxel mask, Classification done Left out subject 5, Top 20 voxel mask, Classification done Left out subject 6, Top 20 voxel mask, Classification done Left out subject 7, Top 20 voxel mask, Classification done Left out subject 0, Top 30 voxel mask, Classification done Left out subject 1, Top 30 voxel mask, Classification done Left out subject 2, Top 30 voxel mask, Classification done Left out subject 3, Top 30 voxel mask, Classification done Left out subject 4, Top 30 voxel mask, Classification done Left out subject 5, Top 30 voxel mask, Classification done Left out subject 6, Top 30 voxel mask, Classification done Left out subject 7, Top 30 voxel mask, Classification done
# write a function to extract left-out subject id and the classification acc.
def extract_clf_acc(base_dir, top_num, num_subjs):
import os
import numpy as np
import pandas as pd
path = os.path.join(base_dir, top_num, 'classify_result.txt')
# read each line, from which extract the left out subject id and the clf acc
with open(path, "r") as file:
lines = file.readlines()
loo_id = []
acc = []
for line in lines:
id_start_index = line.index('_no') + len('_no')
id_end_index = line.index('_result_seq')
id = int(line[id_start_index:id_end_index])
loo_id.append(id)
acc_start_index = line.index(': ') + 2
acc_end_index = line.index('\n')
score = float(line[acc_start_index:acc_end_index])
acc.append(score)
# write a dataframe
colnames = ['loo_id','clf_acc','top_num']
df = pd.DataFrame(index=range(len(loo_id)), columns=colnames)
df['loo_id'] = loo_id
df['clf_acc'] = acc
df['top_num'] = np.repeat(top_num, len(loo_id)).tolist()
return df
# generate df and concatenate them together
df_list = [extract_clf_acc(output_dir,'top_10', num_subjs),
extract_clf_acc(output_dir,'top_20', num_subjs),
extract_clf_acc(output_dir,'top_30', num_subjs)]
final_df = pd.concat(df_list).reset_index(drop=True)
# a quick look at the df
ax = sns.pointplot(x="top_num", y="clf_acc", data=final_df)
ax.set_title('Classification accuracy for each top-n-mask')
ax.set_ylabel('Classification accuracy')
ax.set_xlabel('Top n mask')
print("The FCMA classification accuracy when using the functional connectivity pattern of \n voxels within each subject's top n mask to predict the current task condition (A or B) ")
The FCMA classification accuracy when using the functional connectivity pattern of voxels within each subject's top n mask to predict the current task condition (A or B)
print("The cherry-picked 432 voxel brain that consists of two brain regions")
plotting.view_img(os.path.join(cur_dir, 'sim_info', 'full_mask.nii.gz'), title = "The brain mask", colorbar = False)
The cherry-picked 432 voxel brain that consists of two brain regions