function [para, net] = config_STGConvNet_largeScale(category, type)
% CONFIG_STGCONVNET_LARGESCALE  Build the training configuration and the
% initial network for the large-scale (mini-batch) setting.
%
%   [para, net] = config_STGConvNet_largeScale(category, type)
%
%   Inputs
%     category : name of the sub-folder of 'trainingVideo/data_largeScale'
%                holding the training videos (*.mp4, or *.avi when no
%                *.mp4 file is found). Also used as the output sub-folder.
%     type     : network structure, one of
%                'FC_ST_2', 'FC_S_3', 'FC_S_3_large', 'ST_3'.
%
%   Outputs
%     para : struct with paths, MCMC / optimisation settings, the
%            mean-subtracted training data (para.imageSet, single, on the
%            CPU, sized [sx, sy, numFrames, numChannel, numVideos]), the
%            global mean (para.mean_img) and the mini-batch assignment
%            (para.batch).
%     net  : struct with the flag net.FC and the cell array net.layers,
%            one struct per layer (see stg_makeLayer below).
%
%   Side effects
%     Creates the synthesis output folder, decodes every training video
%     into JPEG frames under '<outPath>/observed_sequence/sequence_<k>/'
%     and caches the per-video frame counts in
%     '<outPath>/observed_sequence/numFramesInTrainingVideos.mat'.
%
%   Errors
%     'No such an option'  - unknown type.
%     'No training images' - no video found, or a video without frames.

para.categoryName = category;
para.type = type;
para.numChain = 13; % number of synthesized images
para.inPath = ['trainingVideo' filesep 'data_largeScale' filesep para.categoryName filesep];
para.outPath = ['output' filesep para.categoryName filesep];
para.outPath_images = [para.outPath 'observed_sequence'];
para.outPath_synthesis = [para.outPath 'synthesis_sequence'];
para.outPath_variable = [para.outPath 'variable'];
para.outPath_final_result = [para.outPath 'final_result'];

%% sizes of the training images
switch para.type
    case 'FC_ST_2'
        para.sx = 100;  %(height)
        para.sy = 200;  %(length)
    case 'FC_S_3'  %(Exp 2 in the paper)
        para.sx = 100;  %(height)
        para.sy = 100;  %(length)
    case 'FC_S_3_large'  %(Exp 2 in the paper)
        para.sx = 224;  %(height)
        para.sy = 224;  %(length)
    case 'ST_3'   %(Exp 1 in the paper)
        para.sx = 224;  %(height)
        para.sy = 224;  %(length)
    otherwise
        error('No such an option');
end

%% MCMC parameters
para.stepsize = 0.3; % Langevin step size
para.L = 20;         % number of iterations in MCMC

%% others
para.isColor = true;  % true/ false

%% the maximum number of frames used for each training video
para.maximumNumFrames = 30;

%% optimization parameters
para.momentum(1) = 0;  % for weights
para.momentum(2) = 0;  % for bias
para.decay(1) = 0;
para.decay(2) = 0;
para.max_gradient = 40;

% color or grey image
if para.isColor == true
    para.numChannel = 3;
else
    para.numChannel = 1;
end

%% for large scale (mini-batch)
para.sizeMiniBatch = 10;

%% locate the training videos (*.mp4 first, *.avi as a fallback)
videoFile = dir([para.inPath '*.mp4']);
para.numVideos = length(videoFile);
if para.numVideos == 0
    videoFile = dir([para.inPath '*.avi']);
    para.numVideos = length(videoFile);
end
if para.numVideos == 0
    error('No training images');
end

if ~exist(para.outPath_synthesis, 'dir')
    mkdir(para.outPath_synthesis); % working directory to store synthesized images files.
end

%% decode the videos into frames, or reuse the cached frame counts
% The same path is used for the existence check, load and save; the
% original code checked one path and read/wrote another, so the cache was
% never hit.
cacheFile = fullfile(para.outPath_images, 'numFramesInTrainingVideos.mat');
numFramesInTrainingVideos = [];
if exist(cacheFile, 'file') == 2
    cached = load(cacheFile);
    % Only trust a cache that describes the current number of videos.
    if isfield(cached, 'numFramesInTrainingVideos') && ...
            numel(cached.numFramesInTrainingVideos) == para.numVideos
        numFramesInTrainingVideos = cached.numFramesInTrainingVideos;
    end
end
if isempty(numFramesInTrainingVideos)
    numFramesInTrainingVideos = stg_extractFrames(para, videoFile);
    save(cacheFile, 'numFramesInTrainingVideos');
end

%% load the frames into memory
% Every video is truncated to the same length: the shortest video, capped
% by para.maximumNumFrames.
para.numFrames = min(min(numFramesInTrainingVideos), para.maximumNumFrames);
para.sz = para.numFrames;
para.imageSet = stg_loadFrames(para); % saved in CPU

para.mean_img = mean(para.imageSet(:));          % global mean
para.imageSet = para.imageSet - para.mean_img;   % subtract mean for all images

%% miniBatch assignment
para.numBatch = ceil(para.numVideos / para.sizeMiniBatch);
assignment = randperm(para.numVideos);
for iBatch = 1:para.numBatch
    startID = 1 + para.sizeMiniBatch * (iBatch - 1);
    endID = min(para.sizeMiniBatch * iBatch, para.numVideos); % last batch may be smaller
    para.batch{iBatch} = assignment(startID:endID);
end

%% network structure
[para, net] = stg_buildNetwork(para);


function numFrames = stg_extractFrames(para, videoFile)
% Decode each training video into resized JPEG frames and return the
% number of frames written per video (1 x numVideos).
numFrames = zeros(1, para.numVideos);
for iVideo = 1:para.numVideos
    reader = VideoReader([para.inPath videoFile(iVideo).name]);
    outputFolder = [para.outPath_images, filesep 'sequence_' num2str(iVideo)];
    if ~exist(outputFolder, 'dir')
        mkdir(outputFolder); % working directory to store the observed frames.
    end

    frameCount = 0;
    while hasFrame(reader)
        frameCount = frameCount + 1;
        img = readFrame(reader);
        img = imresize(img, [para.sx, para.sy], 'bilinear');
        % Frames are written as 001.jpg, 002.jpg, ...
        imwrite(img, fullfile(outputFolder, [sprintf('%03d', frameCount) '.jpg']));
    end

    if frameCount == 0
        error('No training images');
    end
    numFrames(iVideo) = frameCount;
end


function imageSet = stg_loadFrames(para)
% Read the first para.numFrames JPEG frames of every sequence folder into
% a single array of size [sx, sy, numFrames, numChannel, numVideos].
imageSet = zeros([para.sx, para.sy, para.numFrames, para.numChannel, para.numVideos], 'single');
for iVideo = 1:para.numVideos
    outputFolder = [para.outPath_images, filesep 'sequence_' num2str(iVideo)];
    imagesFile = dir([outputFolder filesep '*.jpg']);
    if numel(imagesFile) < para.numFrames
        error('Expected at least %d frames in %s but found %d.', ...
            para.numFrames, outputFolder, numel(imagesFile));
    end

    for iFrame = 1:para.numFrames
        %% read training image and rescale
        img = imread(fullfile(outputFolder, imagesFile(iFrame).name));
        if size(img, 3) ~= para.numChannel
            if para.numChannel == 3
                disp('This is not a color image.')
                % Replicate the single channel so the frame fits the
                % 3-channel slot; without this the assignment below fails
                % with a size mismatch.
                img = repmat(img(:, :, 1), [1, 1, 3]);
            else
                img = rgb2gray(img);
                disp('This is not a grey image. We will change it into grey scale.')
            end
        end

        img = imresize(img, [para.sx, para.sy], 'bilinear');
        imageSet(:, :, iFrame, :, iVideo) = single(img);
    end
end


function [para, net] = stg_buildNetwork(para)
% Set the learning scheme / iteration count for para.type and create the
% randomly initialised layers of the corresponding network.
inputSize = [para.sx, para.sy, para.sz];
padAll  = [true,  true,  true ];  % pad every axis by half the filter size
padNone = [false, false, false];  % no padding
padTime = [false, false, true ];  % pad the temporal axis only

switch para.type

    case 'FC_ST_2'   % two-layer / single spatial-temporal fully connected filter in the top layer  (learning action pattern)
        para.learningScheme = 'end_to_end';
        para.numIteration = 1000;
        net.FC = true;   % fully connected layer on the top layer + no reLU

        net.layers{1} = stg_makeLayer(inputSize, para.numChannel, ...
            200, [7, 7, 7], [3, 3, 3], padAll, 0.01);
        % The top filter covers the whole first-layer map in space and time.
        map1 = stg_mapSize(net.layers{1});
        net.layers{2} = stg_makeLayer(map1, net.layers{1}.numFilter, ...
            1, map1, [1, 1, 1], padNone, 0.001);

    case 'FC_S_3'   % three-layer / single spatial-temporal fully connected filter in the top layer
        para.learningScheme = 'end_to_end';  % learning scheme is end-to-end
        para.numIteration = 1400;
        net.FC = false;

        net.layers{1} = stg_makeLayer(inputSize, para.numChannel, ...
            120, [7, 7, 7], [2, 2, 2], padAll, 0.01);
        % Second-layer filters span the full spatial extent of the map.
        map1 = stg_mapSize(net.layers{1});
        net.layers{2} = stg_makeLayer(map1, net.layers{1}.numFilter, ...
            30, [map1(1), map1(2), 5], [2, 2, 2], padTime, 0.001);
        net.layers{3} = stg_makeLayer(stg_mapSize(net.layers{2}), net.layers{2}.numFilter, ...
            5, [1, 1, 2], [2, 2, 1], padTime, 0.0001);

    case 'FC_S_3_large'  %(Exp 2 in the paper)
        para.learningScheme = 'end_to_end';  % learning scheme is end-to-end
        para.numIteration = 600;
        net.FC = false;

        net.layers{1} = stg_makeLayer(inputSize, para.numChannel, ...
            120, [7, 7, 7], [3, 3, 3], padAll, 0.01);
        % Second-layer filters span the full spatial extent of the map.
        map1 = stg_mapSize(net.layers{1});
        net.layers{2} = stg_makeLayer(map1, net.layers{1}.numFilter, ...
            30, [map1(1), map1(2), 4], [1, 1, 2], padTime, 0.001);
        net.layers{3} = stg_makeLayer(stg_mapSize(net.layers{2}), net.layers{2}.numFilter, ...
            5, [1, 1, 2], [1, 1, 1], padTime, 0.0001);

    case 'ST_3'
        para.learningScheme = 'layer_by_layer';
        net.FC = false;
        para.numIteration = 1200;

        net.layers{1} = stg_makeLayer(inputSize, para.numChannel, ...
            120, [15, 15, 15], [7, 7, 7], padAll, 0.003);
        net.layers{2} = stg_makeLayer(stg_mapSize(net.layers{1}), net.layers{1}.numFilter, ...
            40, [7, 7, 7], [3, 3, 3], padAll, 0.001);
        net.layers{3} = stg_makeLayer(stg_mapSize(net.layers{2}), net.layers{2}.numFilter, ...
            20, [3, 3, 2], [2, 2, 1], padAll, 0.0003);

    otherwise
        error('No such an option');
end


function mapSize = stg_mapSize(layer)
% Return the output map size of a layer as a row vector [sx, sy, sz].
mapSize = [layer.sumMap_sx, layer.sumMap_sy, layer.sumMap_sz];


function layer = stg_makeLayer(inputSize, numInputChannel, numFilter, filterSize, stride, padHalf, learningRate)
% Create one layer struct with randomly initialised filters.
%   inputSize       : [sx, sy, sz] size of the input to this layer
%   numInputChannel : number of channels of the input
%   numFilter       : number of filters of this layer
%   filterSize      : [size_x, size_y, duration] of each filter
%   stride          : [x, y, z] sub-sampling widths
%   padHalf         : 1x3 logical; true pads that axis (both sides) by
%                     floor(filterSize/2), false uses no padding
%   learningRate    : stored as lambdaLearningRate
halfSize = floor(filterSize / 2);
padding = halfSize .* double(padHalf);

layer.numFilter = numFilter;
layer.sizeFilter_x = filterSize(1);
layer.sizeFilter_y = filterSize(2);
layer.durationFilter = filterSize(3);

layer.subsampling_width_x = stride(1);
layer.subsampling_width_y = stride(2);
layer.subsampling_width_z = stride(3);

layer.halfSizeFilter_x = halfSize(1);
layer.halfSizeFilter_y = halfSize(2);
layer.halfSizeDurationFilter = halfSize(3);

layer.pad_x = padding(1);
layer.pad_y = padding(2);
layer.pad_z = padding(3);

layer.pad = [layer.pad_x, layer.pad_x, layer.pad_y, layer.pad_y, layer.pad_z, layer.pad_z];
layer.stride = [layer.subsampling_width_x, layer.subsampling_width_y, layer.subsampling_width_z];

% Small Gaussian initial filters and zero bias, both created on the GPU.
layer.filters = gpuArray.randn([layer.sizeFilter_x, layer.sizeFilter_y, layer.durationFilter, numInputChannel, numFilter], 'single') * 0.001;
layer.bias = gpuArray.zeros(1, numFilter, 'single');

% Output map size: floor((in + 2*pad - filter) / stride) + 1 on each axis.
outSize = floor((inputSize + 2 * padding - filterSize) ./ stride) + 1;
layer.sumMap_sx = outSize(1);
layer.sumMap_sy = outSize(2);
layer.sumMap_sz = outSize(3);

layer.momentum_filter = zeros(size(layer.filters), 'single');
layer.momentum_bias = zeros(size(layer.bias), 'single');

layer.lambdaLearningRate = learningRate;

