function [net, currSample_video]=process_epoch_STGConvNet_miniBatch(para, net, currSample_video)
%PROCESS_EPOCH_STGCONVNET_MINIBATCH Run one mini-batch training epoch.
%
%   [net, currSample_video] = process_epoch_STGConvNet_miniBatch(para, net, currSample_video)
%
%   For every mini-batch the function
%     1. computes statistics of the observed data (compute_stat_STGConvNet),
%     2. updates the matching slice of synthesized samples with
%        sampling_sequence_by_Langevin,
%     3. computes statistics of the updated synthesized samples,
%   and keeps a running average of both sets of statistics over all
%   mini-batches. After the last mini-batch, every layer's filters and bias
%   are updated once from the difference (observed - synthesized), using
%   gradient clipping, momentum and weight decay.
%
%   Inputs
%     para  - struct; fields read here: numBatch, numChain, imageSet, batch,
%             L, stepsize, sz, max_gradient, momentum(1:2), decay(1:2).
%     net   - struct with cell array net.layers; each layer is read/written
%             through the fields filters, bias, momentum_filter,
%             momentum_bias and lambdaLearningRate.
%     currSample_video - 5-D array of synthesized samples, indexed along the
%             5th dimension in consecutive slices of para.numChain per batch.
%
%   Outputs
%     net              - network with updated filters, biases and momenta.
%     currSample_video - synthesized samples after the Langevin updates.
%
%   NOTE(review): compute_stat_STGConvNet is used as returning a struct
%   array whose element l+1 carries stat_weights, stat_bias, size_row,
%   size_col and size_time for layer l - confirm against that function.

    numLayers = numel(net.layers);

    % Running averages of the per-layer statistics; entry l+1 belongs to layer l.
    res_obs_miniBatch = struct( ...
        'stat_weights', cell(1, numLayers+1), ...
        'stat_bias', cell(1, numLayers+1));

    res_syn_miniBatch = struct( ...
        'stat_weights', cell(1, numLayers+1), ...
        'stat_bias', cell(1, numLayers+1));

    for iBatch = 1:para.numBatch

        % Slice of synthesized chains handled by this mini-batch.
        startID_sample = 1 + para.numChain * (iBatch - 1);
        endID_sample = para.numChain + para.numChain * (iBatch - 1);
        currSample_video_batch = gpuArray(currSample_video(:,:,:,:,startID_sample:endID_sample));

        imageSet_batch = gpuArray(para.imageSet(:,:,:,:,para.batch{iBatch}));

        %% observed statistics, averaged over all mini-batches
        res_obs = compute_stat_STGConvNet(net, imageSet_batch);
        res_obs_miniBatch = accumulate_batch_mean(res_obs_miniBatch, res_obs, numLayers, para.numBatch);

        %% sampling by Langevin dynamics
        currSample_video_batch = sampling_sequence_by_Langevin(net, para.L, para.stepsize, currSample_video_batch);

        %% synthesized statistics, averaged over all mini-batches
        res_syn = compute_stat_STGConvNet(net, currSample_video_batch);
        res_syn_miniBatch = accumulate_batch_mean(res_syn_miniBatch, res_syn, numLayers, para.numBatch);

        disp(['batch: '  num2str(iBatch) ' of ' num2str(para.numBatch)]);

        % Write the updated chains back to host memory.
        currSample_video(:,:,:,:,startID_sample:endID_sample) = gather(currSample_video_batch);

    end

    % Rescales the synthesized statistics in case the temporal dimension of
    % the synthesis and the observed signal is different. Loop-invariant, so
    % it is computed once instead of once per layer.
    temporal_scale = para.sz / size(currSample_video, 3);

    for l = 1:numLayers

        %% compute the gradient for weights and bias
        gradient_weight = res_obs_miniBatch(l+1).stat_weights - res_syn_miniBatch(l+1).stat_weights * temporal_scale;
        gradient_bias = res_obs_miniBatch(l+1).stat_bias - res_syn_miniBatch(l+1).stat_bias * temporal_scale;

        disp(['Layer ' num2str(l) ': SSD_weight: ' num2str(mean(abs(gradient_weight(:))))]);

        % Normalize by the size of the layer's response volume. The sizes are
        % taken from res_obs of the last processed mini-batch.
        maximum_num_activations = res_obs(l+1).size_row * res_obs(l+1).size_col * res_obs(l+1).size_time;

        gradient_weight = clip_by_max_abs(gradient_weight / maximum_num_activations, para.max_gradient);
        gradient_bias = clip_by_max_abs(gradient_bias / maximum_num_activations, para.max_gradient);

        % Momentum update with weight decay on the current parameters.
        net.layers{l}.momentum_filter = para.momentum(1) * net.layers{l}.momentum_filter + gradient_weight - para.decay(1) * net.layers{l}.filters;
        net.layers{l}.momentum_bias = para.momentum(2) * net.layers{l}.momentum_bias + gradient_bias - para.decay(2) * net.layers{l}.bias;

        % Parameter step; the bias uses twice the layer's learning rate.
        net.layers{l}.filters = net.layers{l}.filters + net.layers{l}.lambdaLearningRate * net.layers{l}.momentum_filter;
        net.layers{l}.bias = net.layers{l}.bias + ( net.layers{l}.lambdaLearningRate * 2 ) * net.layers{l}.momentum_bias;

    end


function acc = accumulate_batch_mean(acc, res, numLayers, numBatch)
% Adds res(l+1).stat_weights / numBatch and res(l+1).stat_bias / numBatch to
% the running sums in acc(l+1) for l = 1..numLayers. An empty
% acc(l+1).stat_weights marks an accumulator that has not been initialized.

    for l = 1:numLayers
        if isempty(acc(l+1).stat_weights)
            acc(l+1).stat_weights = res(l+1).stat_weights ./ numBatch;
            acc(l+1).stat_bias = res(l+1).stat_bias ./ numBatch;
        else
            acc(l+1).stat_weights = acc(l+1).stat_weights + res(l+1).stat_weights ./ numBatch;
            acc(l+1).stat_bias = acc(l+1).stat_bias + res(l+1).stat_bias ./ numBatch;
        end
    end


function g = clip_by_max_abs(g, limit)
% Rescales g so that its largest absolute entry equals limit whenever that
% entry exceeds limit; otherwise g is returned unchanged.

    peak = max(abs(g(:)));
    if peak > limit
        g = g / peak * limit;
    end
    

    