<?xml version="1.0"?>
<!-- This is a sample configuration file for CRAWDAD, with documentation
in place to explain what the values and configuration of various steps are.
Copy this to the directory where you will run your samples, and edit
the values within for your experiments, using the comments below as a guide
-->

<craw_conf>
  <!-- Place comments in this file just as this line is placed -->
  <experiment_details>
    <!-- experiment_details is not used for anything at the moment.
         However, it makes a useful record of how the data were acquired, so
         fill in the fields below; do not worry about the exact values for now. -->

    <organism>M. musculus</organism>
    <instrument>ltq1</instrument>
    <operator>jasonw</operator>
    <sample_prep>NA</sample_prep>
    <!-- NOTE(review): this date format is ambiguous (MM-DD-YY or DD-MM-YY?) -
         confirm, and consider ISO 8601 (YYYY-MM-DD) in your own copies -->
    <date>03-04-07</date>
    <!-- presumably the length of the LC run in minutes - TODO confirm -->
    <lc_time>120</lc_time>
    <hplc>Agilent_1100</hplc>
  </experiment_details>

  <!-- experiment_groups is where you define the groups that you are comparing.
       Although CRAWDAD is currently limited to comparing two groups for difference
       detection, you can have more than two in this file (for alignment and processing
       purposes).
  -->

  <experiment_groups>
    <group name='0p1'> <!-- the name is just a label -->
      <!-- runs: the names of the runs that belong to this group, separated by spaces -->
      <runs>mouse_0p1_1 mouse_0p1_2 mouse_0p1_3 mouse_0p1_4 mouse_0p1_5</runs>

      <!--
        If you have searched annotation data and loaded them into a database using
        parse_sqt.pl, attach a list of sqt files here as follows:

            <sqts>mouse_0p1_1.sqt mouse_0p1_2.sqt mouse_0p1_3.sqt mouse_0p1_4.sqt mouse_0p1_5.sqt</sqts>

        Use None as the name for any file which is missing a sqt. You must still
        list five files, i.e. if you have only searched the first two sqt files,
        then the list would look like so:

            <sqts>mouse_0p1_1.sqt mouse_0p1_2.sqt None None None</sqts>

        (presumably the entries follow the same order as the names in <runs> -
        TODO confirm)
      -->
      <sqts> mouse_0p1_1.sqt mouse_0p1_2.sqt None None None</sqts>

      <!-- New support for using your ms1 files for hardklor analysis:
           if available, list the ms1 files below and use an action of type
           'assess_align' (see the action labelled 'align_assess' in the
           actions section of this file), i.e. -->
      <ms1s> mouse_0p1_1.ms1 mouse_0p1_2.ms1 mouse_0p1_3.ms1 mouse_0p1_4.ms1 mouse_0p1_5.ms1 </ms1s>
    </group>
    <group name='0p5'>
      <!-- runs: the names of the runs that belong to this group, separated by spaces -->
      <runs>mouse_0p5_1 mouse_0p5_2 mouse_0p5_3 mouse_0p5_4 mouse_0p5_5</runs>
      <!-- If you have not searched your sqt files yet, there's no need to worry
           about the <sqts> tag yet - it is only needed when finding differences. -->
      <sqts>mouse_0p5_1.sqt mouse_0p5_2.sqt mouse_0p5_3.sqt mouse_0p5_4.sqt mouse_0p5_5.sqt</sqts>
      <ms1s> mouse_0p5_1.ms1 mouse_0p5_2.ms1 mouse_0p5_3.ms1 mouse_0p5_4.ms1 mouse_0p5_5.ms1 </ms1s>
    </group>
  </experiment_groups>
  


  <!-- Now you are going to define the actions CRAWDAD will take, i.e. the set of steps the program
       performs, known as 'actions' (TODO change to tasks).
       The actions tag contains optional global parameters, followed by a list of action tags.

       Action tags are instructions for craw_conf to run the programs which actually do the work,
       and can also specify use of the SGE cluster queue.
       
         Two attributes are important for the action tags:

         1. label - this is a unique label for the action, which the user must set - 
	    the name will be prepended to the name of the .msmat files which are output from
            the relevant action
         
         2. previous - this defines the input files, and any required actions to be performed before
            the current action. If this is not specified, the raw .msmat files defined in <experiment_groups> are
            used. Make sure that the actions specified as 'previous' are spelled correctly and exist.
	   
         Action tags contain a set of parameters - these are in the format
         <param name='param_name'>optional_value</param>
         (depending upon the name used, a param may or may not take a value)

	 an action will produce base peak chromatogram images of the processed runs, unless instructed otherwise by
         a 'no_images' param (see below)
     
         params which are common to all types of actions are:
          
	 opts - you can enter command-line options to be passed to the program which the action tag runs
	 (i.e. craw_conf only knows about some options, but advanced users would specify additional ones)

	 no_images - do not output base-peak chromatograms for the output of this action
         [not implemented yet] email_report - email a report on the output of this action upon completion
      
         [SGE related options]
         queue - specifies that all programs run by this action are to be submitted to the SGE queue. No value
                  is necessary. Normally the action waits for all queue tasks to be completed
         nowait - do not wait for the queue-submitted jobs to finish before proceeding to the next action. Know
                   what you are doing before using this (at some point craw_conf will use dependency logic like make or ant,
                   but not yet).
         jobs_limit - a limit on the number of jobs which can be submitted to the sge queue - craw_conf will
                      submit an additional job when the number of submitted jobs drops to N-1
         sleep_period - length of time in seconds to pause between checking the queue status for finished jobs ( default:120sec )
         mem_req      - makes sure that the job is submitted to a computer with sufficient physical memory 
	 
	 These listed params can also be used as <global_param> tags as shown below.
	 
	 Individual actions have their particular params explained as they arise below.
         
      -->

      

  <actions>
    <!-- the following setting is an optional global parameter; 'queue' needs no
         value (presumably it then applies to every action below - TODO confirm) -->
    <global_param name='queue'/>

   <!-- time_interpolate - this re-samples scans to be at a consistent retention time interval.
        It can be used in two ways:
        1. with a preset value: <param name='interval'>...</param>
        2. or with method=mode and roundto=### as shown below. This sets the new scan interval to
           the mode interval between observed scans, rounded to the value specified in roundto

        Note that as this action has no 'previous' attribute, the .msmat files specified above are used

     -->

   <action type='time_interpolate' label='ti'>
     <!-- use the mode of the intervals between observed scans as the new scan interval... -->
     <param name='method'>mode</param>
     <!-- ...rounded to this value -->
     <param name='roundto'>0.025</param>
   </action>


   <!-- Now we smooth the data using Savitzky-Golay smoothing - for this we are using
        the 'ms12bins' action, which is a thin layer around calling the ms12bins program,
        which encapsulates a lot of the crawdad functionality. Watch what the parameters used
        are, and how they are reflected in the label. Also, note that the 'previous' attribute
        is set to 'ti', which is the label of the previous action - hence it will be used as 
        input to this action -->

   <action type='ms12bins' label='sg_w11_o2' previous='ti'>
     <!-- The opts param is used here simply to pass flags on to ms12bins.py.
          In this case they ask for a Savitzky-Golay smoothing window 11 points
          wide, fitting a 2nd degree polynomial. See Numerical Recipes in C for
          more information on Savitzky-Golay smoothing. -->
     <param name='opts'> --sg_window=11 --sg_order=2 </param>
   </action>
   
   <!-- Now we actually go ahead and align the data using a template. It would be useful to look at the
        base peak plots that have been produced by previous steps and choose a 'median' template ,i.e.
        one that looks to be 'in the middle' of the other runs 
	
	It is also useful to look at base-peak chromatograms or the msmat_img.py program to determine
        the useful portion of the run - where peaks can be found, typically after loading and before 
        re-equilibration. The start_trim and stop_trim params below specify where to trim all runs 
        before alignment - set this to about 5' before and after the boundaries for the useful region.
        Use the timescale of the master run for this purpose

        The 'diagonal' param refers to how wide a window to use when searching for similar scans
        to align between two runs, expressed as a +- fraction of the length of the runs. 0.1 or 0.15 are
        decent default values - you can generally eyeball the maximum difference you see between runs
        (or between the template and other runs), and use that value * 1.5
   
        The shift_limit and weights parameters below provide some control over the alignment process.
        shift_limit gives a limit to the maximum consecutive shrinks/stretches a run may take, and
        weights gives a 'benefit' value to a 'stretch', 'constant', or 'shrink' at any given scan
        in the run to be aligned - a value of 2 for the middle term (i.e. constant) makes all
        'steps' equally biased - we provide a small benefit to 'constant' scans as it is more likely
        that there will not be a stretch or shrink. (One might want to play with increasing this parameter,
        and viewing improvements in the alignments.)
  
   -->
  
    <action type='align_set' label='al_0p1_3' previous='sg_w11_o2'>
      <!-- the run used as the alignment template -->
      <param name='template'>mouse_0p1_3</param>

      <!-- where to trim all runs before alignment (timescale of the template run) -->
      <param name='start_trim'>40</param>
      <param name='stop_trim'>100</param>

      <!-- width of the window searched for similar scans, relative to the run length -->
      <param name='diagonal'>0.15</param>

      <!-- shift_limit caps the consecutive shrinks/stretches a run may take;
           weights are the 'benefit' values of the possible steps, the middle
           term being the one for 'constant' -->
      <param name='opts'> --shift_limit=5 --weights=1,2.1,1</param>
    </action>
    
    <!-- TODO an action when using high-resolution instruments (orbitrap, FT) to use hardklor to
         assess alignment quality -->

    
    <!-- 
         A global intensity normalization is applied where runs are normalized by the sum of their TIC
        
	 the intensity_min value can be used to exclude intensities below a threshold (to avoid counting noise)
         this would be set higher on an FTICR instrument due to the already existing noise reduction

	 The template can be any run (TODO - remove this flag, just pick one automatically)
    -->


    <action type='mean_normalize' label='mn_e2' previous='al_0p1_3'>
      <!-- the template can be any run -->
      <param name='template'>mouse_0p1_1</param>
      <!-- intensities below this threshold are excluded (to avoid counting noise) -->
      <param name='intensity_min'>1e2</param>
    </action>

    <!-- Assess the quality of your alignment - note that this is a bit of
         a departure from the usual format, for we don't actually produce
         msmat output files from this 'action'.
         This uses the Hardklor program (also from the MacCoss Group - see http://proteome.gs.washington.edu/software/hardklor)
         to find peptide isotope distributions that persist over time, and then determines the standard deviation of their RT before
         and after alignment. If you are not using high-resolution MS1s, the results of this would be undefined (i.e. useless) -->

      <action type='assess_align' previous='al_0p1_3' label='align_assess'>
        <!-- Both params below are optional. -->

        <!-- hardklor_conf - override a default hardklor.conf file
             (in $CRAWLIB/hardklor.conf) -->
        <param name='hardklor_conf'>hardklor.conf</param>

        <!-- hardklor_opts - override some options from the hardklor.conf file -->
        <param name='hardklor_opts'> -corr 0.99 </param>
      </action>

      



    <!-- 
    And now, what you actually wanted to do in the first place - perform a statistical analysis 
    between two different groups of samples!
    
    This is the first step where you need to specify which 'groups' from
    the 'experiment_groups' listed above are used (previously, each action would be applied
    to all groups). Future iterations of crawdad might use ANOVA to find changes in multiple groups
    (example: changes in timepoints)
    
    So, basic parameters to set are:
    'group1', 'group2' - the two groups to compare
    'ttest_cutoff'     - a p-value cutoff for _each_ timepoint in an
    aligned extracted ion chromatogram (XIC) being compared.
    'ttest_len_rt'     - the length in retention time (minutes) over which
    the ttest_cutoff specified above must be different
    'webdir' - the absolute path to a directory where the output files are placed, 
    typically you would set this to /mnt/www/localhost/username/experiment_name
    
    'opts' - again, these are specific to the program.
    Run Diffs.py with its help option to get a full list; options include the following.
    (They are written here with a single leading dash - add a second dash
    to the start of each option when you use it.)

    -uniprot_ids - use if your FASTA IDs have the format sp|p#####|BGAL_ECOLI
    -genbank_ids - use if your FASTA IDs are from GenBank
    -other_ids   - Diffs.py will not attempt to parse your FASTA IDs for links to the databases
    -lowmem_dr   - use a low-memory version of the difference region object
    -intensity_minimum - set a minimum intensity for either of the two 'mean of maxima' values
                         used in the difference region
    -hit_rt_window     - retention time window beyond the diff region boundary to find hits
    -hit_mz_window     - mz window outside of the difference region used when searching for hits

    -->

    <action type='diffs' label='p05_w250' previous='al_0p1_3'>
      <!-- NOTE(review): 'previous' names the alignment action, so the output of
           the 'mn_e2' normalization action is not the input here - confirm
           that this is intended -->
      <!-- the two groups to compare, by their group names -->
      <param name='group1'>0p1</param>
      <param name='group2'>0p5</param> 
      <!-- p-value cutoff for each timepoint of the compared chromatograms -->
      <param name='ttest_cutoff'>0.05</param>
      <!-- length in retention time (minutes) over which the cutoff must be met -->
      <param name='ttest_len_rt'>0.25</param>
      <!-- NOTE(review): no 'webdir' param is set although it is listed among
           the basic parameters - confirm whether it is required -->
    </action>
  </actions>
</craw_conf>
