% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/MutationProfiling.R
\name{collapseClones}
\alias{collapseClones}
\title{Constructs effective clonal sequences for all clones}
\usage{
collapseClones(
  db,
  cloneColumn = "CLONE",
  sequenceColumn = "SEQUENCE_IMGT",
  germlineColumn = "GERMLINE_IMGT_D_MASK",
  muFreqColumn = NULL,
  regionDefinition = NULL,
  method = c("mostCommon", "thresholdedFreq", "catchAll", "mostMutated", "leastMutated"),
  minimumFrequency = NULL,
  includeAmbiguous = FALSE,
  breakTiesStochastic = FALSE,
  breakTiesByColumns = NULL,
  expandedDb = FALSE,
  nproc = 1
)
}
\arguments{
\item{db}{\code{data.frame} containing sequence data. Required.}

\item{cloneColumn}{\code{character} name of the column containing clonal 
identifiers. Required.}

\item{sequenceColumn}{\code{character} name of the column containing input 
sequences. Required. The length of each input sequence should 
match that of its corresponding germline sequence.}

\item{germlineColumn}{\code{character} name of the column containing germline 
sequences. Required. The length of each germline sequence 
should match that of its corresponding input sequence.}

\item{muFreqColumn}{\code{character} name of the column containing mutation
frequency. Optional. Applicable to the \code{"mostMutated"}
and \code{"leastMutated"} methods. If not supplied, mutation
frequency is computed by calling \code{observedMutations}.
Default is \code{NULL}. See Cautions for note on usage.}

\item{regionDefinition}{\link{RegionDefinition} object defining the regions
and boundaries of the Ig sequences. Optional. Default is 
\code{NULL}.}

\item{method}{method for calculating input consensus sequence. Required. 
One of \code{"thresholdedFreq"}, \code{"mostCommon"}, 
\code{"catchAll"}, \code{"mostMutated"}, or 
\code{"leastMutated"}. See "Methods" for details.}

\item{minimumFrequency}{frequency threshold for calculating input consensus sequence.
Applicable to and required for the \code{"thresholdedFreq"} 
method. A canonical choice is 0.6. Default is \code{NULL}.}

\item{includeAmbiguous}{whether to use ambiguous characters to represent positions 
at which there are multiple characters with frequencies that 
are at least \code{minimumFrequency} or that are maximal 
(i.e. ties). Applicable to and required for the 
\code{"thresholdedFreq"} and \code{"mostCommon"} methods. 
Default is \code{FALSE}. See "Choosing ambiguous characters" 
for rules on choosing ambiguous characters.}

\item{breakTiesStochastic}{In case of ties, whether to randomly pick a sequence from 
sequences that fulfill the criteria as consensus. Applicable 
to and required for all methods except for \code{"catchAll"}. 
Default is \code{FALSE}. See "Methods" for details.}

\item{breakTiesByColumns}{A list of the form 
\code{list(c(col_1, col_2, ...), c(fun_1, fun_2, ...))}, 
where \code{col_i} is a \code{character} name of a column 
in \code{db}, and \code{fun_i} is a function to be applied 
on that column. Currently, only \code{max} and \code{min} 
are supported. Note that the two \code{c()}'s in \code{list()} 
are essential (i.e. if there is only 1 column, the list should 
be of the form \code{list(c(col_1), c(fun_1))}). Applicable 
to and optional for the \code{"mostMutated"} and 
\code{"leastMutated"} methods. If supplied, \code{fun_i}'s 
are applied on \code{col_i}'s to help break ties. Default 
is \code{NULL}. See "Methods" for details.}

\item{expandedDb}{\code{logical} indicating whether or not to return the 
expanded \code{db}, containing all the sequences (as opposed
to returning just one sequence per clone).}

\item{nproc}{Number of cores to distribute the operation over. If the 
\code{cluster} has already been set earlier, then pass the 
\code{cluster}. This will ensure that it is not reset.}
}
\value{
A modified \code{db} with the following additional columns: 
          \itemize{
            \item \code{CLONAL_SEQUENCE}:  effective sequence for the clone.
            \item \code{CLONAL_GERMLINE}:  germline sequence for the clone.
            \item \code{CLONAL_SEQUENCE_MUFREQ}:  mutation frequency of 
                  \code{CLONAL_SEQUENCE}; only added for the \code{"mostMutated"}
                  and \code{"leastMutated"} methods.
          }
                     
          \code{CLONAL_SEQUENCE} is generated with the method of choice indicated 
          by \code{method}, and \code{CLONAL_GERMLINE} is generated with the 
          \code{"mostCommon"} method, along with, where applicable, user-defined 
          parameters such as \code{minimumFrequency}, \code{includeAmbiguous}, 
          \code{breakTiesStochastic}, and \code{breakTiesByColumns}.
}
\description{
\code{collapseClones} creates effective input and germline sequences for each clonal 
group and appends columns containing the consensus sequences to the input 
\code{data.frame}.
}
\section{Consensus lengths}{
 For each clone, \code{CLONAL_SEQUENCE} and 
         \code{CLONAL_GERMLINE} have the same length. 
         
         \itemize{
               \item For the \code{"thresholdedFreq"}, \code{"mostCommon"}, and 
               \code{"catchAll"} methods:
         
               The length of the consensus sequences is determined by the longest possible
               consensus sequence (based on \code{inputSeq} and \code{germlineSeq}) and 
               \code{regionDefinition@seqLength} (if supplied), whichever is shorter.

               Given a set of sequences of potentially varying lengths, the longest possible 
               length of their consensus sequence is taken to be the longest length along 
               which there is information contained at every nucleotide position across 
               the majority of the sequences. Majority is defined to be greater than 
               \code{floor(n/2)}, where \code{n} is the number of sequences. If the longest 
               possible consensus length is 0, there will be a warning and an empty string 
               (\code{""}) will be returned. 
         
               If a length limit is defined by supplying a \code{regionDefinition} via 
               \code{regionDefinition@seqLength}, the consensus length will be further 
               restricted to the shorter of the longest possible length and 
               \code{regionDefinition@seqLength}.
         
               \item For the \code{"mostMutated"} and \code{"leastMutated"} methods:
               
               The length of the consensus sequences depends on that of the most/least 
               mutated input sequence, and, if supplied, the length limit defined by 
               \code{regionDefinition@seqLength}, whichever is shorter. If the germline 
               consensus computed using the \code{"mostCommon"} method is longer than 
               the most/least mutated input sequence, the germline consensus is trimmed 
               to be of the same length as the input consensus.
              
          }
}

\section{Methods}{
 The descriptions below use "sequences" as a generalization of input 
         sequences and germline sequences. 
         
         \itemize{
         
             \item \code{method="thresholdedFreq"}
             
                   A threshold must be supplied to the argument \code{minimumFrequency}. At 
                   each position along the length of the consensus sequence, the frequency 
                   of each nucleotide/character across sequences is tabulated. The 
                   nucleotide/character whose frequency is at least (i.e. \code{>=}) 
                   \code{minimumFrequency} becomes the consensus; if there is none, the
                   consensus nucleotide will be \code{"N"}.
                   
                   When there are ties (frequencies of multiple nucleotides/characters 
                   are at least \code{minimumFrequency}), this method can be deterministic 
                   or stochastic, depending on additional parameters.
                   
                   \itemize{
                        \item With \code{includeAmbiguous=TRUE}, ties are resolved 
                              deterministically by representing ties using ambiguous 
                              characters. See "Choosing ambiguous characters" for how 
                              ambiguous characters are chosen.
                        \item With \code{breakTiesStochastic=TRUE}, ties are resolved 
                              stochastically by randomly picking a character amongst the 
                              ties.
                        \item When both are \code{TRUE}, \code{includeAmbiguous} takes 
                              precedence over \code{breakTiesStochastic}.
                        \item When both are \code{FALSE}, the first character from the ties is 
                              taken to be the consensus following the order of \code{"A"}, 
                              \code{"T"}, \code{"G"}, \code{"C"}, \code{"N"}, \code{"."}, 
                              and \code{"-"}.
                   }
                   
                   Below are some examples looking at a single position based on 5 
                   sequences with \code{minimumFrequency=0.6}, 
                   \code{includeAmbiguous=FALSE}, and \code{breakTiesStochastic=FALSE}:
                   
                   \itemize{
                        \item If the sequences have \code{"A"}, \code{"A"}, \code{"A"}, 
                              \code{"T"}, \code{"C"}, the consensus will be \code{"A"}, 
                              because \code{"A"} has frequency 0.6, which is at least 
                              \code{minimumFrequency}.
                        \item If the sequences have \code{"A"}, \code{"A"}, \code{"T"}, 
                              \code{"T"}, \code{"C"}, the consensus will be \code{"N"}, 
                              because none of \code{"A"}, \code{"T"}, or \code{"C"} has 
                              frequency that is at least \code{minimumFrequency}.
                   }
         
              \item \code{method="mostCommon"}
              
                    The most frequent nucleotide/character across sequences at each 
                    position along the length of the consensus sequence makes up the consensus.
                   
                    When there are ties (multiple nucleotides/characters with equally 
                    maximal frequencies), this method can be deterministic or stochastic, 
                    depending on additional parameters. The same rules for breaking ties 
                    for \code{method="thresholdedFreq"} apply.
                   
                    Below are some examples looking at a single position based on 5 
                    sequences with \code{includeAmbiguous=FALSE}, and 
                    \code{breakTiesStochastic=FALSE}:
                    
                    \itemize{
                         \item If the sequences have \code{"A"}, \code{"A"}, \code{"T"}, 
                               \code{"A"}, \code{"C"}, the consensus will be \code{"A"}.
                         \item If the sequences have \code{"T"}, \code{"T"}, \code{"C"}, 
                               \code{"C"}, \code{"G"}, the consensus will be \code{"T"}, 
                               because \code{"T"} is before \code{"C"} in the order of 
                               \code{"A"}, \code{"T"}, \code{"G"}, \code{"C"}, \code{"N"}, 
                               \code{"."}, and \code{"-"}. 
                    }       
                    
                    
              \item \code{method="catchAll"}
              
                    This method returns a consensus sequence capturing most of the 
                    information contained in the sequences. Ambiguous characters are 
                    used where applicable. See "Choosing ambiguous characters" for how 
                    ambiguous characters are chosen. This method is deterministic and 
                    does not involve breaking ties.
                    
                    Below are some examples for \code{method="catchAll"} looking at a 
                    single position based on 5 sequences:
                    
                    \itemize{
                         \item If the sequences have \code{"N"}, \code{"N"}, \code{"N"}, 
                               \code{"N"}, \code{"N"}, the consensus will be \code{"N"}.
                         \item If the sequences have \code{"N"}, \code{"A"}, \code{"A"}, 
                               \code{"A"}, \code{"A"}, the consensus will be \code{"A"}.
                         \item If the sequences have \code{"N"}, \code{"A"}, \code{"G"}, 
                               \code{"A"}, \code{"A"}, the consensus will be \code{"R"}.
                         \item If the sequences have \code{"-"}, \code{"-"}, \code{"."}, 
                               \code{"."}, \code{"."}, the consensus will be \code{"-"}.
                         \item If the sequences have \code{"-"}, \code{"-"}, \code{"-"}, 
                               \code{"-"}, \code{"-"}, the consensus will be \code{"-"}.
                         \item If the sequences have \code{"."}, \code{"."}, \code{"."}, 
                               \code{"."}, \code{"."}, the consensus will be \code{"."}.
                   }
                   
             \item \code{method="mostMutated"} and \code{method="leastMutated"}
             
                   These methods return the most/least mutated sequence as the consensus 
                   sequence. 
                   
                   When there are ties (multiple sequences have the maximal/minimal mutation
                   frequency), these methods can be deterministic or stochastic, depending on 
                   additional parameters.
                   
                   \itemize{
                        \item With \code{breakTiesStochastic=TRUE}, ties are resolved 
                              stochastically by randomly picking a sequence out of 
                              sequences with the maximal/minimal mutation frequency.
                        \item When \code{breakTiesByColumns} is supplied, ties are resolved
                              deterministically. Column by column, a function is applied on 
                              the column and sequences with column value matching the 
                              functional value are retained, until ties are resolved or 
                              columns run out. In the latter case, the first remaining 
                              sequence is taken as the consensus.
                        \item When \code{breakTiesStochastic=TRUE} and 
                              \code{breakTiesByColumns} is also supplied, 
                              \code{breakTiesStochastic} takes precedence over 
                              \code{breakTiesByColumns}.
                        \item When \code{breakTiesStochastic=FALSE} and 
                              \code{breakTiesByColumns} is not supplied (i.e. \code{NULL}), 
                              the sequence that appears first amongst the ties is taken 
                              as the consensus.
                   }
         
         }
}

\section{Choosing ambiguous characters}{
 
         
         Ambiguous characters may be present in the returned consensuses when using the
         \code{"catchAll"} method and when using the \code{"thresholdedFreq"} or 
         \code{"mostCommon"} methods with \code{includeAmbiguous=TRUE}. 
         
         The rules on choosing ambiguous characters are as follows:
         
         \itemize{
              \item If a position contains only \code{"N"} across sequences, the consensus 
                    at that position is \code{"N"}.
              \item If a position contains one or more of \code{"A"}, \code{"T"}, 
                    \code{"G"}, or \code{"C"}, the consensus will be an IUPAC character 
                    representing all of the characters present, regardless of whether 
                    \code{"N"}, \code{"-"}, or \code{"."} is present.
              \item If a position contains only \code{"-"} and \code{"."} across sequences, 
                    the consensus at that position is taken to be \code{"-"}. 
              \item If a position contains only one of \code{"-"} or \code{"."} across 
                    sequences, the consensus at that position is taken to be the character 
                    present. 
         }
}

\section{Cautions}{
 

         \itemize{
              \item   Note that this function does not perform multiple sequence alignment. 
                      As a prerequisite, it is assumed that the sequences in 
                      \code{sequenceColumn} and \code{germlineColumn} have been aligned 
                      somehow. In the case of immunoglobulin repertoire analysis, this 
                      usually means that the sequences are IMGT-gapped.
              \item   When using the \code{"mostMutated"} and \code{"leastMutated"} methods, 
                      if you supply both \code{muFreqColumn} and \code{regionDefinition},
                      it is your responsibility to ensure that the mutation frequency in
                      \code{muFreqColumn} was calculated with sequence lengths restricted 
                      to the \strong{same} \code{regionDefinition} you are supplying. 
                      Otherwise, the "most/least mutated" sequence you obtain might not 
                      be the most/least mutated given the \code{regionDefinition} supplied, 
                      because your mutation frequency was based on a 
                      \code{regionDefinition} different from the one supplied.
              \item   If you intend to run \code{collapseClones} before 
                      building a 5-mer targeting model, you \strong{must} choose 
                      parameters such that your collapsed clonal consensuses do 
                      \strong{not} include ambiguous characters. This is because the 
                      targeting model functions do NOT support ambiguous characters 
                      in their inputs.
              }
}

\examples{
# Subset example data
data(ExampleDb, package="alakazam")
db <- subset(ExampleDb, ISOTYPE \%in\% c("IgA", "IgG") & SAMPLE == "+7d" &
                        CLONE \%in\% c("3100", "3141", "3184"))

# thresholdedFreq method, resolving ties deterministically without using ambiguous characters
clones <- collapseClones(db, method="thresholdedFreq", minimumFrequency=0.6,
                         includeAmbiguous=FALSE, breakTiesStochastic=FALSE)

# mostCommon method, resolving ties deterministically using ambiguous characters
clones <- collapseClones(db, method="mostCommon", 
                         includeAmbiguous=TRUE, breakTiesStochastic=FALSE)

# Make a copy of db that has a mutation frequency column
db2 <- observedMutations(db, frequency=TRUE, combine=TRUE)

# mostMutated method, resolving ties stochastically
clones <- collapseClones(db2, method="mostMutated", muFreqColumn="MU_FREQ", 
                         breakTiesStochastic=TRUE, breakTiesByColumns=NULL)
                         
# mostMutated method, resolving ties deterministically using additional columns
clones <- collapseClones(db2, method="mostMutated", muFreqColumn="MU_FREQ", 
                         breakTiesStochastic=FALSE, 
                         breakTiesByColumns=list(c("DUPCOUNT"), c(max)))

# Build consensus for V segment only
# Capture all nucleotide variations using ambiguous characters 
clones <- collapseClones(db, method="catchAll", regionDefinition=IMGT_V)

# Return the same number of rows as the input
clones <- collapseClones(db, method="mostCommon", expandedDb=TRUE)

}
\seealso{
See \link{IMGT_SCHEMES} for a set of predefined \link{RegionDefinition} objects.
}
