ProteoWizard
Digestion.hpp
Go to the documentation of this file.
1 //
2 // $Id: Digestion.hpp 6909 2014-11-19 17:18:29Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2006 Louis Warschaw Prostate Cancer Center
8 // Cedars Sinai Medical Center, Los Angeles, California 90048
9 // Copyright 2008 Vanderbilt University - Nashville, TN 37232
10 //
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 //
15 // http://www.apache.org/licenses/LICENSE-2.0
16 //
17 // Unless required by applicable law or agreed to in writing, software
18 // distributed under the License is distributed on an "AS IS" BASIS,
19 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 // See the License for the specific language governing permissions and
21 // limitations under the License.
22 //
23 
24 
25 #ifndef _DIGESTION_HPP_
26 #define _DIGESTION_HPP_
27 
28 
30 #include "pwiz/data/common/cv.hpp"
32 #include "Peptide.hpp"
33 #include "boost/shared_ptr.hpp"
34 #include <string>
35 #include <limits>
36 #include <set>
37 
38 
39 namespace pwiz {
40 namespace proteome {
41 
42 
43 using namespace pwiz::cv;
44 
45 
46 /// peptide subclass that contains extra metadata provided by digestion
48 {
49  public:
50 
51  DigestedPeptide(const std::string& sequence);
52  DigestedPeptide(const char* sequence);
53 
54  DigestedPeptide(std::string::const_iterator begin,
55  std::string::const_iterator end,
56  size_t offset,
57  size_t missedCleavages,
58  bool NTerminusIsSpecific,
59  bool CTerminusIsSpecific,
60  std::string NTerminusPrefix = "",
61  std::string CTerminusSuffix = "");
62 
64  size_t offset,
65  size_t missedCleavages,
66  bool NTerminusIsSpecific,
67  bool CTerminusIsSpecific,
68  std::string NTerminusPrefix = "",
69  std::string CTerminusSuffix = "");
70 
72  DigestedPeptide& operator=(const DigestedPeptide&);
73  virtual ~DigestedPeptide();
74 
75  /// returns the zero-based offset of the N terminus of the peptide
76  /// in the polypeptide from which it was digested
77  size_t offset() const;
78 
79  /// returns the number of missed cleavage sites in the peptide
80  size_t missedCleavages() const;
81 
82  /// returns the number of termini that matched the digestion rules
83  size_t specificTermini() const;
84 
85  /// returns true iff the N terminus matched the digestion rules
86  bool NTerminusIsSpecific() const;
87 
88  /// returns true iff the C terminus matched the digestion rules
89  bool CTerminusIsSpecific() const;
90 
91  /// returns residue preceding digestion site
92  std::string NTerminusPrefix() const;
93 
94  /// returns residue following digestion site
95  std::string CTerminusSuffix() const;
96 
97  /// returns true iff peptide sequences, masses, and all digestion metadata are equal
98  bool operator==(const DigestedPeptide& rhs) const;
99 
100  private:
101  size_t offset_;
105  std::string NTerminusPrefix_;
106  std::string CTerminusSuffix_;
107 };
108 
109 
110 /// enumerates the peptides from proteolytic digestion of a polypeptide or protein;
112 {
113  public:
114 
115  /// sets the number of peptide termini that must match a digestion motif
116  /// note: castable to int; i.e. non=0, semi=1, fully=2
117  enum PWIZ_API_DECL Specificity
118  {
119  NonSpecific = 0, ///< neither terminus is required to match the digestion motif(s)
120  SemiSpecific = 1, ///< either or both termini must match digestion motif(s)
121  FullySpecific = 2 ///< both termini must match digestion motif(s)
122  };
123 
124  /// sets constraints for valid peptides produced by iterating the digestion
126  {
128 
129  //double minimumMass;
130  //double maximumMass;
131 
134 
135  Specificity minimumSpecificity;
136 
138 
139  Config(int maximumMissedCleavages = 100000,
140  //double minimumMass = 0,
141  //double maximumMass = 100000,
142  int minimumLength = 0,
143  int maximumLength = 100000,
144  Specificity minimumSpecificity = FullySpecific,
145  bool clipNTerminalMethionine = true);
146  };
147 
148  /// returns the set of predefined cleavage agents defined in the PSI-MS CV
149  static const std::set<CVID>& getCleavageAgents();
150 
151  /// returns the names of the set of predefined cleavage agents defined in the PSI-MS CV
152  static const std::vector<std::string>& getCleavageAgentNames();
153 
154  /// returns the cvid of the specified cleavage agent using a case-insensitive search,
155  /// or CVID_Unknown if the agent is not found
156  static CVID getCleavageAgentByName(const std::string& agentName);
157 
158  /// returns the cvid of the specified cleavage agent looking it up by the Perl regular expression,
159  /// or CVID_Unknown if the agent is not found (the regex pattern must match exactly)
160  static CVID getCleavageAgentByRegex(const std::string& agentRegex);
161 
162  /// returns the official PSI Perl regular expression defining the places in a
163  /// polypeptide or protein that the agent will cut.
164  static const std::string& getCleavageAgentRegex(CVID agentCvid);
165 
166  /// returns a modified version of a cleavage agent regex where any ambiguous AA symbols (BJXZ)
167  /// are augmented with their unambiguous counterparts (e.g. B -> [BND])
168  static std::string disambiguateCleavageAgentRegex(const std::string& cleavageAgentRegex);
169 
170  /// specifies digestion occurs by a commonly used cleavage agent
171  Digestion(const Peptide& polypeptide,
172  CVID cleavageAgent,
173  const Config& config = Config());
174 
175  /// specifies digestion occurs by a combination of commonly used cleavage agents
176  Digestion(const Peptide& polypeptide,
177  const std::vector<CVID>& cleavageAgents,
178  const Config& config = Config());
179 
180  /// specifies digestion occurs by a user-specified, zero-width Perl regular expression
181  /// example: "(?<=K)" means "cleaves after K"
182  /// example: "((?<=D))|((?=D))" means "cleaves before or after D"
183  /// example: "(?=[DE])" means "cleaves before D or E"
184  /// example: "(?<=[FYWLKR])(?!P)" means "cleaves after any single residue from FYWLKR except when it is followed by P"
185  Digestion(const Peptide& polypeptide,
186  const std::string& cleavageAgentRegex,
187  const Config& config = Config());
188 
189  /// specifies digestion occurs by a combination of user-specified, zero-width Perl regular expressions
190  /// example: "(?<=K)" means "cleaves after K"
191  /// example: "((?<=D))|((?=D))" means "cleaves before or after D"
192  /// example: "(?=[DE])" means "cleaves before D or E"
193  /// example: "(?<=[FYWLKR])(?!P)" means "cleaves after any single residue from FYWLKR except when it is followed by P"
194  Digestion(const Peptide& polypeptide,
195  const std::vector<std::string>& cleavageAgentRegexes,
196  const Config& config = Config());
197 
198  /// returns all instances of the given peptide in the polypeptide under digestion;
199  /// note: the filters set in Digestion::Config are respected!
200  std::vector<DigestedPeptide> find_all(const Peptide& peptide) const;
201 
202  /// returns the first instance of the given peptide in the polypeptide under digestion;
203  /// if offsetHint is provided, the search will begin at that offset;
204  /// throws runtime_error if no instance of the peptide is found;
205  /// note: the filters set in Digestion::Config are respected!
206  DigestedPeptide find_first(const Peptide& peptide, size_t offsetHint = 0) const;
207 
208 
209  ~Digestion();
210 
211 
212  private:
213  class Impl; // forward-declared for const_iterator
214 
215  public:
216 
217  /// provides forward-only, read-only iteration to enumerate peptides
219  {
220  public:
221  const_iterator(const const_iterator& rhs);
222  ~const_iterator();
223 
224  const DigestedPeptide& operator*() const;
225  const DigestedPeptide* operator->() const;
226  const_iterator& operator++();
227  const_iterator operator++(int);
228  bool operator!=(const const_iterator& that) const;
229  bool operator==(const const_iterator& that) const;
230 
231  typedef std::forward_iterator_tag iterator_category;
233  typedef size_t difference_type;
234  typedef value_type* pointer;
235  typedef value_type& reference;
236 
237  private:
238  const_iterator();
239  const_iterator(const Digestion& digestion);
240 
241  friend class Digestion;
242  friend class Digestion::Impl;
243 
244  class Impl;
245  boost::shared_ptr<Impl> impl_;
246  };
247 
248  const_iterator begin() const;
249  const_iterator end() const;
250 
251  private:
252  friend class const_iterator;
253  friend class const_iterator::Impl;
254  boost::shared_ptr<Impl> impl_;
255 };
256 
257 
258 } // namespace proteome
259 } // namespace pwiz
260 
261 
262 #endif // _DIGESTION_HPP_
std::forward_iterator_tag iterator_category
Definition: Digestion.hpp:231
provides forward-only, read-only iteration to enumerate peptides
Definition: Digestion.hpp:218
PWIZ_API_DECL proteome::Peptide peptide(const Peptide &peptide)
creates a proteome::Peptide from an identdata::Peptide
boost::shared_ptr< Impl > impl_
Definition: Digestion.hpp:254
NonSpecific
Definition: Digestion.hpp:119
sets constraints for valid peptides produced by iterating the digestion
Definition: Digestion.hpp:125
PWIZ_API_DECL std::vector< CVID > cleavageAgents(const Enzymes &enzymes)
returns a list of cleavage agent CVIDs for an identdata::Enzymes instance
represents a peptide or polypeptide (a sequence of amino acids)
Definition: Peptide.hpp:61
SemiSpecific
either or both termini must match digestion motif(s)
Definition: Digestion.hpp:120
PWIZ_API_DECL std::vector< std::string > cleavageAgentRegexes(const Enzymes &enzymes)
returns a list of regular expressions for an identdata::Enzymes instance
#define PWIZ_API_DECL
Definition: Export.hpp:32
PWIZ_API_DECL Formula operator*(const Formula &a, int scalar)
PWIZ_API_DECL bool operator==(const TruncatedLorentzianParameters &t, const TruncatedLorentzianParameters &u)
PWIZ_API_DECL std::string cleavageAgentRegex(const Enzyme &ez)
returns a regular expression for an identdata::Enzyme
PWIZ_API_DECL CVID cleavageAgent(const Enzyme &ez)
returns a cleavage agent CVID for an identdata::Enzyme
enumerates the peptides from proteolytic digestion of a polypeptide or protein;
Definition: Digestion.hpp:111
PWIZ_API_DECL bool operator!=(const TruncatedLorentzianParameters &t, const TruncatedLorentzianParameters &u)
peptide subclass that contains extra metadata provided by digestion
Definition: Digestion.hpp:47
Definition: cv.hpp:91