ProteoWizard
Serializer_pepXML_Test.cpp
Go to the documentation of this file.
1 //
2 // $Id: Serializer_pepXML_Test.cpp 6943 2014-11-26 17:07:14Z chambm $
3 //
4 //
5 // Original author: Matt Chambers <matt.chambers .@. vanderbilt.edu>
6 //
7 // Copyright 2010 Vanderbilt University - Nashville, TN 37232
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
21 
22 
23 #include "Serializer_pepXML.hpp"
24 #include "Diff.hpp"
25 #include "References.hpp"
26 #include "examples.hpp"
31 #include "TextWriter.hpp"
32 #include "boost/range/adaptor/transformed.hpp"
33 #include "boost/range/algorithm/max_element.hpp"
34 #include "boost/range/algorithm/min_element.hpp"
35 #include "boost/range/algorithm_ext/erase.hpp"
36 #include <cstring>
37 
38 
39 using namespace pwiz::identdata;
40 using namespace pwiz::identdata::examples;
41 using namespace pwiz::util;
42 namespace proteome = pwiz::proteome;
43 
// Optional verbose-output stream. It stays null unless main() is started with
// "-v", in which case it is pointed at cout; every use is guarded by `if (os_)`.
44 ostream* os_ = 0;
45 
// Function object that maps an EnzymePtr to its terminalSpecificity as an int.
// NOTE(review): listing line 46 (the struct header) is absent from this extract;
// presumably `struct EnzymePtr_specificity`, the name passed to
// boost::adaptors::transformed in stripUnmappedMetadata - confirm.
47 {
48  typedef int result_type;
49  int operator()(const EnzymePtr& x) const {return x->terminalSpecificity;}
50 };
51 
// Function object that maps an EnzymePtr to its missedCleavages count.
// NOTE(review): listing line 52 (the struct header) is absent from this extract;
// presumably `struct EnzymePtr_missedCleavages`, the name passed to
// boost::adaptors::transformed in stripUnmappedMetadata - confirm.
53 {
54  typedef int result_type;
55  int operator()(const EnzymePtr& x) const {return x->missedCleavages;}
56 };
57 
// Predicate that is true for a UserParam whose name equals the string given at
// construction; used with remove_erase_if in testSerializeReally.
// NOTE(review): listing line 58 (the struct header) is absent from this extract;
// presumably `struct UserParamNameIs` - confirm.
59 {
    // stores a copy of the name to compare against
60  UserParamNameIs(const string& name) : name_(name) {}
61 
    // exact (case-sensitive) string comparison on the parameter name
62  bool operator() (const UserParam& up) const { return up.name == name_; }
63 
64  string name_;
65 };
66 
// Mutates `mzid` in place, clearing or normalizing the fields that (per the
// comments below) pepXML cannot represent, so that the object can be compared
// against the result of a pepXML write/read round trip.
// NOTE(review): listing line 67 (the function header) is absent from this
// extract; the cross-reference list at the end of the file gives it as
// `void stripUnmappedMetadata(IdentData &mzid)`.
68 {
    // top-level metadata that is simply discarded
69  mzid.bibliographicReference.clear();
70  mzid.analysisSampleCollection.samples.clear();
71  mzid.auditCollection.clear();
72  mzid.provider = Provider();
73  mzid.dataCollection.inputs.sourceFile.clear();
74 
    // drop per-software details: URI, customizations and the contact role
75  BOOST_FOREACH(AnalysisSoftwarePtr& as, mzid.analysisSoftwareList)
76  {
77  as->URI.clear();
78  as->customizations.clear();
79  as->contactRolePtr.reset();
80  }
81 
    // NOTE(review): listing line 82 is absent from this extract; `sip`, used
    // below, is presumably declared there as a reference to
    // spectrumIdentificationProtocol[0] - confirm.
83 
84  // pepXML only provides a single min_number_termini and max_num_internal_cleavages for all enzymes
    // so every enzyme is assigned the smallest specificity and the largest
    // missed-cleavage count found in the list
85  int minSpecificity = *boost::range::min_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_specificity()));
86  int maxMissedCleavages = *boost::range::max_element(sip.enzymes.enzymes | boost::adaptors::transformed(EnzymePtr_missedCleavages()));
87  BOOST_FOREACH(const EnzymePtr& ez, sip.enzymes.enzymes)
88  {
89  ez->terminalSpecificity = (proteome::Digestion::Specificity) minSpecificity;
90  ez->missedCleavages = maxMissedCleavages;
91  }
92 
93  // pepXML doesn't map these elements
94  sip.massTable.clear();
95  sip.threshold.clear();
96  sip.databaseFilters.clear();
97  sip.databaseTranslation.reset();
98 
99  // pepXML doesn't map these attributes
100  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->name.clear();
101  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->version.clear();
102  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->releaseDate.clear();
103  mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->databaseName.clear();
104 
105  // pepXML doesn't reliably store location or file format
     // reduce the spectra location to its file name with the extension removed
106  string& location = mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->location;
107  location = BFS_STRING(bfs::path(location).replace_extension("").filename());
108  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->fileFormat = CVParam();
109 
     // same reduction for the search database location
110  string& location2 = mzid.analysisCollection.spectrumIdentification[0]->searchDatabase[0]->location;
111  location2 = BFS_STRING(bfs::path(location2).replace_extension("").filename());
112 
113  // pepXML doesn't support protein sequences
     // the id is rebuilt as "DBSeq_" followed by the accession
114  BOOST_FOREACH(DBSequencePtr& dbSequence, mzid.sequenceCollection.dbSequences)
115  {
116  dbSequence->seq.clear();
117  dbSequence->length = 0;
118  dbSequence->id = "DBSeq_" + dbSequence->accession;
119  }
120 
121  // pepXML can only support one mass type (we pick the max mass in case one of them is 0)
     // both deltas of every modification are set to that same maximum
122  BOOST_FOREACH(PeptidePtr& peptide, mzid.sequenceCollection.peptides)
123  BOOST_FOREACH(ModificationPtr& mod, peptide->modification)
124  mod->monoisotopicMassDelta = mod->avgMassDelta = max(mod->monoisotopicMassDelta, mod->avgMassDelta);
125 
126  // pepXML doesn't support fragment metadata
127  mzid.dataCollection.analysisData.spectrumIdentificationList[0]->fragmentationTable.clear();
128 
     // visit every identification item of every result in the first list
129  BOOST_FOREACH(SpectrumIdentificationResultPtr& sir, mzid.dataCollection.analysisData.spectrumIdentificationList[0]->spectrumIdentificationResult)
130  BOOST_FOREACH(SpectrumIdentificationItemPtr& sii, sir->spectrumIdentificationItem)
131  {
132  // pepXML doesn't support fragment metadata or mass tables
133  sii->fragmentation.clear();
134  sii->massTablePtr.reset();
135 
136  for (size_t i=0; i < sii->peptideEvidencePtr.size(); ++i)
137  {
138  PeptideEvidence& pe = *sii->peptideEvidencePtr[i];
139 
140  // pepXML does not store peptide start and end offsets
141  pe.start = pe.end = 0;
142 
143  // pepXML's alternative_proteins do not store prev/next AA or missed cleavages
     // only evidence after the first (i > 0) gets the '?' placeholders
144  if (i > 0)
145  pe.pre = pe.post = '?';
146  }
147  }
148 
149  // pepXML doesn't have protein assembly
     // NOTE(review): listing lines 150-151 are absent from this extract, so the
     // statements the comment above introduces are not visible here.
152 
153  // pepXML expects the residues to be '.' or an amino acid list
     // an empty residue list is therefore replaced by the single entry '.'
154  BOOST_FOREACH(SearchModificationPtr& sm, mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->modificationParams)
155  if (sm->residues.empty())
156  sm->residues.push_back('.');
157 }
158 
// Checks the serialized pepXML text `str` for the attribute strings the writer
// is expected to emit. Each check is a plain substring test via bal::contains;
// a missing substring fails the corresponding unit_assert.
159 void testTranslation(const string& str)
160 {
161  // test that search engine name is written using preferred name
162  unit_assert(bal::contains(str, "search_engine=\"Mascot\""));
163 
164  // test that score names are written using preferred name
165  unit_assert(bal::contains(str, "name=\"ionscore\""));
166  unit_assert(bal::contains(str, "name=\"homologyscore\""));
167  unit_assert(bal::contains(str, "name=\"identityscore\""));
168  unit_assert(bal::contains(str, "name=\"expect\""));
169  unit_assert(bal::contains(str, "name=\"an extra score\""));
170 
171  // test that nativeID is preserved
172  unit_assert(bal::contains(str, "spectrumNativeID=\"controllerType=0 controllerNumber=1 scan=420\""));
173 }
174 
// Round-trip check: writes `mzid` as pepXML into a string, reads that string
// back into a second IdentData, and asserts that the two objects do not differ.
// NOTE(review): listing line 175 (the function header) is absent from this
// extract; the cross-reference list at the end of the file gives it as
// `void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)`.
176 {
177  if (os_) *os_ << "begin testSerialize" << endl;
178 
     // serialize to an in-memory stream, using "tiny.pepXML" as the file path argument
179  Serializer_pepXML serializer(config);
180  ostringstream oss;
181  serializer.write(oss, mzid, "tiny.pepXML");
182 
183  if (os_) *os_ << "oss:\n" << oss.str() << endl;
     // the text-level checks are run only when spectrum queries are being read
184  if (config.readSpectrumQueries)
185  testTranslation(oss.str());
186 
     // parse the text just written into a fresh object
187  shared_ptr<istringstream> iss(new istringstream(oss.str()));
188  IdentData mzid2;
189  serializer.read(iss, mzid2);
190 
191  References::resolve(mzid2);
192 
193  // remove DecoyPrefix userParam that is redundant with the decoy DB prefix cvParam
194  boost::range::remove_erase_if(mzid2.analysisProtocolCollection.spectrumIdentificationProtocol[0]->additionalSearchParams.userParams, UserParamNameIs("DecoyPrefix"));
195 
     // any reported difference is printed in verbose mode and fails the test
196  Diff<IdentData, DiffConfig> diff(mzid, mzid2);
197  if (os_ && diff) *os_ << diff << endl;
198  unit_assert(!diff);
199 }
200 
// Builds an IdentData object, strips the metadata pepXML cannot carry, and then
// reconfigures it for several scenarios (non-specific enzyme, sense="N" enzyme,
// custom enzyme, readSpectrumQueries == false).
// NOTE(review): this extract is missing listing lines 201 (the function
// header, `void testSerialize()` per the cross-reference list), 204, 206, 221,
// 236, 244, 256-258 and 260. Presumably those hold the initialization of
// `mzid` and the testSerializeReally calls that follow each scenario - confirm
// against the original file.
202 {
203  IdentData mzid;
205  stripUnmappedMetadata(mzid);
207 
208 
209  // test non-specific enzyme
     // replace the enzyme list with a single NonSpecific Trypsin/P entry
210  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
211  EnzymePtr noEnzyme(new Enzyme);
212  noEnzyme->id = "ENZ_1";
213  noEnzyme->cTermGain = "OH";
214  noEnzyme->nTermGain = "H";
215  noEnzyme->missedCleavages = 2;
216  noEnzyme->minDistance = 1;
217  noEnzyme->terminalSpecificity = proteome::Digestion::NonSpecific;
218  noEnzyme->siteRegexp = "(?<=[KR])";
219  noEnzyme->enzymeName.set(MS_Trypsin_P);
220  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(noEnzyme);
222 
223 
224  // test sense="N" enzymes
     // replace the enzyme list with a single FullySpecific Asp-N entry
225  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.clear();
226  EnzymePtr aspN(new Enzyme);
227  aspN->id = "ENZ_1";
228  aspN->cTermGain = "OH";
229  aspN->nTermGain = "H";
230  aspN->missedCleavages = 2;
231  aspN->minDistance = 1;
232  aspN->terminalSpecificity = proteome::Digestion::FullySpecific;
233  aspN->siteRegexp = "(?=[BD])";
234  aspN->enzymeName.set(MS_Asp_N);
235  mzid.analysisProtocolCollection.spectrumIdentificationProtocol[0]->enzymes.enzymes.push_back(aspN);
237 
     // mutate the same enzyme (still held in the list through the shared
     // pointer) into a semi-specific one identified only by a user param
     // named "custom"
238  aspN->missedCleavages = 4;
239  aspN->minDistance = 2;
240  aspN->terminalSpecificity = proteome::Digestion::SemiSpecific;
241  aspN->siteRegexp = "(?=[BND])";
242  aspN->enzymeName.clear();
243  aspN->enzymeName.userParams.push_back(UserParam("custom"));
245 
246 
247  // test with readSpectrumQueries == false
248 
249  // clear the original SequenceCollection
250  mzid.sequenceCollection.dbSequences.clear();
251  mzid.sequenceCollection.peptides.clear();
252  mzid.sequenceCollection.peptideEvidence.clear();
253 
254  // clear the original analysis data
255  mzid.analysisCollection.spectrumIdentification[0]->inputSpectra[0]->spectrumIDFormat = CVParam();
256  mzid.analysisCollection.spectrumIdentification[0]->spectrumIdentificationListPtr.reset();
259 
261 }
262 
// Checks the sense / cut / no_cut strings that pepXMLSpecificity() returns for
// an Enzyme identified in different ways: by CV term, by user param, by
// Enzyme::name, and by siteRegexp alone.
// NOTE(review): this extract is missing listing lines 263 (the function
// header, `void testPepXMLSpecificity()` per the cross-reference list), 268,
// 275, 279, 286, 293, 300, 303, 309, 313, 315, 319, 325, 356, 362, 368 and 374.
// Presumably they hold the enzyme setup for several cases, the loop header over
// cleavage agents, and further assertions (likely on no_cut) - confirm against
// the original file.
264 {
265  PepXMLSpecificity result;
266  Enzyme ez;
267 
     // NOTE(review): the enzyme setup for this first case (listing line 268) is not visible
269  result = pepXMLSpecificity(ez);
270  unit_assert_operator_equal("C", result.sense);
271  unit_assert_operator_equal("KR", result.cut);
272  unit_assert_operator_equal("P", result.no_cut);
273 
274  ez.enzymeName.clear();
276  result = pepXMLSpecificity(ez);
277  unit_assert_operator_equal("C", result.sense);
278  unit_assert_operator_equal("KR", result.cut);
280 
     // enzyme identified only by a user param named "trypsin/p"
281  ez.enzymeName.clear();
282  ez.enzymeName.userParams.push_back(UserParam("trypsin/p"));
283  result = pepXMLSpecificity(ez);
284  unit_assert_operator_equal("C", result.sense);
285  unit_assert_operator_equal("KR", result.cut);
287 
     // enzyme identified only by Enzyme::name
288  ez.enzymeName.clear();
289  ez.name = "trypsin/p";
290  result = pepXMLSpecificity(ez);
291  unit_assert_operator_equal("C", result.sense);
292  unit_assert_operator_equal("KR", result.cut);
294 
     // enzyme identified by the Asp-N CV term: N-terminal sense, cuts at B or D
295  ez.name.clear();
296  ez.enzymeName.set(MS_Asp_N);
297  result = pepXMLSpecificity(ez);
298  unit_assert_operator_equal("N", result.sense);
299  unit_assert_operator_equal("BD", result.cut);
301 
302  ez.enzymeName.clear();
304  result = pepXMLSpecificity(ez);
305  unit_assert_operator_equal("C", result.sense);
306  unit_assert_operator_equal("KR", result.cut);
307  unit_assert_operator_equal("P", result.no_cut);
308 
310  result = pepXMLSpecificity(ez);
311  unit_assert_operator_equal("C", result.sense);
312  unit_assert_operator_equal("KR", result.cut);
314 
316  result = pepXMLSpecificity(ez);
317  unit_assert_operator_equal("N", result.sense);
318  unit_assert_operator_equal("BD", result.cut);
320 
321 
322  // REMEMBER: update the pepXMLSpecificity function when new CV enzymes are added
     // NOTE(review): `cleavageAgent` below is presumably the loop variable of an
     // iteration over the CV cleavage agents on the missing listing line 325.
     // An exception from pepXMLSpecificity is printed to cerr and recorded,
     // and the assertion after the try/catch then fails.
323  bool allCleavageAgentsHandled = true;
324  ez.siteRegexp.clear();
326  try
327  {
328  ez.enzymeName.clear();
329  ez.enzymeName.set(cleavageAgent);
330  result = pepXMLSpecificity(ez);
331  }
332  catch (exception& e)
333  {
334  cerr << e.what() << endl;
335  allCleavageAgentsHandled = false;
336  }
337  unit_assert(allCleavageAgentsHandled);
338 
339 
     // From here on only siteRegexp is changed between cases. The expected
     // complement string "ABCDFGHIJKLMNOPSUVZ" is the uppercase alphabet
     // without Q, W, E, R, T, Y and also without X.
     // lookbehind and lookahead both positive: cut after QWERTY, no_cut is the complement
340  ez.siteRegexp = "(?<=[QWERTY])(?=[QWERTY])";
341  result = pepXMLSpecificity(ez);
342  unit_assert_operator_equal("C", result.sense);
343  unit_assert_operator_equal("QWERTY", result.cut);
344  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.no_cut);
345 
     // both negated: cut and no_cut swap
346  ez.siteRegexp = "(?<![QWERTY])(?![QWERTY])";
347  result = pepXMLSpecificity(ez);
348  unit_assert_operator_equal("C", result.sense);
349  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
350  unit_assert_operator_equal("QWERTY", result.no_cut);
351 
     // positive lookbehind only: C-terminal sense
352  ez.siteRegexp = "(?<=[QWERTY])";
353  result = pepXMLSpecificity(ez);
354  unit_assert_operator_equal("C", result.sense);
355  unit_assert_operator_equal("QWERTY", result.cut);
357 
     // positive lookahead only: N-terminal sense
358  ez.siteRegexp = "(?=[QWERTY])";
359  result = pepXMLSpecificity(ez);
360  unit_assert_operator_equal("N", result.sense);
361  unit_assert_operator_equal("QWERTY", result.cut);
363 
     // negative lookbehind only: C-terminal sense, cut is the complement
364  ez.siteRegexp = "(?<![QWERTY])";
365  result = pepXMLSpecificity(ez);
366  unit_assert_operator_equal("C", result.sense);
367  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
369 
     // negative lookahead only: N-terminal sense, cut is the complement
370  ez.siteRegexp = "(?![QWERTY])";
371  result = pepXMLSpecificity(ez);
372  unit_assert_operator_equal("N", result.sense);
373  unit_assert_operator_equal("ABCDFGHIJKLMNOPSUVZ", result.cut);
375 }
376 
377 
// Checks stripChargeFromConventionalSpectrumId(): ids ending in
// "<number>.<number>.<charge>" lose the final ".<charge>" field, while ids that
// do not have that trailing field are returned unchanged.
// NOTE(review): listing lines 378 (the function header,
// `void testStripChargeFromConventionalSpectrumId()` per the cross-reference
// list) and 388 are absent from this extract.
379 {
     // trailing charge field is removed, including a two-digit charge
380  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123.2"));
381  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123.12"));
     // scan numbers equal to the charge value: only the last field is dropped
382  unit_assert_operator_equal("basename.2.2", stripChargeFromConventionalSpectrumId("basename.2.2.2"));
383  unit_assert_operator_equal("basename.ext.3.3", stripChargeFromConventionalSpectrumId("basename.ext.3.3.3"));
     // ids without a charge field come back unchanged
384  unit_assert_operator_equal("basename.123.123", stripChargeFromConventionalSpectrumId("basename.123.123"));
385  unit_assert_operator_equal("basename.ext.123.123", stripChargeFromConventionalSpectrumId("basename.ext.123.123"));
     // "locus:"-style id: the final ".2" is removed
386  unit_assert_operator_equal("locus:1.1.1.123", stripChargeFromConventionalSpectrumId("locus:1.1.1.123.2"));
     // id with a single numeric field is returned unchanged
387  unit_assert_operator_equal("basename.123", stripChargeFromConventionalSpectrumId("basename.123"));
389 }
390 
391 
// Test driver. Passing "-v" as the first argument routes verbose output to
// cout; any exception escaping the tests is reported through TEST_FAILED.
// NOTE(review): listing lines 399, 400 and 412 are absent from this extract;
// presumably they call the other test functions and end with TEST_EPILOG (all
// named in the cross-reference list at the end of the file) - confirm.
392 int main(int argc, char** argv)
393 {
394  TEST_PROLOG(argc, argv)
395 
396  try
397  {
     // enable verbose mode only when the first argument is exactly "-v"
398  if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
401  testSerialize();
402  }
403  catch (exception& e)
404  {
405  TEST_FAILED(e.what())
406  }
407  catch (...)
408  {
409  TEST_FAILED("Caught unknown exception.")
410  }
411 
413 }
AnalysisProtocolCollection analysisProtocolCollection
Definition: IdentData.hpp:1018
int operator()(const EnzymePtr &x) const
MS_Asp_N
Asp-N: Endoproteinase Asp-N.
Definition: cv.hpp:4165
void testStripChargeFromConventionalSpectrumId()
std::vector< PeptideEvidencePtr > peptideEvidence
Definition: IdentData.hpp:656
Implementation of PeptideEvidenceType from the mzIdentML schema.
Definition: IdentData.hpp:626
AnalysisCollection analysisCollection
Definition: IdentData.hpp:1016
void stripUnmappedMetadata(IdentData &mzid)
Implementation of EnzymeType from the mzIdentML schema.
Definition: IdentData.hpp:408
MS_Trypsin
Trypsin: Enzyme trypsin.
Definition: cv.hpp:4027
PWIZ_API_DECL proteome::Peptide peptide(const Peptide &peptide)
creates a proteome::Peptide from an identdata::Peptide
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition: diff_std.hpp:142
std::vector< SpectrumIdentificationListPtr > spectrumIdentificationList
Definition: IdentData.hpp:962
#define TEST_EPILOG
Definition: unit.hpp:182
void testPepXMLSpecificity()
NonSpecific
Definition: Digestion.hpp:119
std::vector< SpectrumIdentificationProtocolPtr > spectrumIdentificationProtocol
Definition: IdentData.hpp:911
PWIZ_API_DECL void initializeBasicSpectrumIdentification(IdentData &mzid)
PWIZ_API_DECL PepXMLSpecificity pepXMLSpecificity(const Enzyme &ez)
converts an identdata::Enzyme into a pepXML cut/no_cut/sense tuple
MZIDData <-> pepXML stream serialization.
void clear()
clears the collections
DataCollection dataCollection
Definition: IdentData.hpp:1020
std::vector< EnzymePtr > enzymes
Definition: IdentData.hpp:435
void diff(const string &filename1, const string &filename2)
Serializer_pepXML configuration.
Implementation of ProviderType from the mzIdentML schema.
Definition: IdentData.hpp:234
std::vector< PeptidePtr > peptides
Definition: IdentData.hpp:655
Uncontrolled user parameters (essentially allowing free text). Before using these, one should verify whether there is an appropriate CV term available, and if so, use the CV term instead.
Definition: ParamTypes.hpp:185
void testSerializeReally(IdentData &mzid, const Serializer_pepXML::Config &config)
void testTranslation(const string &str)
void testSerialize()
SemiSpecific
either or both termini must match digestion motif(s)
Definition: Digestion.hpp:120
std::vector< UserParam > userParams
a collection of uncontrolled user terms
Definition: ParamTypes.hpp:253
#define unit_assert_operator_equal(expected, actual)
Definition: unit.hpp:92
AnalysisSampleCollection analysisSampleCollection
Definition: IdentData.hpp:1012
Implementation of the MzIdentMLType from the mzIdentML schema.
Definition: IdentData.hpp:993
int operator()(const EnzymePtr &x) const
MS_Trypsin_P
Trypsin/P: Cleavage agent Trypsin/P.
Definition: cv.hpp:4192
boost::shared_ptr< Peptide > PeptidePtr
Definition: TraData.hpp:236
std::string name
the name for the parameter.
Definition: ParamTypes.hpp:188
ParamContainer enzymeName
Definition: IdentData.hpp:420
#define BFS_STRING(p)
Definition: Filesystem.hpp:53
static const std::set< CVID > & getCleavageAgents()
returns the set of predefined cleavage agents defined in the PSI-MS CV
std::vector< SourceFilePtr > sourceFile
Definition: IdentData.hpp:946
std::vector< SpectrumIdentificationPtr > spectrumIdentification
Definition: IdentData.hpp:897
PWIZ_API_DECL void resolve(ContactRole &cr, IdentData &mzid)
std::vector< BibliographicReferencePtr > bibliographicReference
Definition: IdentData.hpp:1022
Implementation of SpectrumIdentificationProtocolType from the mzIdentML schema.
Definition: IdentData.hpp:545
std::vector< ContactPtr > auditCollection
Definition: IdentData.hpp:1010
#define TEST_FAILED(x)
Definition: unit.hpp:176
std::vector< DBSequencePtr > dbSequences
Definition: IdentData.hpp:654
ostream * os_
UserParamNameIs(const string &name)
SequenceCollection sequenceCollection
Definition: IdentData.hpp:1014
void set(CVID cvid, const std::string &value="", CVID units=CVID_Unknown)
set/add a CVParam (not recursive)
#define TEST_PROLOG(argc, argv)
Definition: unit.hpp:174
PWIZ_API_DECL CVID cleavageAgent(const Enzyme &ez)
returns a cleavage agent CVID for an identdata::Enzyme
KernelTraitsBase< Kernel >::space_type::abscissa_type x
std::vector< AnalysisSoftwarePtr > analysisSoftwareList
Definition: IdentData.hpp:1006
void read(boost::shared_ptr< std::istream > is, IdentData &mzid, const pwiz::util::IterationListenerRegistry *=0) const
read in MZIDData object from a pepXML istream
ProteinDetectionListPtr proteinDetectionListPtr
Definition: IdentData.hpp:963
#define unit_assert(x)
Definition: unit.hpp:85
static const std::string & getCleavageAgentRegex(CVID agentCvid)
returns the official PSI Perl regular expression defining the places in a polypeptide or protein that...
PWIZ_API_DECL std::string stripChargeFromConventionalSpectrumId(const std::string &id)
strips charge state from known conventions of the pepXML spectrum attribute; used to find a unique id...
void write(std::ostream &os, const IdentData &mzid, const std::string &filepath, const pwiz::util::IterationListenerRegistry *=0) const
write MZIDData object to ostream as pepXML
represents a tag-value pair, where the tag comes from the controlled vocabulary
Definition: ParamTypes.hpp:44
int main(int argc, char **argv)
Implementation of ProteinDetectionType from the mzIdentML schema.
Definition: IdentData.hpp:872