00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 
00038 
00039 
00040 
00041 
00042 
00043 
00044 
00045 
00046 
00047 #include <cassert>
00048 #include <fstream>
00049 #include <algorithm>
00050 #include "ncl/nxsmultiformat.h"
00051 #include "ncl/nxsstring.h"
00052 
// Maximum number of bytes that FileToCharBuffer keeps in memory at one time
// (0x80000 == 512 KiB).  Longer inputs are loaded in successive chunks.
const unsigned MAX_BUFFER_SIZE = 0x80000;

// Lower-case names of the formats accepted by MultiFormatReader.
// formatNameToCode() casts the index of a name in this array directly to
// MultiFormatReader::DataFormatType, so the order of the entries must be kept
// in step with that enum.
const char * gFormatNames[] = { "nexus",
                                "dnafasta",
                                "aafasta",
                                "rnafasta",
                                "dnaphylip",
                                "rnaphylip",
                                "aaphylip",
                                "discretephylip",
                                "dnaphylipinterleaved",
                                "rnaphylipinterleaved",
                                "aaphylipinterleaved",
                                "discretephylipinterleaved",
                                "dnarelaxedphylip",
                                "rnarelaxedphylip",
                                "aarelaxedphylip",
                                "discreterelaxedphylip",
                                "dnarelaxedphylipinterleaved",
                                "rnarelaxedphylipinterleaved",
                                "aarelaxedphylipinterleaved",
                                "discreterelaxedphylipinterleaved",
                                "dnaaln",
                                "rnaaln",
                                "aaaln",
                                "phyliptree",
                                "relaxedphyliptree",
                                "nexml",
                            };
// Number of entries in gFormatNames.  Computed from the array itself so that
// adding or removing a name can never leave the count out of date.
const unsigned gNumFormats = static_cast<unsigned>(sizeof(gFormatNames) / sizeof(gFormatNames[0]));
// Number of characters that readPhylipName() consumes for a taxon name when
// the strict (non-relaxed) PHYLIP naming rules are in effect.
const unsigned PHYLIP_NMLNGTH = 10;
00090 
00091 std::vector<std::string> MultiFormatReader::getFormatNames()
00092     {
00093     std::vector<std::string> v(gNumFormats);
00094     for (unsigned i = 0; i < gNumFormats; ++i)
00095         {
00096         v[i] = std::string(gFormatNames[i]);
00097         }
00098     return v;
00099     }
00100 
00101 
00102 
// Character-at-a-time cursor over the part of an input stream that lies
// between the stream's position at construction time and its end.  The data
// are held in a heap buffer of at most MAX_BUFFER_SIZE bytes which is reloaded
// from the stream as the cursor walks off its end.  The class also keeps a
// running line number and the position of the last newline so that callers
// can report (position, line, column) in error messages.
//
// The object owns `buffer`, so it must not be copied; the copy operations are
// declared private and left undefined.
class FileToCharBuffer
{
        char prevChar;              // last character of the previously loaded chunk; prev() returns it when pos == 0
        std::istream & inf;         // stream the data are read from
        unsigned remaining;         // bytes of the stream that have not yet been loaded into `buffer`
        unsigned pos;               // index within `buffer` of the current character
    public:
        unsigned totalSize;         // bytes between the stream position at construction and the end of the stream
    protected:
        unsigned lineNumber;        // starts at 1; bumped for each CR and for each LF that does not follow a CR
        unsigned prevNewlinePos;    // value of position() at the most recent CR or LF
    public:
        // Measures the stream and loads the first chunk (see the definition).
        FileToCharBuffer(std::istream & instream);

        // Loads the next chunk from the stream into `buffer` starting at index
        // `offset` and moves the cursor to `offset`.  Returns false (and does
        // nothing) when no unread bytes remain.
        bool refillBuffer(unsigned offset);

        // The character under the cursor.  `buffer` must be non-NULL.
        char current() const
            {
            return buffer[pos];
            }

        // Moves the cursor forward one character, reloading the buffer when
        // the end of the loaded chunk is reached.  Returns false, leaving the
        // cursor where it was, when there is no further character.
        bool advance()
            {
            if (pos + 1 >= inbuffer)
                {
                if (!refillBuffer(0))
                    return false;
                }
            else
                ++pos;
            const char c = current();
            if (c == 13)
                {
                // CR always starts a new line
                ++lineNumber;
                prevNewlinePos = position();
                }
            else if (c == 10)
                {
                // LF counts as a new line only when it is not the second half of CR-LF
                if (prev() != 13)
                    ++lineNumber;
                prevNewlinePos = position();
                }
            return true;
            }

        // advance(), and on success copy the new current character into `c`.
        bool advance_then_store(char & c)
            {
            if (!this->advance())
                return false;
            c = this->current();
            return true;
            }

        // Moves the cursor to the first character after the next line ending
        // and stores that character in `next` (see the definition).
        bool skip_to_beginning_of_line(char & next);

        // The character immediately before the cursor.  At the start of a
        // chunk this is the character saved from the end of the previous one
        // ('\n' before any refill has happened).
        char prev() const
            {
            if (pos == 0)
                return prevChar;
            return buffer[pos - 1];
            }

        ~FileToCharBuffer()
            {
            delete [] buffer;
            }

        // Offset of the cursor from the point at which reading began:
        // (bytes loaded before the current chunk) + pos.
        unsigned position() const
            {
            return totalSize +  pos - remaining - inbuffer;
            }

        // Current line number (the first line is 1).
        unsigned line() const
            {
            return lineNumber;
            }

        // Distance of the cursor from the most recent newline character, or 0
        // if the recorded newline position is ahead of the cursor.
        unsigned column() const
            {
            unsigned p = position();
            if (p < prevNewlinePos)
                return 0;
            return p - prevNewlinePos;
            }

        char * buffer;      // heap array holding the loaded chunk (NULL when the stream had no data); freed by the destructor
        unsigned inbuffer;  // number of bytes loaded by the most recent read into `buffer`

    private:
        // Not copyable: a copy would share `buffer` and free it twice.
        // Declared but intentionally never defined.
        FileToCharBuffer(const FileToCharBuffer &);
        FileToCharBuffer & operator=(const FileToCharBuffer &);
};
00191 
00192 
00193 void MultiFormatReader::ReadFilepath(const char * filepath, const char * formatName)
00194     {
00195     if (!formatName)
00196         return;
00197     DataFormatType f =  formatNameToCode(formatName);
00198     if (f == UNSUPPORTED_FORMAT)
00199         {
00200         NxsString m;
00201         m << "Unsupported format: " << formatName;
00202         throw NxsException(m);
00203         }
00204     this->ReadFilepath(filepath, f);
00205     }
00206 
00207 void MultiFormatReader::ReadStream(std::istream & inf, const char * formatName)
00208     {
00209     if (!formatName)
00210         return;
00211     DataFormatType f =  formatNameToCode(formatName);
00212     if (f == UNSUPPORTED_FORMAT)
00213         {
00214         NxsString m;
00215         m << "Unsupported format: " << formatName;
00216         throw NxsException(m);
00217         }
00218     this->ReadStream(inf, f);
00219     }
00220 
00221 FileToCharBuffer::FileToCharBuffer(std::istream & instream)
00222     :prevChar('\n'),
00223     inf(instream),
00224     pos(0),
00225     totalSize(0),
00226     lineNumber(1),
00227     prevNewlinePos(0),
00228     buffer(0L)
00229     {
00230     std::streampos s = inf.tellg();
00231     inf.seekg (0, std::ios::end);
00232     std::streampos e = inf.tellg();
00233     if (e <= s)
00234         {
00235         inbuffer = 0;
00236         remaining = 0;
00237         return;
00238         }
00239     inf.seekg(s);
00240     totalSize = static_cast<unsigned>(e - s);
00241     inbuffer = std::min(MAX_BUFFER_SIZE, totalSize);
00242     remaining = totalSize - inbuffer;
00243     buffer = new char [inbuffer];
00244     inf.read(buffer, inbuffer);
00245     const char c = current();
00246     if (c == 13)
00247         {
00248         ++lineNumber;
00249         prevNewlinePos = position();
00250         }
00251     else if (c == 10)
00252         {
00253         if (prev() != 13)
00254             ++lineNumber;
00255         prevNewlinePos = position();
00256         }
00257     }
00258 
00259 bool FileToCharBuffer::refillBuffer(unsigned offset)
00260     {
00261     if (remaining  == 0)
00262         return false;
00263     if (offset == 0)
00264         prevChar = buffer[inbuffer-1];
00265     inbuffer = std::min(inbuffer - offset, remaining);
00266     remaining -= inbuffer;
00267     inf.read(buffer + offset, inbuffer);
00268     pos = offset;
00269     return true;
00270     }
00271 
00272 
00273 MultiFormatReader::DataFormatType MultiFormatReader::formatNameToCode(const std::string &s)
00274     {
00275     std::string l(s);
00276     NxsString::to_lower(l);
00277     int ind = NxsString::index_in_array(l, gFormatNames, gNumFormats);
00278     if (ind < 0)
00279         return UNSUPPORTED_FORMAT;
00280     NCL_ASSERT(ind < UNSUPPORTED_FORMAT);
00281     return MultiFormatReader::DataFormatType(ind);
00282     }
00283 
00284 
00285 
00286 
00287 
00288 
00289 
00290 bool  MultiFormatReader::readFastaSequences(
00291     FileToCharBuffer & ftcb,
00292     const NxsDiscreteDatatypeMapper &dm,
00293     std::list<std::string> & taxaNames,
00294     std::list<NxsDiscreteStateRow> & matList,
00295     size_t & longest)
00296     {
00297     char * contents = ftcb.buffer;
00298     NCL_ASSERT(contents);
00299     NxsString err;
00300     for (;;)
00301         {
00302         if (ftcb.current() == '>' && ( ftcb.prev() == '\n' ||  ftcb.prev() == '\r'))
00303             {
00304             std::string n;
00305             if (!ftcb.advance())
00306                 break;
00307             for (;;)
00308                 {
00309                 char c = ftcb.current();
00310                 if (c == '\n' || c == '\r')
00311                     break;
00312                 n.append(1, c);
00313                 if (!ftcb.advance())
00314                     break;
00315                 }
00316             std::string nameStripped = NxsString::strip_surrounding_whitespace(n);
00317             taxaNames.push_back(nameStripped);
00318 
00319             matList.push_back(NxsDiscreteStateRow());
00320             if (!ftcb.advance())
00321                 break;
00322             NxsDiscreteStateRow & row = *(matList.rbegin());
00323             row.reserve(longest);
00324             for (;;)
00325                 {
00326                 char c = ftcb.current();
00327                 if (c == '>' && (ftcb.prev() == '\n' || ftcb.prev() == '\r'))
00328                     break;
00329                 if (isgraph(c))
00330                     {
00331                     int stateCode = dm.GetStateCodeStored(c);
00332                     if (stateCode == NXS_INVALID_STATE_CODE)
00333                         {
00334                         err << "Illegal state code \"" << c << "\" found when reading character " << row.size() << " for taxon " << n;
00335                         throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00336                         }
00337                     row.push_back(stateCode);
00338                     }
00339                 if (!ftcb.advance())
00340                     break;
00341                 }
00342             longest = std::max(longest, row.size());
00343             }
00344         else if (!ftcb.advance())
00345             break;
00346         }
00347     
00348     std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00349     bool allSameLength = true;
00350     for (; sIt != matList.end(); ++sIt)
00351         {
00352         NxsDiscreteStateRow & row = *sIt;
00353         if (row.size() < longest)
00354             {
00355             allSameLength = false;
00356             break;
00357             }
00358         }
00359     return allSameLength;
00360     }
00361 
00362 std::string  MultiFormatReader::readPhylipName(FileToCharBuffer & ftcb, unsigned i, bool relaxedNames)
00363     {
00364     NxsString err;
00365     std::string n;
00366     if (relaxedNames)
00367         {
00368         do {
00369             n.append(1,ftcb.current());
00370             if (!ftcb.advance())
00371                 {
00372                 err << "End of file found when reading the name of taxon " << i+1 << ", \"" << n << "\"";
00373                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00374                 }
00375             }
00376         while (isgraph(ftcb.current()));
00377         while (!isgraph(ftcb.current()))
00378             {
00379             if (!ftcb.advance())
00380                 {
00381                 err << "End of file found when expecting the beginning of the data for taxon " << i+1 << ", \"" << n << "\"";
00382                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00383                 }
00384             }
00385         }
00386     else
00387         {
00388         std::string ws;
00389         for (unsigned letter = 0; letter < PHYLIP_NMLNGTH; ++letter)
00390             {
00391             char c = ftcb.current();
00392             if (isgraph(c))
00393                 {
00394                 n.append(ws);
00395                 n.append(1,c);
00396                 ws.clear();
00397                 }
00398             else
00399                 ws.append(1, c);
00400             if (!ftcb.advance())
00401                 {
00402                 err << "End of file found when reading the name for taxon " << i+1 << ", \"" << n << "\"";
00403                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00404                 }
00405             }
00406         }
00407     return n;
00408     }
00409 
00410 void  MultiFormatReader::readPhylipData(
00411     FileToCharBuffer & ftcb,
00412     const NxsDiscreteDatatypeMapper &dm,
00413     std::list<std::string> & taxaNames,
00414     std::list<NxsDiscreteStateRow> & matList,
00415     const unsigned n_taxa,
00416     const unsigned n_char,
00417     bool relaxedNames)
00418     {
00419     NCL_ASSERT(n_taxa > 0 && n_char > 0);
00420     NxsString err;
00421     matList.clear();
00422     matList.assign(n_taxa, NxsDiscreteStateRow(n_char, NXS_INVALID_STATE_CODE));
00423     std::list<NxsDiscreteStateRow>::iterator mIt = matList.begin();
00424     while (!isgraph(ftcb.current()))
00425         {
00426         if (!ftcb.advance())
00427             goto funcExit;
00428         }
00429 
00430     for (unsigned i = 0; i < n_taxa; ++i)
00431         {
00432         std::string n = readPhylipName(ftcb, i, relaxedNames);
00433         taxaNames.push_back(n);
00434         NCL_ASSERT(mIt != matList.end());
00435         NxsDiscreteStateRow & row = *mIt++;
00436         for (unsigned j = 0; j < n_char; ++j)
00437             {
00438             bool readChar = false;
00439             for (;;)
00440                 {
00441                 const char c = ftcb.current();
00442                 if (isgraph(c))
00443                     {
00444                     if (isdigit(c))
00445                         {
00446                         err << "Number encountered (and ignored) within sequence for taxon " << n;
00447                         NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00448                         err.clear();
00449                         }
00450                     else
00451                         {
00452                         const int stateCode = dm.GetStateCodeStored(c);
00453                         if (stateCode == NXS_INVALID_STATE_CODE)
00454                             {
00455                             if (c == '.')
00456                                 {
00457                                 if (i == 0)
00458                                     {
00459                                     err << "Illegal match character state code  \".\" found in the first taxon for character " << j + 1 ;
00460                                     throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00461                                     }
00462                                 NxsDiscreteStateRow & firstRow = *(matList.begin());
00463                                 row[j] = firstRow.at(j);
00464                                 }
00465                             else
00466                                 {
00467                                 err << "Illegal state code \"" << c << "\" found when reading site " << j + 1 << " for taxon " << n;
00468                                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00469                                 }
00470                             }
00471                         else
00472                             row[j] = stateCode;
00473                         readChar = true;
00474                         }
00475                     }
00476                 if (!ftcb.advance())
00477                     goto funcExit;
00478                 if (readChar)
00479                     break;
00480                 }
00481             }
00482         char f = ftcb.current();
00483         while (f != '\r' && f != '\n')
00484             {
00485             if (isgraph(f))
00486                 {
00487                 err << "Sequence longer than " << n_char << " found for taxon " << n << ". The character \""<< f << "\" was found, and will be ignored. If the file position of this error corresponds to sequences for the next taxon in the matrix, then that is an indication that the sequences for taxon " << n << " are too short.";
00488                 NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00489                 err.clear();
00490                 }
00491             if (!ftcb.advance())
00492                 goto funcExit;
00493             f = ftcb.current();
00494             }
00495         while (!isgraph(ftcb.current()))
00496             {
00497             if (!ftcb.advance())
00498                 goto funcExit;
00499             }
00500         }
00501     funcExit:
00502         if (matList.size() != n_taxa)
00503             {
00504             err << "Unexpected end of file.\nExpecting data for " << n_taxa << " taxa, but only found data for " << matList.size();
00505             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00506             }
00507         const NxsDiscreteStateRow & lastRow = *matList.rbegin();
00508         if (lastRow.size() != n_char)
00509             {
00510             err << "Unexpected end of file.\nExpecting " << n_char << " characters for taxon " <<  *(taxaNames.rbegin()) << ", but only found " << lastRow.size() << " characters.";
00511             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00512             }
00513     }
00514 
00515 
00516 void  MultiFormatReader::readInterleavedPhylipData(
00517     FileToCharBuffer & ftcb,
00518     const NxsDiscreteDatatypeMapper &dm,
00519     std::list<std::string> & taxaNames,
00520     std::list<NxsDiscreteStateRow> & matList,
00521     const unsigned n_taxa,
00522     const unsigned n_char,
00523     bool relaxedNames)
00524     {
00525     NCL_ASSERT(n_taxa > 0 && n_char > 0);
00526     NxsString err;
00527     matList.clear();
00528     matList.assign(n_taxa, NxsDiscreteStateRow(n_char, NXS_INVALID_STATE_CODE));
00529     std::list<NxsDiscreteStateRow>::iterator mIt = matList.begin();
00530     unsigned startCharIndex = 0;
00531     unsigned endCharIndex = n_char;
00532     while (!isgraph(ftcb.current()))
00533         {
00534         if (!ftcb.advance())
00535             goto funcExit;
00536         }
00537     while (startCharIndex < n_char)
00538         {
00539         for (unsigned i = 0; i < n_taxa; ++i)
00540             {
00541             if (startCharIndex == 0)
00542                 {
00543                 std::string n = readPhylipName(ftcb, i, relaxedNames);
00544                 taxaNames.push_back(n);
00545                 }
00546             if (i == 0)
00547                 mIt = matList.begin();
00548             NCL_ASSERT(mIt != matList.end());
00549             NxsDiscreteStateRow & row = *mIt++;
00550             unsigned j = startCharIndex;
00551             for (;;)
00552                 {
00553                 const char c = ftcb.current();
00554                 if (isgraph(c))
00555                     {
00556                     if (j >= endCharIndex)
00557                         {
00558                         if (i == 0)
00559                             {
00560                             err << "Too many characters were found for the taxon " << *(taxaNames.begin());
00561                             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00562                             }
00563                         else
00564                             {
00565                             std::list<std::string>::const_iterator nIt = taxaNames.begin();
00566                             for (unsigned q = 0; q < i ; ++q)
00567                                 ++nIt;
00568                             err << "Illegal character \"" << c << "\" found, after all of the data for this interleave page has been read for the taxon " << *nIt;
00569                             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00570                             }
00571                         }
00572                     if (isdigit(c))
00573                         {
00574                         std::list<std::string>::const_iterator nIt = taxaNames.begin();
00575                         for (unsigned q = 0; q < i ; ++q)
00576                             ++nIt;
00577                         err << "Number encountered (and ignored) within sequence for taxon " << *nIt;
00578                         NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00579                         err.clear();
00580                         }
00581                     else
00582                         {
00583                         const int stateCode = dm.GetStateCodeStored(c);
00584                         if (stateCode == NXS_INVALID_STATE_CODE)
00585                             {
00586                             if (c == '.')
00587                                 {
00588                                 if (i == 0)
00589                                     {
00590                                     err << "Illegal match character state code  \".\" found in the first taxon for character " << j + 1 ;
00591                                     throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00592                                     }
00593                                 NxsDiscreteStateRow & firstRow = *(matList.begin());
00594                                 row[j] = firstRow.at(j);
00595                                 }
00596                             else
00597                                 {
00598                                 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00599                                 for (unsigned q = 0; q < i ; ++q)
00600                                     ++nIt;
00601                                 err << "Illegal state code \"" << c << "\" found when reading site " << j + 1 << " for taxon " << *nIt;
00602                                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00603                                 }
00604                             }
00605                         else
00606                             row[j] = stateCode;
00607                         j++;
00608                         }
00609                     }
00610                 else if (c == '\r' || c == '\n')
00611                     {
00612                     if (i == 0)
00613                         endCharIndex = j;
00614                     else if (j != endCharIndex)
00615                         {
00616                         std::list<std::string>::const_iterator nIt = taxaNames.begin();
00617                         for (unsigned q = 0; q < i ; ++q)
00618                             ++nIt;
00619                         err << "Expecting " << endCharIndex -  startCharIndex << "characters  in this interleave page (based on the number of characters in the first taxon), but only found " << j - startCharIndex << " for taxon " << *nIt;
00620                         throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00621                         }
00622                     break;
00623                     }
00624                 if (!ftcb.advance())
00625                     goto funcExit;
00626                 }
00627             while (!isgraph(ftcb.current()))
00628                 {
00629                 if (!ftcb.advance())
00630                     goto funcExit;
00631                 }
00632             }
00633         startCharIndex = endCharIndex;
00634         endCharIndex = n_char;
00635         }
00636     funcExit:
00637         if (matList.size() != n_taxa)
00638             {
00639             err << "Unexpected end of file.\nExpecting data for " << n_taxa << " taxa, but only found data for " << matList.size();
00640             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00641             }
00642         const NxsDiscreteStateRow & lastRow = *matList.rbegin();
00643         if (lastRow.size() != n_char)
00644             {
00645             err << "Unexpected end of file.\nExpecting " << n_char << " characters for taxon " <<  *(taxaNames.rbegin()) << ", but only found " << lastRow.size() << " characters.";
00646             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00647             }
00648     }
00649 
00650 bool FileToCharBuffer::skip_to_beginning_of_line(char & next)
00651     {
00652     next = this->current();
00653     for (;;)
00654         {
00655         const char c = next;
00656         if (!this->advance_then_store(next))
00657             return false;
00658         if (c == '\n')
00659             return true;
00660         if (c == '\r')
00661             {
00662             if (next == '\n' && (!this->advance_then_store(next)))
00663                 return false;
00664             return true;
00665             }
00666         }
00667     }
00668 
00669 bool  MultiFormatReader::readAlnData(
00670     FileToCharBuffer & ftcb,
00671     const NxsDiscreteDatatypeMapper &dm,
00672     std::list<std::string> & taxaNames,
00673     std::list<NxsDiscreteStateRow> & matList)
00674     {
00675     taxaNames.clear();
00676     char * contents = ftcb.buffer;
00677     NCL_ASSERT(contents);
00678     NxsString err;
00679     char c;
00680     if (!ftcb.current())
00681         throw NxsException("Could not read from file", ftcb.position(), ftcb.line(), ftcb.column());
00682 
00683     c = ftcb.current();
00684     unsigned index = 0;
00685     const char * firstWord = "CLUSTAL";
00686     std::string found;
00687     const unsigned lenFirstWord = strlen(firstWord);
00688     while (index < lenFirstWord)
00689         {
00690         found.append(1, c);
00691         if (toupper(c) != firstWord[index] || !ftcb.advance())
00692             {
00693             err << "Expecting file to start \"CLUSTAL\" found \"" << found << "\"";
00694             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00695             }
00696         ++index;
00697         c = ftcb.current();
00698         }
00699     do {
00700         if (!ftcb.skip_to_beginning_of_line(c))
00701             throw NxsException("Expecting multi-line file",ftcb.position(), ftcb.line(), ftcb.column());
00702     } while (!isgraph(c));
00703     bool readingFirstBlock = true;
00704     for (;;)
00705         {
00706         
00707         while (!isgraph(c))
00708             {
00709             if (!ftcb.skip_to_beginning_of_line(c))
00710                 {
00711                 if (taxaNames.empty())
00712                     throw NxsException("Sequences after clustal header", ftcb.position(), ftcb.line(), ftcb.column());
00713                 goto funcExit;
00714                 }
00715             }
00716         unsigned curr_tax_ind = 0;
00717         std::list<std::string>::const_iterator taxNameIt;
00718         std::list<NxsDiscreteStateRow>::iterator matRowIt;
00719         if (!readingFirstBlock)
00720             {
00721             taxNameIt = taxaNames.begin();
00722             matRowIt = matList.begin();
00723             }
00724         NxsDiscreteStateRow * row = NULL;
00725         
00726         for (;isgraph(c);)
00727             {
00728             std::string n;
00729             for (;;)
00730                 {
00731                 n.append(1, c);
00732                 if (!ftcb.advance())
00733                     break;
00734                 c = ftcb.current();
00735                 if (!isgraph(c))
00736                     break;
00737                 }
00738             if (readingFirstBlock)
00739                 {
00740                 taxaNames.push_back(n);
00741                 matList.push_back(NxsDiscreteStateRow());
00742                 row = &(*(matList.rbegin()));
00743                 }
00744             else if (curr_tax_ind > taxaNames.size())
00745                 {
00746                 err << "Expecting a line beginning with whitespace (or a blank line), but found \"" << n << "\"";
00747                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00748                 }
00749             else
00750                 {
00751                 std::string prev_name = *taxNameIt++;
00752                 if (!NxsString::case_insensitive_equals(prev_name.c_str(), n.c_str()))
00753                     {
00754                     err << "Expecting data for taxon # " << (1 + curr_tax_ind) << " \"" << prev_name << "\" but got \"" << n << "\"";
00755                     throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00756                     }
00757                 row = &(*matRowIt++);
00758                 }
00759 
00760 
00761             while (ftcb.advance_then_store(c))
00762                 {
00763                 if (isgraph(c))
00764                     break;
00765                 }
00766             if  (!isgraph(c))
00767                 {
00768                 err << "Unexpected end-of-file after taxon name \"" << n << "\"";
00769                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00770                 }
00771             
00772             bool eof = false;
00773             bool eoseq = false;
00774             for (;!eoseq;)
00775                 {
00776                 if (isgraph(c))
00777                     {
00778                     if (isdigit(c))
00779                         {
00780                         if (!ftcb.skip_to_beginning_of_line(c))
00781                             {
00782                             if (!readingFirstBlock && (curr_tax_ind + 1) != taxaNames.size())
00783                                 {
00784                                 err << "Unexpected End of file. Expecting data for " << taxaNames.size() << " sequences";
00785                                 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00786                                 }
00787                             goto funcExit;
00788                             }
00789                         break;
00790                         }
00791                     else
00792                         {
00793                         int stateCode = dm.GetStateCodeStored(c);
00794                         if (stateCode == NXS_INVALID_STATE_CODE)
00795                             {
00796                             err << "Illegal state code \"" << c << "\" found when reading character " << row->size() << " for taxon " << n;
00797                             throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00798                             }
00799                         row->push_back(stateCode);
00800                         eof = !ftcb.advance_then_store(c);
00801                         }
00802                     }
00803                 if ((!eof) && (!isgraph(c)))
00804                     {
00805                     if (c == '\n')
00806                         {
00807                         eof = !ftcb.advance_then_store(c);
00808                         eoseq = true;
00809                         }
00810                     else if (c == '\r')
00811                         {
00812                         eof = !ftcb.advance_then_store(c);
00813                         if (!eof && c == '\n')
00814                             eof = !ftcb.advance_then_store(c);
00815                         eoseq = true;
00816                         }
00817                     else
00818                         eof = !ftcb.advance_then_store(c);
00819                     }
00820                 if (eof)
00821                     {
00822                     if (!readingFirstBlock && (curr_tax_ind + 1) != taxaNames.size())
00823                         {
00824                         err << "Unexpected End of file. Expecting data for " << taxaNames.size() << " sequences";
00825                         throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00826                         }
00827                     goto funcExit;
00828                     }
00829                 }
00830             if (isgraph(c))
00831                 curr_tax_ind++;
00832             else
00833                 {
00834                 if (!readingFirstBlock && (1 + curr_tax_ind) != taxaNames.size())
00835                     {
00836                     err << "Unexpected line beginning with whitespace. Expecting data for " << taxaNames.size() << " sequences";
00837                     throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00838                     }
00839                 curr_tax_ind = 0;
00840                 readingFirstBlock = false;
00841                 }
00842             }
00843         }
00844 
00845     funcExit:
00846         
00847         std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00848         long longest = -1;
00849         for (; sIt != matList.end(); ++sIt)
00850             {
00851             NxsDiscreteStateRow & row = *sIt;
00852             if (longest == -1)
00853                 longest = (long) row.size();
00854             else if (longest != (long) row.size())
00855                 return false;
00856             }
00857         return true;
00858     }
00859 
00860 void MultiFormatReader::addTaxaNames(const std::list<std::string> & taxaNames, NxsTaxaBlockAPI * taxa)
00861     {
00862     NCL_ASSERT(taxa);
00863     std::list<std::string>::const_iterator nIt = taxaNames.begin();
00864 
00865     std::vector<NxsNameToNameTrans> nameTrans;
00866     bool nameTransNeeded = false;
00867     NxsString t;
00868 
00869     for (; nIt != taxaNames.end(); ++nIt)
00870         {
00871         std::string name = *nIt;
00872         NxsNameToNameTrans trans(name, name);
00873         for (unsigned i = 1; ; ++i)
00874             {
00875             try {
00876                 taxa->AddTaxonLabel(name);
00877                 break;
00878                 }
00879             catch (DuplicatedLabelNxsException & x)
00880                 {
00881                 if (!this->conversionOutputRecord.addNumbersToDisambiguateNames)
00882                     throw;
00883                 nameTransNeeded = true;
00884                 t.assign(*nIt);
00885                 t << i;
00886                 trans.second = t;
00887                 name = t;
00888                 }
00889             }
00890         if (this->conversionOutputRecord.addNumbersToDisambiguateNames)
00891             nameTrans.push_back(trans);
00892         }
00893 
00894 
00895     
00896     if (nameTransNeeded)
00897         this->conversionOutputRecord.writeNameTranslation(nameTrans, taxa);
00898     }
00899 
00900 void MultiFormatReader::moveDataToMatrix(std::list<NxsDiscreteStateRow> & matList,  NxsDiscreteStateMatrix &mat)
00901     {
00902     mat.clear();
00903     mat.resize(matList.size());
00904     NxsDiscreteStateMatrix::iterator dIt = mat.begin();
00905     std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00906     for (; sIt != matList.end(); ++sIt, ++dIt)
00907         {
00908         NxsDiscreteStateRow & source = *sIt;
00909         NxsDiscreteStateRow & dest = *dIt;
00910         dest.swap(source);
00911         }
00912     }
00913 
00914 void  MultiFormatReader::moveDataToDataBlock(const std::list<std::string> & taxaNames, std::list<NxsDiscreteStateRow> & matList, const unsigned nchar, NxsDataBlock * dataB)
00915     {
00916     NCL_ASSERT(dataB);
00917     NxsString d;
00918     d << "Dimensions ntax = " << matList.size() << " nchar = " << nchar << " ; ";
00919     std::istringstream fakeDimStream(d);
00920     NxsToken fakeDimToken(fakeDimStream);
00921     NxsString newTaxLabel("NewTaxa");
00922     NxsString ntaxLabel("NTax");
00923     NxsString ncharLabel("NChar");
00924     dataB->HandleDimensions(fakeDimToken, newTaxLabel, ntaxLabel, ncharLabel);
00925 
00926     NCL_ASSERT(dataB->taxa);
00927     addTaxaNames(taxaNames, dataB->taxa);
00928 
00929     moveDataToMatrix(matList, dataB->discreteMatrix);
00930     }
00931 
00932 void  MultiFormatReader::moveDataToUnalignedBlock(const std::list<std::string> & taxaNames, std::list<NxsDiscreteStateRow> & matList, NxsUnalignedBlock * uB)
00933     {
00934     NCL_ASSERT(uB);
00935     NxsString d;
00936     d << "Dimensions NewTaxa ntax = " << matList.size() << " ; ";
00937     std::istringstream fakeDimStream(d);
00938     NxsToken fakeDimToken(fakeDimStream);
00939     uB->HandleDimensions(fakeDimToken);
00940 
00941     NCL_ASSERT(uB->taxa);
00942     addTaxaNames(taxaNames, uB->taxa);
00943 
00944     moveDataToMatrix(matList, uB->uMatrix);
00945     }
00946 
00947 void  MultiFormatReader::readFastaFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt)
00948     {
00949     NxsString blockID("DATA");
00950     NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
00951     NCL_ASSERT(nb);
00952     if (!nb)
00953         return;
00954     nb->SetNexus(this);
00955 
00956     NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb); 
00957     FileToCharBuffer ftcb(inf);
00958     if (ftcb.buffer)
00959         {
00960         dataB->Reset();
00961         dataB->datatype = dt;
00962         dataB->ResetSymbols();
00963         dataB->gap = '-';
00964         NxsPartition dtParts;
00965         std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
00966         dataB->CreateDatatypeMapperObjects(dtParts, dtv);
00967 
00968         const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
00969 
00970         std::list<std::string> taxaNames;
00971         std::list<NxsDiscreteStateRow> matList;
00972         size_t longest = 0;
00973         bool aligned = true;
00974         try {
00975             aligned = readFastaSequences(ftcb, *dm, taxaNames, matList, longest);
00976             }
00977         catch (...)
00978             {
00979             cloneFactory.BlockError(dataB);
00980             throw;
00981             }
00982 
00983         if (aligned)
00984             {
00985             moveDataToDataBlock(taxaNames, matList, longest, dataB);
00986             BlockReadHook(blockID, dataB);
00987             }
00988         else
00989             {
00990             cloneFactory.BlockError(dataB);
00991             blockID.assign("UNALIGNED");
00992             NxsBlock * nub = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
00993             if (!nub)
00994                 {
00995                 NCL_ASSERT(nub);
00996                 return;
00997                 }
00998             nub->SetNexus(this);
00999 
01000             NxsUnalignedBlock * unalignedB = static_cast<NxsUnalignedBlock *>(nub); 
01001             unalignedB->Reset();
01002             unalignedB->datatype = dt;
01003             unalignedB->ResetSymbols();
01004             unalignedB->ResetDatatypeMapper();
01005             moveDataToUnalignedBlock(taxaNames, matList, unalignedB);
01006             BlockReadHook(blockID, unalignedB);
01007             }
01008         }
01009     else
01010         {
01011         cloneFactory.BlockError(dataB);
01012         NxsString err;
01013         err << "No Data read -- file appears to be empty";
01014         this->NexusError(err, 0, -1, -1);
01015         }
01016     }
01017 
01018 void  MultiFormatReader::ReadFilepath(const char * filepath, DataFormatType format)
01019     {
01020     if (format == NEXUS_FORMAT)
01021         {
01022         NxsReader::ReadFilepath(filepath);
01023         }
01024     else
01025         {
01026         std::ifstream inf;
01027         try{
01028             inf.open(filepath, std::ios::binary);
01029             if (!inf.good())
01030                 {
01031                 NxsString err;
01032                 err << "Could not open the file \"" << filepath <<"\"";
01033                 this->NexusError(err, 0, -1, -1);
01034                 }
01035             }
01036         catch (...)
01037             {
01038             NxsString err;
01039             err << '\"' << filepath <<"\" does not refer to a valid file." ;
01040             this->NexusError(err, 0, -1, -1);
01041             }
01042         this->ReadStream(inf, format, filepath);
01043         }
01044     }
01045 
01046 void  MultiFormatReader::ReadStream(std::istream & inf, DataFormatType format, const char * filepath)
01047     {
01048     if (format == NEXUS_FORMAT)
01049         {
01050         NxsReader::ReadFilestream(inf);
01051         }
01052     else
01053         {
01054         if (format == FASTA_DNA_FORMAT)
01055             readFastaFile(inf, NxsCharactersBlock::dna);
01056         else if (format == FASTA_RNA_FORMAT)
01057             readFastaFile(inf, NxsCharactersBlock::rna);
01058         else if (format == FASTA_AA_FORMAT)
01059             readFastaFile(inf, NxsCharactersBlock::protein);
01060         else if (format == PHYLIP_DNA_FORMAT)
01061             readPhylipFile(inf, NxsCharactersBlock::dna, false, false);
01062         else if (format == PHYLIP_RNA_FORMAT)
01063             readPhylipFile(inf, NxsCharactersBlock::rna, false, false);
01064         else if (format == PHYLIP_AA_FORMAT)
01065             readPhylipFile(inf, NxsCharactersBlock::protein, false, false);
01066         else if (format == PHYLIP_DISC_FORMAT)
01067             readPhylipFile(inf, NxsCharactersBlock::standard, false, false);
01068         else if (format == INTERLEAVED_PHYLIP_DNA_FORMAT)
01069             readPhylipFile(inf, NxsCharactersBlock::dna, false, true);
01070         else if (format == INTERLEAVED_PHYLIP_RNA_FORMAT)
01071             readPhylipFile(inf, NxsCharactersBlock::rna, false, true);
01072         else if (format == INTERLEAVED_PHYLIP_AA_FORMAT)
01073             readPhylipFile(inf, NxsCharactersBlock::protein, false, true);
01074         else if (format == INTERLEAVED_PHYLIP_DISC_FORMAT)
01075             readPhylipFile(inf, NxsCharactersBlock::standard, false, true);
01076         else if (format == RELAXED_PHYLIP_DNA_FORMAT)
01077             readPhylipFile(inf, NxsCharactersBlock::dna, true, false);
01078         else if (format == RELAXED_PHYLIP_RNA_FORMAT)
01079             readPhylipFile(inf, NxsCharactersBlock::rna, true, false);
01080         else if (format == RELAXED_PHYLIP_AA_FORMAT)
01081             readPhylipFile(inf, NxsCharactersBlock::protein, true, false);
01082         else if (format == RELAXED_PHYLIP_DISC_FORMAT)
01083             readPhylipFile(inf, NxsCharactersBlock::standard, true, false);
01084         else if (format == INTERLEAVED_RELAXED_PHYLIP_DNA_FORMAT)
01085             readPhylipFile(inf, NxsCharactersBlock::dna, true, true);
01086         else if (format == INTERLEAVED_RELAXED_PHYLIP_RNA_FORMAT)
01087             readPhylipFile(inf, NxsCharactersBlock::rna, true, true);
01088         else if (format == INTERLEAVED_RELAXED_PHYLIP_AA_FORMAT)
01089             readPhylipFile(inf, NxsCharactersBlock::protein, true, true);
01090         else if (format == INTERLEAVED_RELAXED_PHYLIP_DISC_FORMAT)
01091             readPhylipFile(inf, NxsCharactersBlock::standard, true, true);
01092         else if (format == ALN_DNA_FORMAT)
01093             readAlnFile(inf, NxsCharactersBlock::dna);
01094         else if (format == ALN_RNA_FORMAT)
01095             readAlnFile(inf, NxsCharactersBlock::rna);
01096         else if (format == ALN_AA_FORMAT)
01097             readAlnFile(inf, NxsCharactersBlock::protein);
01098         else if (format == RELAXED_PHYLIP_TREE_FORMAT)
01099             readPhylipTreeFile(inf, true);
01100         else if (format == PHYLIP_TREE_FORMAT)
01101             readPhylipTreeFile(inf, false);
01102         else
01103             {
01104             NxsString m;
01105             if (filepath)
01106                 m << "The file " << filepath << " is not in a supported format.";
01107             else
01108                 m << "Unsupported format.";
01109             NexusError(m, 0, -1, -1);
01110             return;
01111             }
01112         PostExecuteHook();
01113         }
01114     }
01115 
01116 
01117 
01118 
01119 unsigned MultiFormatReader::readPhylipHeader(std::istream & inf, unsigned & ntax, unsigned & nchar)
01120     {
01121     if (inf.good())
01122         inf >> ntax;
01123     if (inf.good())
01124         inf >> nchar;
01125     if (!inf.good() || ntax == 0 || nchar == 0)
01126         {
01127         NxsString err("Expecting the file to start with the number of taxa then the number of characters.");
01128         throw NxsException(err, 0, -1, -1);
01129         }
01130     return (unsigned) inf.tellg();
01131     }
01132 
01133 void MultiFormatReader::readPhylipTreeFile(std::istream & inf, bool relaxedNames)
01134     {
01135     NxsString blockID("TREES");
01136     NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01137     NCL_ASSERT(nb);
01138     if (!nb)
01139         return;
01140     nb->SetNexus(this);
01141 
01142     
01143 
01144 
01145 
01146     NxsTreesBlock * treesB = static_cast<NxsTreesBlock *>(nb);
01147     NxsString err;
01148     try {
01149         treesB->Reset();
01150         NxsToken inTokens(inf);
01151         treesB->ReadPhylipTreeFile(inTokens);
01152         if (!relaxedNames)
01153             {
01154             const NxsTaxaBlockAPI * taxa = treesB->GetTaxaBlockPtr(0L);
01155             if (!taxa)
01156                 {
01157                 err << "No taxa found in tree description (which probably means that no tree was found).";
01158                 throw NxsException(err, inTokens);
01159                 }
01160             const std::vector<std::string> l = taxa->GetAllLabels();
01161             for (std::vector<std::string>::const_iterator lIt = l.begin(); lIt != l.end(); ++lIt)
01162                 {
01163                 if (lIt->length() > PHYLIP_NMLNGTH)
01164                     {
01165                     err << "The taxon label " << *lIt << " has more than the allowed number of charcters (" << PHYLIP_NMLNGTH << ')';
01166                     throw NxsException(err);
01167                     }
01168                 }
01169             }
01170         BlockReadHook(blockID, treesB);
01171         }
01172     catch (...)
01173         {
01174         cloneFactory.BlockError(nb);
01175         throw;
01176         }
01177     }
01178 
01179 
01180 
01181 
01182 void MultiFormatReader::readAlnFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt)
01183     {
01184     NxsString blockID("DATA");
01185     NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01186     NCL_ASSERT(nb);
01187     if (!nb)
01188         return;
01189     nb->SetNexus(this);
01190     
01191 
01192 
01193 
01194     NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb);
01195 
01196     try {
01197         dataB->Reset();
01198         dataB->datatype = dt;
01199         dataB->ResetSymbols();
01200         dataB->gap = '-';
01201         NxsPartition dtParts;
01202         std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
01203         dataB->CreateDatatypeMapperObjects(dtParts, dtv);
01204 
01205         const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
01206         NCL_ASSERT(dm);
01207         FileToCharBuffer ftcb(inf);
01208         if (ftcb.buffer)
01209             {
01210             std::list<std::string> taxaNames;
01211             std::list<NxsDiscreteStateRow> matList;
01212             if (!readAlnData(ftcb, *dm, taxaNames, matList))
01213                 throw NxsException("Expecting the same number of characters for all sequences in the ALN file");
01214             const unsigned nchar = matList.begin()->size();
01215             moveDataToDataBlock(taxaNames, matList, nchar, dataB);
01216             BlockReadHook(blockID, dataB);
01217             }
01218         }
01219     catch (...)
01220         {
01221         cloneFactory.BlockError(nb);
01222         throw;
01223         }
01224 }
01225 
01226 
01227 
01228 
01229 void MultiFormatReader::readPhylipFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt, bool relaxedNames, bool interleaved)
01230     {
01231     NxsString blockID("DATA");
01232     NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01233     NCL_ASSERT(nb);
01234     if (!nb)
01235         return;
01236     nb->SetNexus(this);
01237     
01238 
01239 
01240 
01241     NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb);
01242 
01243     try {
01244         dataB->Reset();
01245         dataB->datatype = dt;
01246         dataB->ResetSymbols();
01247         dataB->gap = '-';
01248         NxsPartition dtParts;
01249         std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
01250         dataB->CreateDatatypeMapperObjects(dtParts, dtv);
01251 
01252         const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
01253         NCL_ASSERT(dm);
01254         unsigned ntax, nchar;
01255         unsigned headerLen = readPhylipHeader(inf, ntax, nchar);
01256         FileToCharBuffer ftcb(inf);
01257         ftcb.totalSize += headerLen;
01258         if (ftcb.buffer)
01259             {
01260             std::list<std::string> taxaNames;
01261             std::list<NxsDiscreteStateRow> matList;
01262             if (interleaved)
01263                 readInterleavedPhylipData(ftcb, *dm, taxaNames, matList, ntax, nchar, relaxedNames);
01264             else
01265                 readPhylipData(ftcb, *dm, taxaNames, matList, ntax, nchar, relaxedNames);
01266             moveDataToDataBlock(taxaNames, matList, nchar, dataB);
01267             BlockReadHook(blockID, dataB);
01268             }
01269         }
01270     catch (...)
01271         {
01272         cloneFactory.BlockError(nb);
01273         throw;
01274         }
01275 }