/** * \file PDBParser.h \brief A class for reading PDB files * * Copyright 2007-2010 IMP Inventors. All rights reserved. * */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IMP_USE_BOOST_FILESYSTEM #include #include #endif IMPATOM_BEGIN_NAMESPACE namespace { std::string nicename(std::string name) { #if defined(IMP_USE_BOOST_FILESYSTEM) && BOOST_VERSION > 103600 boost::filesystem::path path(name); return path.filename(); #else return name; #endif } } PDBSelector::~PDBSelector(){} struct IndexCompare { bool operator()(Particle* a, Particle* b) const { return Residue(a).get_index() < Residue(b).get_index(); } }; struct TypeCompare { bool operator()(Particle* a, Particle* b) const { return Atom(a).get_atom_type() < Atom(b).get_atom_type(); } }; namespace { void sort_residues(Chain c) { HierarchiesTemp dchildren =c.get_children(); ParticlesTemp children(dchildren.begin(), dchildren.end()); std::sort(children.begin(), children.end(), IndexCompare()); c.clear_children(); for (unsigned int i=0; i< children.size(); ++i) { c.add_child(Hierarchy(children[i])); } } void sort_atoms(Residue c) { HierarchiesTemp dchildren =c.get_children(); ParticlesTemp children(dchildren.begin(), dchildren.end()); std::sort(children.begin(), children.end(), TypeCompare()); c.clear_children(); for (unsigned int i=0; i< children.size(); ++i) { c.add_child(Hierarchy(children[i])); } } void canonicalize(Hierarchy h) { for (unsigned int i=0; i < h.get_number_of_children(); ++i) { canonicalize(h.get_child(i)); } if (h.get_as_chain()) { sort_residues(h.get_as_chain()); } /*if (h.get_as_residue()) { sort_atoms(h.get_as_residue()); }*/ } // try putting the numbers after std::string try_rename(std::string str) { std::string out; for (unsigned int i=0; i< str.size(); ++i) { if (std::isalpha(str[i], std::locale())) out.push_back(str[i]); } for (unsigned int i=0; i< str.size(); ++i) { if (!std::isalpha(str[i], std::locale())) out.push_back(str[i]); } return out; } Element get_element_from_pdb_line(const std::string& pdb_line) { // 1. determine element from element column std::string elem = internal::atom_element(pdb_line); boost::trim(elem); Element e = get_element_table().get_element(elem); if (e != UNKNOWN_ELEMENT) return e; // 2. determine element from the atom name std::string atom_name = internal::atom_type(pdb_line); IMP_USAGE_CHECK(atom_name.length() == 4, "Invalid atom name."); if (internal::is_ATOM_rec(pdb_line)) { char c0=atom_name[0]; // if we have space/digit followed by character, try character if ((isdigit(atom_name[0]) || isspace(atom_name[0])) && isalpha(atom_name[1])) c0=atom_name[1]; // H, C, N, O, S, P - most likely to occur // we don't want to get Ne element for NE AtomType switch (c0) { case 'H': return H; case 'C': return C; case 'N': return N; case 'O': return O; case 'S': return S; case 'P': return P; default: break; } } if (internal::is_HETATM_rec(pdb_line)) { // if we have a character in position 0 try the first two characters // in order to handle cases below correctly // HETATM 2209 MG MG D 392 12.055 -14.213 -59.777 1.00 10.93 // HETATM 1 CL CL E 393 -24.319 -2.178 -56.585 0.40 2.37 // HETATM 2 O HOH E 1 -25.636 -4.048 -37.445 1.00 4.32 std::string elem_string; if (isalpha(atom_name[0])) elem_string.push_back(atom_name[0]); if (isalpha(atom_name[1])) elem_string.push_back(atom_name[1]); e = get_element_table().get_element(elem_string); if (e != UNKNOWN_ELEMENT) return e; } IMP_LOG(VERBOSE,"Unable to parse element from line: " << pdb_line << "\n"); return UNKNOWN_ELEMENT; } Particle* atom_particle(Model *m, const std::string& pdb_line) { AtomType atom_name; std::string string_name = internal::atom_type(pdb_line); // determine element Element e = get_element_from_pdb_line(pdb_line); // determine AtomType if (internal::is_HETATM_rec(pdb_line)){ string_name= "HET:"+string_name; if (!get_atom_type_exists(string_name)) { atom_name=add_atom_type(string_name, e); } else { atom_name=AtomType(string_name); } } else { // ATOM line boost::trim(string_name); if (!AtomType::get_key_exists(string_name)) { string_name= try_rename(string_name); if (!AtomType::get_key_exists(string_name)) { IMP_LOG(VERBOSE, "ATOM record type not found: \"" << string_name << "\" from " << pdb_line << std::endl); // return NULL; // never skip ATOMs!!! atom_name=add_atom_type(string_name, e); } else { atom_name = AtomType(string_name); } } else { atom_name = AtomType(string_name); } } // new particle Particle* p = new Particle(m); algebra::Vector3D v(internal::atom_xcoord(pdb_line), internal::atom_ycoord(pdb_line), internal::atom_zcoord(pdb_line)); // atom decorator Atom d = Atom::setup_particle(p, atom_name); p->set_name(std::string("Atom "+ atom_name.get_string())); core::XYZ::setup_particle(p, v).set_coordinates_are_optimized(true); d.set_input_index(internal::atom_number(pdb_line)); d.set_element(e); // check if the element matches Element e2 = get_element_for_atom_type(atom_name); if (e != e2) { IMP_WARN("AtomType element and PDB line elements don't match. AtomType " << e2 << " determined from PDB line " << e << " line " << pdb_line << std::endl); } return p; } Particle* residue_particle(Model *m, const std::string& pdb_line) { Particle* p = new Particle(m); int residue_index = internal::atom_residue_number(pdb_line); char residue_icode = internal::atom_residue_icode(pdb_line); std::string rn = internal::atom_residue_name(pdb_line); boost::trim(rn); ResidueType residue_name = ResidueType(rn); // residue decorator Residue rd = Residue::setup_particle(p, residue_name, residue_index, (int)residue_icode); p->set_name(residue_name.get_string()); return p; } Particle* chain_particle(Model *m, char chain_id) { Particle* p = new Particle(m); Chain::setup_particle(p, chain_id); p->set_name(std::string("Chain "+std::string(1, chain_id))); Molecule::setup_particle(p); return p; } } namespace { Hierarchies read_pdb(std::istream &in, std::string name, Model *model, const PDBSelector& selector, bool select_first_model, bool split_models) { // hierarchy decorator Hierarchies ret; std::string root_name; Particle* root_p = NULL; Particle* cp = NULL; Particle* rp = NULL; char curr_residue_icode = '-'; char curr_chain = '-'; bool chain_name_set = false; bool first_model_read = false; bool has_atom=false; std::string line; while (!in.eof()) { getline(in, line); if (in.eof()) break; if (in.bad() || in.fail()) { IMP_THROW("Error reading from PDB file", IOException); } // handle MODEL reading if (internal::is_MODEL_rec(line)) { if(first_model_read && select_first_model) break; if (split_models) { std::ostringstream oss; oss << "Model " << internal::model_index(line); root_name= oss.str(); root_p=NULL; } } // check that line is an HETATM or ATOM rec and that selector accepts line. // if this is the case construct a new Particle using line and add the // Particle to the Model if (internal::is_ATOM_rec(line) || internal::is_HETATM_rec(line)) { if (!selector(line)) { IMP_LOG(VERBOSE, "Selector rejected line " << line << std::endl); continue; } int residue_index = internal::atom_residue_number(line); char residue_icode = internal::atom_residue_icode(line); char chain = internal::atom_chain_id(line); // create atom particle Particle* ap = atom_particle(model, line); // make sure that all children have coordinates, // (no residues without valid atoms) if (ap) { // check if new chain if (root_p== NULL) { root_p = new Particle(model); ret.push_back(Hierarchy::setup_particle(root_p)); if (!root_name.empty() || !name.empty()) { root_p->set_name(name+root_name); } } if (cp == NULL || chain != curr_chain) { curr_chain = chain; // create new chain particle cp = chain_particle(model, chain); chain_name_set = false; Hierarchy(root_p).add_child(Chain(cp)); rp=NULL; // make sure we get a new residue } // check if new residue if (rp == NULL || residue_index != Residue::decorate_particle(rp).get_index() || residue_icode != curr_residue_icode) { curr_residue_icode = residue_icode; // create new residue particle rp = residue_particle(model, line); Chain(cp).add_child(Residue(rp)); } // set chain name (protein/nucleotide/other) according to residue name if (!chain_name_set) { Chain cd(cp); chain_name_set = true; } Residue(rp).add_child(Atom(ap)); has_atom=true; } } } if (!has_atom) { IMP_WARN("Sorry, unable to read atoms from PDB file." " Thanks for the effort."); return Hierarchies(); } for (unsigned int i=0; i< ret.size(); ++i) { canonicalize(ret[i]); } IMP_IF_CHECK(USAGE_AND_INTERNAL) { for (unsigned int i=0; i< ret.size(); ++i) { if (!ret[i].get_is_valid(true)) { IMP_ERROR("Invalid hierarchy produced "); IMP_ERROR_WRITE(IMP::core::show(ret[i], IMP_STREAM)); throw InternalException("Bad hierarchy"); // should clean up } } } return ret; } } Hierarchy read_pdb(TextInput in, Model *model) { Hierarchies ret= read_pdb(in,nicename(in.get_name()), model, NonWaterPDBSelector(), true, false); if (ret.empty()) { IMP_THROW("No molecule read from file " << in.get_name(), ValueException); } return ret[0]; } Hierarchy read_pdb(TextInput in, Model *model, const PDBSelector& selector, bool select_first_model) { Hierarchies ret= read_pdb(in, nicename(in.get_name()), model, selector, select_first_model, false); if (ret.empty()) { IMP_THROW("No molecule read from file " << in.get_name(), ValueException); } return ret[0]; } Hierarchies read_multimodel_pdb(TextInput in, Model *model, const PDBSelector& selector) { return read_pdb(in, nicename(in.get_name()), model, selector, false, true); } // mol2.cpp bool check_arbond(Particle* atom_p); void write_pdb(const Particles& ps, TextOutput out) { int last_index=0; bool use_input_index=true; for (unsigned int i=0; i< ps.size(); ++i) { if (Atom(ps[i]).get_input_index() != last_index+1) { use_input_index=false; break; } else { ++last_index; } } for (unsigned int i=0; i< ps.size(); ++i) { if (Atom::particle_is_instance(ps[i])) { Atom ad(ps[i]); Residue rd= get_residue(ad); // really dumb and slow, fix later char chain; Chain c=get_chain(rd, true); if (c) { chain= c.get_id(); } else { chain=' '; /* pdb_file << std::setw(7) << atomid << " "; pdb_file.setf(std::ios::left,std::ios::adjustfield); pdb_file << std::setw(4) << atom_type << " "; pdb_file << "HET 1" << " "; pdb_file.setf(std::ios::right,std::ios::adjustfield); pdb_file.setf(std::ios::fixed,std::ios::floatfield); core::XYZ xyz= core::XYZ::cast(p); pdb_file << std::setw(8) << std::setprecision(3) << xyz.get_x(); pdb_file << std::setw(8) << std::setprecision(3) << xyz.get_y(); pdb_file << std::setw(8) << std::setprecision(3) << xyz.get_z() << std::endl;*/ } out.get_stream() << pdb_string(core::XYZ(ps[i]).get_coordinates(), use_input_index? ad.get_input_index(): i, ad.get_atom_type(), rd.get_residue_type(), chain, rd.get_index(), rd.get_insertion_code(), ad.get_occupancy(), ad.get_tempFactor(), ad.get_element()); if (!out) { IMP_THROW("Error writing to file in write_pdb", IOException); } } } } void write_pdb(Hierarchy mhd, TextOutput out) { Particles ps= get_leaves(mhd); write_pdb(ps, out); } void write_pdb(const Hierarchies& mhd, TextOutput out) { for (unsigned int i=0; i< mhd.size(); ++i) { write_pdb(mhd[i], out); } } void write_multimodel_pdb(const Hierarchies& mhd, TextOutput oout) { std::ostream &out=oout; for (unsigned int i=0; i< mhd.size(); ++i) { out << "MODEL "; out.setf(std::ios::right, std::ios::adjustfield); out.width(4); out << (i+1) << std::endl; write_pdb(mhd[i], out); out << "ENDMDL" << std::endl; } } // change atom type to string for Hao's hetatom code std::string pdb_string(const algebra::VectorD<3>& v, int index, AtomType at, ResidueType rt, char chain, int res_index, char res_icode, double occupancy, double tempFactor,Element e) { std::stringstream out; std::string atom_name = at.get_string(); if (atom_name.find("HET:")==0) { out << "HETATM"; } else { out << "ATOM "; } //7-11 : atom id out.setf(std::ios::right, std::ios::adjustfield); out.width(5); out << index; // 12: skip an undefined position out.width(1); out << " "; // 13-16: atom name if (atom_name.find("HET:")==0){ out << std::string(atom_name, 4); } else { if (atom_name.size() == 4) { out << atom_name; } else if (atom_name.size() ==3) { out << " " << atom_name; } else if (atom_name.size() ==2) { out << " " << atom_name << " "; } else { out << " " << atom_name << " "; } } // 17: skip the alternate indication position out.width(1); out << " "; // 18-20 : residue name out << std::right << std::setw(3) << rt.get_string(); //skip 21 out.width(1); out << " "; // 22: chain identifier out << chain; //23-26: residue number out.setf(std::ios::right, std::ios::adjustfield); out.width(4); out << res_index; //27: residue insertion code out.width(1); out << res_icode; out.setf(std::ios::fixed, std::ios::floatfield); out << " "; // skip 3 undefined positions (28-30) // coordinates (31-38,39-46,47-54) out.width(8); out.precision(3); out << v[0]; out.width(8); out.precision(3); out << v[1]; out.width(8); out.precision(3); out << v[2]; //55:60 occupancy out.width(6); out.precision(2); out << occupancy; //61-66: temp. factor out.width(6); out.precision(2); out << tempFactor; // 73 - 76 LString(4) Segment identifier, left-justified. out.width(10); out << ""; //TODO // 77 - 78 LString(2) Element symbol, right-justified. out.width(2); out.setf(std::ios::right, std::ios::adjustfield); out << get_element_table().get_name(e); // 79 - 80 LString(2) Charge on the atom. out.width(2); out << "" << std::endl; //TODO return out.str(); } std::string conect_record_string(int a1_ind,int a2_ind){ // const IMP::atom::Atom &a1, const IMP::atom::Atom &a2){ // IMP::atom::Atom *a3,IMP::atom::Atom *a4,IMP::atom::Atom *a5) { std::stringstream out; out.setf(std::ios::left, std::ios::adjustfield); // 1-6 Record name "CONECT" out.width(6); out<<"CONECT"; //7 - 11 Atom serial number out.width(5); out<get_input_index(); // } // //22 - 26 Serial number of bonded atom // if(a4 != NULL) { // out<get_input_index(); // } // //27 - 31 Serial number of bonded atom // if(a5 != NULL) { // out<get_input_index(); // } return out.str(); } IMPATOM_END_NAMESPACE