class Bio::SPTR

Parser class for UniProtKB/SwissProt and TrEMBL database entry.

Public Instance Methods

aalen()
Alias for: sequence_length
aaseq()
Alias for: seq
cc(topic = nil) click to toggle source

returns contents in the CC lines.

returns an object of contents in the TOPIC.

returns contents of the “ALTERNATIVE PRODUCTS”.

  • #cc('ALTERNATIVE PRODUCTS') -> Hash

    {'Event' => str, 
     'Named isoforms' => int,  
     'Comment' => str,
     'Variants'=>[{'Name' => str, 'Synonyms' => str, 'IsoId' => str, 'Sequence' => []}]}
    
    CC   -!- ALTERNATIVE PRODUCTS:
    CC       Event=Alternative splicing; Named isoforms=15;
    ...
    CC         placentae isoforms. All tissues differentially splice exon 13;
    CC       Name=A; Synonyms=no del;
    CC         IsoId=P15529-1; Sequence=Displayed;

returns contents of the “DATABASE”.

  • #cc('DATABASE') -> Array

    [{'NAME'=>str,'NOTE'=>str, 'WWW'=>URI,'FTP'=>URI}, ...]
    
    CC   -!- DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].

returns contents of the “MASS SPECTROMETRY”.

  • #cc('MASS SPECTROMETRY') -> Array

    [{'MW'=>float,'MW_ERR'=>float, 'METHOD'=>str,'RANGE'=>str}, ...]
    
    CC   -!- MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].

CC lines (>=0, optional)

CC   -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT
CC       IN LIVER, KIDNEY, LUNG AND BRAIN.

CC   -!- TOPIC: FIRST LINE OF A COMMENT BLOCK;
CC       SECOND AND SUBSEQUENT LINES OF A COMMENT BLOCK.

See also www.expasy.org/sprot/userman.html#CC_line

# File lib/bio/db/embl/sptr.rb, line 775
# Returns the parsed contents of the CC (comment) lines.
#
# Without an argument the whole table is returned: a Hash mapping each topic
# name to an Array of the comment bodies found for it. With a +topic+ the
# entry for that topic is returned, post-processed according to the topic
# (joined text, parsed Hash/Array, or the raw Array of bodies). nil is
# returned when the parsed entry has no such topic.
#
# The parsed table is cached in @data['CC']. An entry without CC lines (or
# with nothing but the copyright statement) yields an empty Hash whatever
# +topic+ is, and nothing is cached in that case.
#
# Raises RuntimeError when a comment block does not look like "TOPIC: text".
def cc(topic = nil)
  unless @data['CC']
    cc = {}
    comment_border = '-' * (77 - 4 + 1)
    dlm = /-!- /

    # Some entries (e.g. 12KD_MYCSM) have no CC lines.
    return cc if get('CC').size == 0

    # Remove the copyright statement (text enclosed between dashed runs).
    cc_raw = fetch('CC').sub(/ *---.+---/m, '')

    # No CC lines other than the copyright statement.
    return cc if cc_raw == ''

    begin
      cc_raw = cc_raw.split(/#{comment_border}/)[0]
      cc_raw = cc_raw.sub(dlm, '')
      cc_raw.split(dlm).each do |tmp|
        tmp = tmp.strip

        if /(^[A-Z ]+[A-Z]): (.+)/ =~ tmp
          key  = $1
          body = $2
          # Collapse "- " into "-" unless the next word is AND.
          body.gsub!(/- (?!AND)/, '-')
          body.strip!
          (cc[key] ||= []).push(body)
        else
          raise ["Error: [#{entry_id}]: CC Lines", '"', tmp, '"',
                 '', get('CC'), ''].join("\n")
        end
      end
    rescue NameError
      # NoMethodError is a subclass of NameError, so this single clause also
      # handles calls made on an unexpected nil while parsing.
      if fetch('CC') == ''
        return {}
      else
        raise ["Error: Invalid CC Lines: [#{entry_id}]: ",
               "\n'#{self.get('CC')}'\n", "(#{$!})"].join
      end
    end

    @data['CC'] = cc
  end

  table = @data['CC']
  return table if topic.nil?

  entries = table[topic]
  # A topic that is absent from this entry has nothing to post-process.
  return nil if entries.nil?

  case topic
  when 'ALTERNATIVE PRODUCTS'
    cc_alternative_products(entries)
  when 'BIOPHYSICOCHEMICAL PROPERTIES'
    cc_biophysiochemical_properties(entries)
  when 'CATALITIC ACTIVITY'
    # NOTE(review): the key is spelled 'CATALITIC', so 'CATALYTIC ACTIVITY'
    # falls through to the generic branch below; cc_catalytic_activity is not
    # among the helpers visible here -- confirm before correcting the spelling.
    cc_catalytic_activity(entries)
  when 'CAUTION'
    cc_caution(entries)
  when 'DEVELOPMENTAL STAGE', 'DISEASE', 'ENZYME REGULATION',
       'FUNCTION', 'INDUCTION'
    entries.join('')
  when 'INTERACTION'
    cc_interaction(entries)
  when 'MASS SPECTROMETRY'
    cc_mass_spectrometry(entries)
  when 'PATHWAY'
    cc_pathway(entries)
  when 'RNA EDITING'
    cc_rna_editing(entries)
  when 'SUBCELLULAR LOCATION'
    cc_subcellular_location(entries)
  when 'WEB RESOURCE'
    cc_web_resource(entries)
  when 'DATABASE'
    # DATABASE: NAME=Text[; NOTE=Text][; WWW="Address"][; FTP="Address"].
    entries.map do |e|
      record = {'NAME' => nil, 'NOTE' => nil, 'WWW' => nil, 'FTP' => nil}
      # The last character (the terminating period) is dropped first.
      e.sub(/.$/, '').split(/;/).each do |field|
        case field
        when /NAME=(.+)/
          record['NAME'] = $1
        when /NOTE=(.+)/
          record['NOTE'] = $1
        when /WWW="(.+)"/
          record['WWW'] = $1
        when /FTP="(.+)"/
          record['FTP'] = $1
        end
      end
      record
    end
  else
    # Every other topic is returned as the raw Array of comment bodies.
    entries
  end
end
dr(key = nil) click to toggle source

#dr

# File lib/bio/db/embl/sptr.rb, line 1131
# Returns database cross-references from the DR lines.
#
# Without +key+ the result of embl_dr is returned unchanged. With +key+ the
# rows stored under that key (an empty list when there are none) are turned
# into Hashes built from the first four fields of each row.
def dr(key = nil)
  return embl_dr unless key

  rows = embl_dr[key] || []
  rows.map do |fields|
    {
      'Accession'      => fields[0],
      'Version'        => fields[1],
      ' '              => fields[2],
      'Molecular Type' => fields[3]
    }
  end
end
Also aliased as: embl_dr
dt(key = nil) click to toggle source

returns a Hash of information in the DT lines.

hash keys: 
  ['created', 'sequence', 'annotation']

Since UniProtKB release 7.0 of 07-Feb-2006, the DT line format is changed, and the word “annotation” is no longer used in DT lines. Despite the change, the word “annotation” is still used for keeping compatibility.

returns a String of information in the DT lines by a given key.

DT Line; date (3/entry)

DT DD-MMM-YYYY (integrated into UniProtKB/XXXXX.)
DT DD-MMM-YYYY (sequence version NN)
DT DD-MMM-YYYY (entry version NN)

The format was changed in UniProtKB release 7.0 of 07-Feb-2006. Below is the older format.

Old format of DT Line; date (3/entry)

DT DD-MMM-YYYY (rel. NN, Created)
DT DD-MMM-YYYY (rel. NN, Last sequence update)
DT DD-MMM-YYYY (rel. NN, Last annotation update)
# File lib/bio/db/embl/sptr.rb, line 158
# Returns the information held in the DT lines.
#
# Without +key+ a Hash with the keys 'created', 'sequence' and 'annotation'
# is returned, one per DT line in order; each value is the line with the
# first "two word characters + three spaces" run removed and surrounding
# whitespace stripped. With +key+ only that value is returned. The Hash is
# cached in @data['DT'].
def dt(key = nil)
  return dt[key] if key

  cached = @data['DT']
  return cached if cached

  rows = get('DT').split(/\n/)
  untag = lambda { |row| row.sub(/\w{2}   /, '').strip }
  @data['DT'] = {
    'created'    => untag.call(rows[0]),
    'sequence'   => untag.call(rows[1]),
    'annotation' => untag.call(rows[2])
  }
end
embl_dr(key = nil)

Backup Bio::EMBLDB#dr as #embl_dr

Alias for: dr
entry()
Alias for: entry_id
entry_id() click to toggle source

returns a ENTRY_NAME in the ID line.

# File lib/bio/db/embl/sptr.rb, line 99
# Returns the ENTRY_NAME field of the ID line (looked up through id_line).
def entry_id
  field = 'ENTRY_NAME'
  id_line(field)
end
Also aliased as: entry_name, entry
entry_name()
Alias for: entry_id
ft(feature_key = nil) click to toggle source

returns contents in the feature table.

Examples

sp = Bio::SPTR.new(entry)
ft = sp.ft
ft.class #=> Hash
ft.keys.each do |feature_key|
  ft[feature_key].each do |feature|
    feature['From'] #=> '1'
    feature['To']   #=> '21'
    feature['Description'] #=> ''
    feature['FTId'] #=> ''
    feature['diff'] #=> []
    feature['original'] #=> [feature_key, '1', '21', '', '']
  end
end
  • #ft -> Hash

    {FEATURE_KEY => [{'From' => int, 'To' => int, 
                      'Description' => aStr, 'FTId' => aStr,
                      'diff' => [original_residues, changed_residues],
                      'original' => aAry }],...}

returns an Array of the information about the feature_name in the feature table.

  • #ft -> Array of Hash

    [{'From' => str, 'To' => str, 'Description' => str, 'FTId' => str},...]

FT Line; feature table data (>=0, optional)

Col     Data item
-----   -----------------
 1- 2   FT
 6-13   Feature name 
15-20   `FROM' endpoint
22-27   `TO' endpoint
35-75   Description (>=0 per key)
-----   -----------------

Note: 'FROM' and 'TO' endpoints are allowed to contain non-numeric characters such as '<', '>' or '?' (e.g. '<1', '?42').

See also www.expasy.org/sprot/userman.html#FT_line

# File lib/bio/db/embl/sptr.rb, line 1196
# Returns the contents of the feature table (FT lines).
#
# Without an argument a Hash is returned that maps each feature key to an
# Array of feature Hashes with the keys 'From', 'To' (Integers obtained with
# to_i after removing the first non-digit character), 'Description', 'FTId',
# 'diff' and 'original' (the raw field Array). With +feature_key+ only the
# Array stored under that key is returned. The Hash is cached in @data['FT'].
#
# For VARSPLIC, VARIANT, VAR_SEQ and CONFLICT features 'diff' is set to
# [original_residues, changed_residues]: taken from an "X -> Y" description,
# or [subsequence, ''] for a "Missing" description; both are nil when the
# description matches neither form.
#
# Raises RuntimeError ("Invalid FT Lines...") when anything fails while the
# table is being parsed.
def ft(feature_key = nil)
  return ft[feature_key] if feature_key
  return @data['FT'] if @data['FT']

  table = []
  begin
    get('FT').split("\n").each do |line|
      if line =~ /^FT   \w/
        feature = line.chomp.ljust(74)
        table << [feature[ 5..12].strip,   # Feature Name
                  feature[14..19].strip,   # From
                  feature[21..26].strip,   # To
                  feature[34..74].strip ]  # Description
      else
        # Continuation line: append its text to the current feature. The
        # non-destructive sub never yields nil, unlike sub!.
        table.last << line.chomp.sub(/^FT +/, '')
      end
    end

    # Joining Description lines; a trailing /FTId= field is kept separate.
    table = table.map { |feature|
      ftid = feature.pop if feature.last =~ /FTId=/
      if feature.size > 4
        feature = [feature[0],
                   feature[1],
                   feature[2],
                   feature[3, feature.size - 3].join(" ")]
      end
      feature << (ftid || '')
    }

    hash = {}
    table.each do |feature|
      key = feature[0]
      entry = {
        # Removing '<', '>' or '?' in FROM/TO endpoint.
        'From' => feature[1].sub(/\D/, '').to_i,
        'To'   => feature[2].sub(/\D/, '').to_i,
        'Description' => feature[3],
        'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
        'diff' => [],
        'original' => feature
      }
      (hash[key] ||= []) << entry

      case key
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case entry['Description']
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Copy both captures before gsub overwrites the match data.
          original_res = $1
          changed_res = $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(entry['From'], entry['To'])
          changed_res = ''
        end
        entry['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  @data['FT'] = hash
end
gene_name() click to toggle source

returns a String of the first gene name in the GN line.

# File lib/bio/db/embl/sptr.rb, line 438
# Returns the first gene name found in the GN line, or nil when gene_names
# has nothing to offer (it returns nil for entries without gene names, on
# which calling #first would raise NoMethodError).
def gene_name
  names = gene_names
  names ? names.first : nil
end
gene_names() click to toggle source

returns a Array of gene names in the GN line.

# File lib/bio/db/embl/sptr.rb, line 427
# Returns the gene names from the GN line.
#
# When the parsed GN data holds Hash records, the :name of every record is
# returned; otherwise the first element of the parsed data is returned as is
# (nil when the parsed data is empty).
def gene_names
  gn # fills @data['GN'] on first use
  records = @data['GN']
  head = records.first
  return head unless head.instance_of?(Hash)

  records.map { |record| record[:name] }
end
gn() click to toggle source

returns gene names in the GN line.

New UniProt/SwissProt format:

  • #gn -> [ <gene record>* ]

where <gene record> is:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}

Old format:

  • #gn -> Array # AND

  • #gn -> Array # OR

GN Line: Gene name(s) (>=0, optional)

# File lib/bio/db/embl/sptr.rb, line 351
# Returns the parsed gene names of the GN line, caching them in @data['GN'].
#
# The structured parser (gn_uniprot_parser) is used when the line contains
# any of "Name=", "ORFNames=", "OrderedLocusNames=" or "Synonyms=";
# otherwise gn_old_parser is used.
def gn
  cached = @data['GN']
  return cached if cached

  line = fetch('GN')
  markers = [/Name=/, /ORFNames=/, /OrderedLocusNames=/, /Synonyms=/]
  structured = markers.any? { |marker| marker === line }
  @data['GN'] = structured ? gn_uniprot_parser : gn_old_parser
  @data['GN']
end
hi() click to toggle source

The HI line

#hi #=> Array of Hash

# File lib/bio/db/embl/sptr.rb, line 691
# Returns the parsed HI line as an Array of Hashes, cached in @data['HI'].
#
# The line is split on ". "; each piece is split once more on ": " into a
# category and a "; "-separated keyword list. The last keyword of the list is
# stored separately under 'Keyword' (without a trailing period) and the
# remaining ones under 'Keywords'.
def hi
  return @data['HI'] if @data['HI']

  @data['HI'] = []
  fetch('HI').split(/\. /).each do |hlist|
    category, keywords = hlist.split(': ')
    keywords = keywords.split('; ')
    keyword = keywords.pop
    keyword.sub!(/\.$/, '')
    @data['HI'] << {'Category' => category,
                    'Keywords' => keywords,
                    'Keyword'  => keyword}
  end
  @data['HI']
end
id_line(key = nil) click to toggle source

returns a Hash of the ID line.

returns a content (Int or String) of the ID line by a given key. Hash keys: ['ENTRY_NAME', 'DATA_CLASS', 'MOLECULE_TYPE', 'SEQUENCE_LENGTH']

ID Line (since UniProtKB release 9.0 of 31-Oct-2006)

ID   P53_HUMAN               Reviewed;         393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"Reviewed", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>nil}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"

ID Line (older style)

ID   P53_HUMAN      STANDARD;      PRT;   393 AA.
#"ID  #{ENTRY_NAME} #{DATA_CLASS}; #{MOLECULE_TYPE}; #{SEQUENCE_LENGTH}."

Examples

obj.id_line  #=> {"ENTRY_NAME"=>"P53_HUMAN", "DATA_CLASS"=>"STANDARD", 
                  "SEQUENCE_LENGTH"=>393, "MOLECULE_TYPE"=>"PRT"}

obj.id_line('ENTRY_NAME') #=> "P53_HUMAN"
# File lib/bio/db/embl/sptr.rb, line 74
# Returns the parsed ID line.
#
# Without +key+ a Hash with the keys 'ENTRY_NAME', 'DATA_CLASS',
# 'MOLECULE_TYPE' and 'SEQUENCE_LENGTH' is returned (and cached in
# @data['ID']); with +key+ only that value is returned.
#
# The raw line in @orig['ID'] is split on runs of spaces. When the fifth
# field is "AA." the line is in the newer layout (after UniProtKB release 9.0
# of 31-Oct-2006, http://www.uniprot.org/docs/sp_news.htm), which has no
# molecule type; otherwise the molecule type is the fourth field and the
# length the fifth.
def id_line(key = nil)
  return id_line[key] if key

  cached = @data['ID']
  return cached if cached

  fields = @orig['ID'].split(/ +/)
  if fields[4].to_s.chomp == 'AA.'
    mol_type = nil
    length = fields[3].to_i
  else
    mol_type = fields[3].sub(/;/, '')
    length = fields[4].to_i
  end

  @data['ID'] = {
    'ENTRY_NAME'      => fields[1],
    'DATA_CLASS'      => fields[2].sub(/;/, ''),
    'MOLECULE_TYPE'   => mol_type,
    'SEQUENCE_LENGTH' => length
  }
end
molecule() click to toggle source

returns a MOLECULE_TYPE in the ID line.

A short-cut for #id_line('MOLECULE_TYPE').

# File lib/bio/db/embl/sptr.rb, line 109
# Returns the MOLECULE_TYPE field of the ID line (looked up through id_line).
def molecule
  field = 'MOLECULE_TYPE'
  id_line(field)
end
Also aliased as: molecule_type
molecule_type()
Alias for: molecule
oh() click to toggle source

The OH Line;

OH NCBI_TaxID=TaxID; HostName. br.expasy.org/sprot/userman.html#OH_line

# File lib/bio/db/embl/sptr.rb, line 521
# Returns the parsed OH line as an Array of Hashes with the keys
# 'NCBI_TaxID' and 'HostName', cached in @data['OH'].
#
# The line is split on ". "; every piece must contain "NCBI_TaxID=<digits>;".
# The host name is the text following "; " with a trailing period removed,
# or nil when nothing follows.
#
# Raises ArgumentError when a piece has no NCBI_TaxID.
def oh
  return @data['OH'] if @data['OH']

  hosts = fetch('OH').split("\. ").map { |x|
    taxid = x[/NCBI_TaxID=(\d+);/, 1]
    unless taxid
      raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
                            $!, "\n", get('OH'), "\n"].join
    end

    host_name = x[/NCBI_TaxID=\d+; (.+)/, 1]
    host_name = host_name.sub(/\.$/, '') if host_name

    {'NCBI_TaxID' => taxid, 'HostName' => host_name}
  }
  @data['OH'] = hosts
end
os(num = nil) click to toggle source

returns an Array of Hashes, or a String of the OS line when an index is given.

  • Bio::EMBLDB#os -> Array

[{'name' => '(Human)', 'os' => 'Homo sapiens'}, 
 {'name' => '(Rat)', 'os' => 'Rattus norvegicus'}]
{'name' => "(Human)", 'os' => 'Homo sapiens'}
  • #os['name'] -> “(Human)”

  • Bio::SPTR#os(0) -> “Homo sapiens (Human)”

OS Line; organism species (>=1)

OS   Genus species (name).
OS   Genus species (name0) (name1).
OS   Genus species (name0) (name1).
OS   Genus species (name0), G s0 (name0), and G s (name0) (name1).
OS   Homo sapiens (Human), and Rattus norvegicus (Rat)
OS   Hippotis sp. Clark and Watts 825.
OS   unknown cyperaceous sp.
# File lib/bio/db/embl/sptr.rb, line 460
# Returns the organisms named in the OS line.
#
# Without +num+ an Array of Hashes is returned, one per organism, with the
# keys 'os' (the species text) and 'name' (the first-to-last parenthesised
# part, or nil when there is none). With +num+ the String "<os> <name>" of
# the organism at that index is returned. The Array is cached in
# @data['OS'].
#
# Raises RuntimeError when a piece of the line has no recognizable species.
def os(num = nil)
  unless @data['OS']
    @data['OS'] = fetch('OS').split(/, and|, /).map { |chunk|
      organism = chunk[/(\w+ *[\w\d \:\\+\-\.]+[\w\d\.])/, 1]
      raise "Error: OS Line. #{$!}\n#{fetch('OS')}\n" unless organism

      {'name' => chunk[/(\(.+\))/, 1], 'os' => organism}
    }
  end

  organisms = @data['OS']
  return organisms unless num

  # EX. "Trifolium repens (white clover)"
  "#{organisms[num]['os']} #{organisms[num]['name']}"
end
ox() click to toggle source

returns a Hash of organism taxonomy cross-references.

  • #ox -> Hash

    {'NCBI_TaxID' => ['1234','2345','3456','4567'], ...}

OX Line; organism taxonomy cross-reference (>=1 per entry)

OX   NCBI_TaxID=1234;
OX   NCBI_TaxID=1234, 2345, 3456, 4567;
# File lib/bio/db/embl/sptr.rb, line 504
# Returns the organism taxonomy cross-references of the OX line as a Hash
# mapping each database name to an Array of identifier Strings. The Hash is
# cached in @data['OX'].
def ox
  return @data['OX'] if @data['OX']

  xrefs = {}
  fetch('OX').sub(/\.$/, '').split(/;/).each do |item|
    db, refs = item.strip.split(/=/)
    xrefs[db] = refs.split(/, */)
  end
  @data['OX'] = xrefs
end
protein_name() click to toggle source

returns the proposed official name of the protein. Returns a String.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full name, which is taken from the “RecName: Full=” or “SubName: Full=” line, normally at the beginning of the DE lines. Unlike the parser for the old format, there is no special treatment for fragments or precursors.

For old format, the method parses the DE lines and returns the protein name as a String.

DE Line; description (>=1)

"DE #{OFFICIAL_NAME} (#{SYNONYM})"
"DE #{OFFICIAL_NAME} (#{SYNONYM}) [CONTAINS: #1; #2]."
OFFICIAL_NAME  1/entry
SYNONYM        >=0
CONTAINS       >=0
# File lib/bio/db/embl/sptr.rb, line 251
# Returns the protein name taken from the DE lines, as a String.
#
# New format (since UniProtKB release 14.0 of 22-Jul-2008): the 'Full' value
# of the first RecName/SubName record that has one, or "" when none has.
# Old format (before Rel. 13.x): the text preceding the first "(" of the part
# preceding the first "[", with " (Fragment)" appended when that part
# mentions a fragment.
def protein_name
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed = @data['DE']

  unless parsed
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    return "" unless de_line

    head = de_line[/^[^\[]*/] # everything preceding the first [ (the "contains" part)
    official = head[/^[^(]*/].strip
    official << ' (Fragment)' if head =~ /fragment/i
    return official
  end

  # since UniProtKB release 14.0 of 22-Jul-2008
  parsed.each do |a|
    next unless ['RecName', 'SubName'].include?(a[0])

    full = a[1..-1].find { |b| b[0] == 'Full' }
    return full[1].to_s if full
  end
  ''
end
ref() click to toggle source

returns contents in the R lines.

where <reference information Hash> is:

{'RN' => '', 'RC' => '', 'RP' => '', 'RX' => '', 
 'RA' => '', 'RT' => '', 'RL' => '', 'RG' => ''}

R Lines

  • RN RC RP RX RA RT RL RG

# File lib/bio/db/embl/sptr.rb, line 557
# Returns the reference blocks (R* lines) as an Array of Hashes keyed by
# 'RN', 'RC', 'RP', 'RX', 'RA', 'RT', 'RL' and 'RG', cached in @data['R'].
#
# The raw text is cut into one chunk per "RN" line; within a chunk the text
# of all lines sharing a tag is concatenated (each piece followed by a
# space) and then handed to the matching set_<TAG> method.
#
# Raises RuntimeError for a line that does not start with a known R tag.
def ref
  return @data['R'] if @data['R']

  tags = %w(RN RC RP RX RA RT RL RG)
  @data['R'] = get('R').split(/\nRN   /).map { |str|
    record = {}
    tags.each { |tag| record[tag] = '' }

    # Every chunk but the first lost its "RN   " prefix in the split.
    str = 'RN   ' + str unless /^RN   / =~ str

    str.split("\n").each do |line|
      unless /^(R[NPXARLCTG])   (.+)/ =~ line
        raise "Invalid format in R lines, \n[#{line}]\n"
      end
      record[$1] += $2 + ' '
    end

    tags.each { |tag| record[tag] = send("set_#{tag}", record[tag]) }
    record
  }
  @data['R']
end
references() click to toggle source

returns Bio::Reference object from Bio::EMBLDB::Common#ref.

# File lib/bio/db/embl/sptr.rb, line 651
# Returns a References object built from the Hashes returned by #ref,
# cached in @data['references'].
#
# For each reference: 'RA' is split on ", " into 'authors'; 'RT' becomes
# 'title'; 'RL' is split into journal/volume/issue/pages/year when it has
# the form "<journal> <vol> (<issue>), <from>-<to> (<year>)" and is stored
# whole as 'journal' otherwise; every 'RX' tag is stored under its
# lower-cased name.
def references
  return @data['references'] if @data['references']

  list = ref.map { |ent|
    attrs = Hash.new('')
    ent.each { |key, value|
      case key
      when 'RA' then attrs['authors'] = value.split(/, /)
      when 'RT' then attrs['title'] = value
      when 'RL'
        if value =~ /(.*) (\d+) \((\d+)\), (\d+-\d+) \((\d+)\)$/
          attrs['journal'], attrs['volume'], attrs['issue'],
            attrs['pages'], attrs['year'] = $1, $2, $3, $4, $5
        else
          attrs['journal'] = value
        end
      when 'RX'  # PUBMED, MEDLINE, DOI
        value.each { |tag, xref| attrs[tag.downcase] = xref }
      end
    }
    Reference.new(attrs)
  }
  @data['references'] = References.new(list)
end
seq() click to toggle source

returns a Bio::Sequence::AA of the amino acid sequence.

blank Line; sequence data (>=1)

# File lib/bio/db/embl/sptr.rb, line 1306
# Returns the amino acid sequence as a Sequence::AA built from the sequence
# field with all spaces and digit runs removed; cached in @data[''].
def seq
  @data[''] ||= Sequence::AA.new(fetch('').gsub(/ |\d+/, ''))
end
Also aliased as: aaseq
sequence_length() click to toggle source

returns a SEQUENCE_LENGTH in the ID line.

A short-cut for #id_line('SEQUENCE_LENGTH').

# File lib/bio/db/embl/sptr.rb, line 118
# Returns the SEQUENCE_LENGTH field of the ID line (looked up through
# id_line).
def sequence_length
  field = 'SEQUENCE_LENGTH'
  id_line(field)
end
Also aliased as: aalen
set_RN(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 588
# Normalizes the accumulated RN text: returns +data+ without leading and
# trailing whitespace.
def set_RN(data)
  trimmed = data.strip
  trimmed
end
sq(key = nil) click to toggle source

returns a Hash of the contents of the SQ lines.

  • Bio::SPTR#sq -> hsh

returns the value of a given key in the SQ lines.

  • Bio::SPTR#sq(key) -> int or str

  • Keys: ['MW', 'mw', 'molecular', 'weight', 'aalen', 'len', 'length',

    'CRC64']

SQ Line; sequence header (1/entry)

SQ   SEQUENCE   233 AA;  25630 MW;  146A1B48A1475C86 CRC64;
SQ   SEQUENCE  \d+ AA; \d+ MW;  [0-9A-Z]+ CRC64;

MW, Dalton unit. CRC64 (64-bit Cyclic Redundancy Check, ISO 3309).

# File lib/bio/db/embl/sptr.rb, line 1278
# Returns the contents of the SQ line.
#
# Without +key+ a Hash with the keys 'aalen', 'MW' (both Integers) and
# 'CRC64' is returned (cached in @data['SQ']). With +key+ a single value is
# returned: keys matching mw/molecular/weight select 'MW', keys matching
# len/length/AA select 'aalen', anything else is looked up in the Hash as is.
#
# Raises RuntimeError when the SQ line does not have the expected layout.
def sq(key = nil)
  unless @data['SQ']
    found = /(\d+) AA\; (\d+) MW; (.+) CRC64;/.match(fetch('SQ'))
    raise "Invalid SQ Line: \n'#{fetch('SQ')}'" unless found

    @data['SQ'] = {'aalen' => found[1].to_i,
                   'MW'    => found[2].to_i,
                   'CRC64' => found[3]}
  end

  info = @data['SQ']
  return info unless key

  case key
  when /mw/, /molecular/, /weight/ then info['MW']
  when /len/, /length/, /AA/       then info['aalen']
  else info[key]
  end
end
synonyms() click to toggle source

returns synonyms (unofficial and/or alternative names). Returns an Array containing String objects.

Since UniProtKB release 14.0 of 22-Jul-2008, the DE line format has been changed. The method returns the full or short names which are taken from “RecName: Short=”, “RecName: EC=”, and AltName lines, except after “Contains:” or “Includes:”. To keep compatibility with the old format parser, “RecName: EC=N.N.N.N” is reported as “EC N.N.N.N”. In addition, to prevent confusion, “Allergen=” and “CD_antigen=” prefixes are added for the corresponding fields.

For old format, the method parses the DE lines and returns synonyms. synonyms are each placed in () following the official name on the DE line.

# File lib/bio/db/embl/sptr.rb, line 294
# Returns the synonyms (unofficial and/or alternative names) found in the DE
# lines, as an Array of Strings.
#
# New format (since UniProtKB release 14.0 of 22-Jul-2008): every value of
# the RecName, SubName and AltName records that differs from protein_name,
# up to the first Includes/Contains record. 'EC' values are reported as
# "EC <value>"; 'Allergen' and 'CD_antigen' values as "<field>=<value>".
# Old format (before Rel. 13.x): the parenthesised names following the
# official name, ignoring anything between "[" and "]" and any name
# mentioning a fragment.
def synonyms
  ary = []
  @data['DE'] ||= parse_DE_line_rel14(get('DE'))
  parsed_de_line = @data['DE']

  if parsed_de_line
    # since UniProtKB release 14.0 of 22-Jul-2008
    # The official name does not change while looping: look it up once.
    official_name = protein_name
    parsed_de_line.each do |a|
      case a[0]
      when 'Includes', 'Contains'
        break # the each loop
      when 'RecName', 'SubName', 'AltName'
        a[1..-1].each do |b|
          next unless b[1] && b[1] != official_name

          name = case b[0]
                 when 'EC'
                   "EC " + b[1]
                 when 'Allergen', 'CD_antigen'
                   b[0] + '=' + b[1]
                 else
                   b[1]
                 end
          ary.push name
        end
      end
    end
  else
    # old format (before Rel. 13.x)
    de_line = fetch('DE')
    if de_line
      # ignore stuff between [ and ]. That's the "contains" part
      line = de_line.sub(/\[.*\]/, '')
      line.scan(/\([^)]+/) do |synonym|
        # index to remove the leading (
        ary << synonym[1..-1].strip unless synonym =~ /fragment/i
      end
    end
  end
  ary
end

Private Instance Methods

cc_alternative_products(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 913
# Parses the bodies of the "ALTERNATIVE PRODUCTS" comment.
#
# +data+ is the Array of comment bodies for the topic; nil is passed through
# as nil. The bodies are concatenated and a Hash is returned with:
#   'Event'          - Array of event names (split on ", "), or "" if absent
#   'Named isoforms' - the isoform count as a String, or "" if absent
#   'Comment'        - the comment text, or "" if absent
#   'Variants'       - Array with one parsed Hash per "Name=...Sequence=...;"
def cc_alternative_products(data)
  # The guard has to come before join: join itself never returns nil.
  return data unless data

  ap = data.join('')

  # Event, Named isoforms, Comment, [Name, Synonyms, IsoId, Sequence]+
  tmp = {'Event' => "", 'Named isoforms' => "", 'Comment' => "",
         'Variants'  => []}
  if /Event=(.+?);/ =~ ap
    tmp['Event'] = $1.sub(/;/, '').split(/, /)
  end
  if /Named isoforms=(\S+?);/ =~ ap
    tmp['Named isoforms'] = $1
  end
  if /Comment=(.+?);/m =~ ap
    tmp['Comment'] = $1
  end
  ap.scan(/Name=.+?Sequence=.+?;/).each do |ent|
    tmp['Variants'] << cc_alternative_products_variants(ent)
  end
  tmp
end
cc_alternative_products_variants(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 937
# Parses one "Name=...; Synonyms=...; IsoId=...; Sequence=...;" isoform
# description into a Hash.
#
# The text is split on "; " into key=value pairs. The values of 'Sequence',
# 'Synonyms' and 'IsoId' become Arrays (first ";" removed, split on ", ");
# any other value is stored as found.
def cc_alternative_products_variants(data)
  variant = {'Name' => '', 'Synonyms' => [], 'IsoId' => [], 'Sequence' => []}
  list_fields = ['Sequence', 'Synonyms', 'IsoId']

  data.split(/; /).each do |pair|
    field, value = pair.split(/=/)
    value = value.sub(/;/, '').split(/, /) if list_fields.include?(field)
    variant[field] = value
  end
  variant
end
cc_biophysiochemical_properties(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 951
# Parses the first body of the "BIOPHYSICOCHEMICAL PROPERTIES" comment.
#
# Returns a Hash with the keys 'Absorption' and 'Kinetic parameters' (Hashes
# filled with whatever of 'Abs(max)', 'KM', 'Vmax' and 'Note' is found) and
# 'pH dependence', 'Redox potential' and 'Temperature dependence' (Strings,
# "" when absent). Each value is the text up to the next ";".
def cc_biophysiochemical_properties(data)
  text = data[0]

  absorption = {}
  absorption['Abs(max)'] = $1 if text =~ /Absorption: Abs\(max\)=(.+?);/
  absorption['Note'] = $1 if text =~ /Absorption: Abs\(max\)=.+; Note=(.+?);/

  kinetics = {}
  if text =~ /Kinetic parameters: KM=(.+?); Vmax=(.+?);/
    kinetics['KM'], kinetics['Vmax'] = $1, $2
  end
  kinetics['Note'] = $1 if text =~ /Kinetic parameters: KM=.+; Vmax=.+; Note=(.+?);/

  ph = text =~ /pH dependence: (.+?);/ ? $1 : ""
  redox = text =~ /Redox potential: (.+?);/ ? $1 : ""
  temperature = text =~ /Temperature dependence: (.+?);/ ? $1 : ""

  {'Absorption' => absorption,
   'Kinetic parameters' => kinetics,
   'pH dependence' => ph,
   'Redox potential' => redox,
   'Temperature dependence' => temperature}
end
cc_caution(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 986
# Concatenates the bodies of the "CAUTION" comment into a single String.
def cc_caution(data)
  separator = ''
  data.join(separator)
end
cc_interaction(data) click to toggle source

returns the contents of a line of the CC INTERACTION section.

CC       P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
# File lib/bio/db/embl/sptr.rb, line 995
# Parses the bodies of the "INTERACTION" comment.
#
# The bodies are concatenated and every "<partner>; NbExp=<n>; IntAct=<ids>;"
# item becomes a Hash with the keys 'SP_Ac', 'identifier', 'NbExp', 'IntAct'
# (ids split on ", ") and 'optional_identifier'. A partner written as
# "AC:ID" supplies accession and identifier; a partner containing "Self"
# uses this entry's entry_id for both. Text after a space following "AC:ID"
# becomes the optional identifier.
def cc_interaction(data)
  items = data.join('').scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
  items.map { |ent|
    partner, nb_exp, int_act = ent.map { |x| x.strip }

    spac = nil
    spid = nil
    optid = nil
    if partner =~ /^(.+):(.+)/
      spac = $1
      spid = $2.split(' ')[0]
    elsif partner =~ /Self/
      spac = self.entry_id
      spid = self.entry_id
    end
    optid = $1 if partner =~ /^.+:.+ (.+)/

    {'SP_Ac' => spac,
     'identifier' => spid,
     'NbExp' => nb_exp,
     'IntAct' => int_act.split(', '),
     'optional_identifier' => optid}
  }
end
cc_mass_spectrometry(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 1023
# Parses the bodies of the "MASS SPECTROMETRY" comment.
#
#   MASS SPECTROMETRY: MW=XXX[; MW_ERR=XX][; METHOD=XX][;RANGE=XX-XX].
#
# Returns one Hash per body with the keys 'MW', 'MW_ERR', 'METHOD', 'RANGE'
# and 'NOTE' (nil when absent; values are kept as Strings). nil is passed
# through as nil.
def cc_mass_spectrometry(data)
  return data unless data

  # Checked in this order; the first pattern matching a field wins.
  patterns = [['MW',     /MW=(.+)/],
              ['MW_ERR', /MW_ERR=(.+)/],
              ['METHOD', /METHOD=(.+)/],
              ['RANGE',  /RANGE=(\d+-\d+)/],
              ['NOTE',   /NOTE=(.+)/]]

  data.map { |m|
    mass = {'MW' => nil, 'MW_ERR' => nil, 'METHOD' => nil, 'RANGE' => nil,
            'NOTE' => nil}
    # The last character of the body is dropped before splitting on ";".
    m.sub(/.$/, '').split(/;/).each do |field|
      name, pattern = patterns.find { |pair| pair[1] =~ field }
      mass[name] = field[pattern, 1] if name
    end
    mass
  }
end
cc_pathway(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 1050
# Parses the "PATHWAY" comment: every body loses its trailing period and is
# split on "; ", " and " or ": "; the parts of the first body are returned
# (nil when there is no body).
def cc_pathway(data)
  parsed = data.map { |body| body.sub(/\.$/, '').split(/; | and |: /) }
  parsed.first
end
cc_rna_editing(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 1058
# Parses the bodies of the "RNA EDITING" comment.
#
# The bodies are concatenated and a Hash is returned with the keys
# 'Modified_positions' (Array of Strings, split on ", ") and 'Note' (the
# text following "Note=", or "" when absent).
#
# Raises ArgumentError when no "Modified_positions=" field is found.
def cc_rna_editing(data)
  data = data.join('')
  entry = {'Modified_positions' => [], 'Note' => ""}

  unless data =~ /Modified_positions=(.+?)(\.|;)/
    raise ArgumentError, "Invalid CC RNA Editing lines (#{self.entry_id}):#{$!}\n#{get('CC')}"
  end
  entry['Modified_positions'] = $1.sub(/\.$/, '').split(', ')

  entry['Note'] = $1 if data =~ /Note=(.+)/
  entry
end
cc_subcellular_location(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 1074
# Parses the "SUBCELLULAR LOCATION" comment: every body is split on ". "
# into sentences and every sentence on "; " into parts (each without a
# trailing period). The nested Array for the first body is returned (nil
# when there is no body).
def cc_subcellular_location(data)
  parsed = data.map do |body|
    body.split('. ').map do |sentence|
      sentence.split('; ').map { |part| part.sub(/\.$/, '') }
    end
  end
  parsed[0]
end
cc_web_resource(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 1092
# Parses the bodies of the "WEB RESOURCE" comment into one Hash per body
# with the keys 'Name', 'Note' and 'URL' (nil when absent).
#
# Each body is split on ";". Name=/Note= fields are stored under their own
# name, NAME=/NOTE= fields under the capitalized name, and URL="..." under
# 'URL'; all values are stripped of surrounding whitespace.
def cc_web_resource(data)
  data.map { |x|
    entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
    x.split(';').each do |y|
      if (found = /(Name|Note)\=(.+)/.match(y))
        entry[found[1]] = found[2].strip
      elsif (found = /(NAME|NOTE)\=(.+)/.match(y))
        entry[found[1].downcase.capitalize] = found[2].strip
      elsif (found = /URL\=\"(.+)\"/.match(y))
        entry['URL'] = found[1].strip
      end
    end
    entry
  }
end
gn_old_parser() click to toggle source

returns contents in the old style GN line.

GN Line: Gene name(s) (>=0, optional)

GN   HNS OR DRDX OR OSMZ OR BGLY.
GN   CECA1 AND CECA2.
GN   CECA1 AND (HOGE OR FUGA).

GN NAME1 [(AND|OR) NAME]+.

#gn -> Array # AND

#gn[0] -> Array   # OR
#gene_names -> Array
# File lib/bio/db/embl/sptr.rb, line 375
# Parses an old-style GN line ("NAME1 [(AND|OR) NAME]+.") and stores the
# result in @data['GN'].
#
# The result is an Array with one element per " AND "-separated group; each
# element is the Array of the " OR "-separated names of that group, with
# parentheses removed. An empty Array is stored when there is no GN line.
def gn_old_parser
  groups = []
  if get('GN').size > 0
    groups = fetch('GN').sub(/\.$/, '').split(/ AND /).map { |group|
      group.gsub(/\(|\)/, '').split(/ OR /).map { |name| name.strip }
    }
  end
  @data['GN'] = groups
end
gn_uniprot_parser() click to toggle source

returns contents in the structured GN line. The new format of the GN line is:

GN   Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
GN   ORFNames=[, ...];
  • #gn -> [ <gene record>* ]

where <gene record> is:

{ :name => '...', 
  :synonyms => [ 's1', 's2', ... ],
  :loci   => [ 'l1', 'l2', ... ],
  :orfs     => [ 'o1', 'o2', ... ] 
}
# File lib/bio/db/embl/sptr.rb, line 401
# Parses a structured GN line and stores the result in @data['GN'].
#
#   GN   Name=; Synonyms=[, ...]; OrderedLocusNames=[, ...];
#   GN   ORFNames=[, ...];
#
# Returns an Array with one Hash per gene record:
#   { :name => '...', :synonyms => [...], :loci => [...], :orfs => [...] }
#
# Gene records are separated by the word "and" standing on its own, i.e.
# surrounded by whitespace; splitting on a bare "and" would also cut through
# names such as "Hand2" or "Dhand".
def gn_uniprot_parser
  @data['GN'] = []
  gn_line = fetch('GN').strip
  records = gn_line.split(/\s+and\s+/)
  records.each do |record|
    gene_hash = {:name => '', :synonyms => [], :loci => [], :orfs => []}
    # Elements keep their ";" terminator, hence the [0..-2] below.
    record.each_line(';') do |element|
      case element
      when /Name=/ then
        gene_hash[:name] = $'[0..-2]
      when /Synonyms=/ then
        gene_hash[:synonyms] = $'[0..-2].split(/\s*,\s*/)
      when /OrderedLocusNames=/ then
        gene_hash[:loci] = $'[0..-2].split(/\s*,\s*/)
      when /ORFNames=/ then
        gene_hash[:orfs] = $'[0..-2].split(/\s*,\s*/)
      end
    end
    @data['GN'] << gene_hash
  end
  return @data['GN']
end
parse_DE_line_rel14(str) click to toggle source

(private) parses DE line (description lines) since UniProtKB release 14.0 of 22-Jul-2008

Return array containing array.

www.uniprot.org/docs/sp_news.htm

# File lib/bio/db/embl/sptr.rb, line 177
# Parses DE lines written in the format used since UniProtKB release 14.0 of
# 22-Jul-2008.
#
# +str+ is the raw DE field, one "DE   ..." line per text line. Returns nil
# when no line starts with "DE   RecName: ", "DE   AltName: " or
# "DE   SubName: " (i.e. the text is not in the new format).
#
# Otherwise returns an Array of records, each itself an Array:
#   [ 'RecName' | 'AltName' | 'SubName', [subcategory, description], ... ]
#   [ 'Includes' ] or [ 'Contains' ]
#   [ 'Flags', [flag, ...] ]
# A "key=value" line that belongs to no open category is collected under a
# record whose category is ''. Lines that cannot be interpreted are skipped
# with a warning.
def parse_DE_line_rel14(str)
  # Returns if it is not the new format since Rel.14
  return nil unless /^DE   (RecName|AltName|SubName)\: / =~ str
  ret = []
  # Record that following "key=value" lines are appended to (nil when none).
  cur = nil
  str.each_line do |line|
    case line
    when /^DE   (Includes|Contains)\: *$/
      # Section marker: recorded on its own; it closes the current record.
      cur = [ $1 ]
      ret.push cur
      cur = nil
      #subcat_and_desc = nil
      next
    when /^DE   *(RecName|AltName|SubName)\: +(.*)/
      # Opens a new name record; the rest of the line is handled below.
      category = $1
      subcat_and_desc = $2
      cur = [ category ]
      ret.push cur
    when /^DE   *(Flags)\: +(.*)/
      # Flags are stored as one list; the record is closed right away.
      category = $1
      desc = $2
      flags = desc.strip.split(/\s*\;\s*/) || []
      cur = [ category, flags ]
      ret.push cur
      cur = nil
      #subcat_and_desc = nil
      next
    when /^DE   *(.*)/
      # Continuation line: belongs to the record that is currently open.
      subcat_and_desc = $1
    else
      warn "Warning: skipped DE line in unknown format: #{line.inspect}"
      #subcat_and_desc = nil
      next
    end
    case subcat_and_desc
    when nil
      # does nothing
    when /\A([^\=]+)\=(.*)/
      subcat = $1
      desc = $2
      # Drop the ";" (and trailing whitespace) that terminates the value.
      desc.sub!(/\;\s*\z/, '')
      unless cur
        warn "Warning: unknown category in DE line: #{line.inspect}"
        cur = [ '' ]
        ret.push cur
      end
      cur.push [ subcat, desc ]
    else
      warn "Warning: skipped DE line description in unknown format: #{line.inspect}"
    end
  end
  ret
end
set_RA(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 626
# Normalizes the accumulated RA text: returns +data+ without its final ";"
# and any spaces following it.
def set_RA(data)
  data.sub(/; *$/, '')
end
set_RC(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 592
# Parses the accumulated RC text ("TOKEN=text[, text...]; TOKEN=text; ...")
# into an Array of {'Token' => token, 'Text' => text} Hashes, one per text.
#
# Tokens are words starting with S, T or P; the texts of one token are
# separated by ", and " or ", ". The value of a token ends at the next ";"
# (a lazy match): a greedy match would run up to the last ";" and swallow
# the tokens that follow into the text of the first one.
def set_RC(data)
  data.scan(/([STP]\w+)=(.+?);/).map { |token, texts|
    texts.split(/, and |, /).map { |text|
      {'Token' => token, 'Text' => text}
    }
  }.flatten
end
set_RG(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 642
# Splits the accumulated RG text on "; " and returns the resulting Array.
def set_RG(data)
  data.split('; ')
end
set_RL(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 637
# Normalizes the accumulated RL text: returns +data+ without leading and
# trailing whitespace.
def set_RL(data)
  data.strip
end
set_RP(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 601
# Parses the accumulated RP text into an Array of Strings.
#
# The text is stripped, loses a trailing period and is split on ", AND " or
# ", " (case-insensitively); every part is stripped and has each pair of
# consecutive spaces replaced by a single one.
def set_RP(data)
  data.strip.sub(/\.$/, '').split(/, AND |, /i).map { |part|
    part.strip.gsub('  ', ' ')
  }
end
set_RT(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 631
# Normalizes the accumulated RT text: removes the final ";" (with any spaces
# after it) and then any double quote at the start or end of a line.
def set_RT(data)
  data.sub(/; *$/, '').gsub(/(^"|"$)/, '')
end
set_RX(data) click to toggle source
# File lib/bio/db/embl/sptr.rb, line 611
# Parses the accumulated RX text into a Hash with the keys 'MEDLINE',
# 'PubMed' and 'DOI'; each value is the text between "<KEY>=" and the next
# ";", or nil when the key is absent.
def set_RX(data)
  rx = {'MEDLINE' => nil, 'PubMed' => nil, 'DOI' => nil}
  rx['MEDLINE'] = $1 if data =~ /MEDLINE=(.+?);/
  rx['PubMed'] = $1 if data =~ /PubMed=(.+?);/
  rx['DOI'] = $1 if data =~ /DOI=(.+?);/
  rx
end