From: Martin Hansen Newsgroups: comp.lang.ruby Subject: Need for speed -> a C extension? Date: Mon, 18 Apr 2011 10:15:53 -0500 Organization: Service de news de lacave.net Lines: 264 Message-ID: NNTP-Posting-Host: bristol.highgroove.com Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit X-Trace: talisker.lacave.net 1303142271 77839 65.111.164.187 (18 Apr 2011 15:57:51 GMT) X-Complaints-To: abuse@lacave.net NNTP-Posting-Date: Mon, 18 Apr 2011 15:57:51 +0000 (UTC) X-Received-From: This message has been automatically forwarded from the ruby-talk mailing list by a gateway at comp.lang.ruby. If it is SPAM, it did not originate at comp.lang.ruby. Please report the original sender, and not us. Thanks! For more details about this gateway, please visit: http://blog.grayproductions.net/categories/the_gateway X-Mail-Count: 381781 X-Ml-Name: ruby-talk X-Rubymirror: Yes X-Ruby-Talk: Path: csiph.com!x330-a1.tempe.blueboxinc.net!usenet.pasdenom.info!news.stben.net!talisker.lacave.net!lacave.net!not-for-mail Xref: x330-a1.tempe.blueboxinc.net comp.lang.ruby:3105 Hello all, The below code is too slow for practical use. I need it to run at least 20 times faster. Perhaps that is possible with some C code? I have no experience with writing Ruby extensions. What are the pitfalls? Which part of the code should be ported? Any pointers to get me started? Cheers, Martin #!/usr/bin/env ruby # IUPAC nucleotide pair ambiguity equivalents are saved in an # array of bit fields. BIT_A = 1 << 0 BIT_T = 1 << 1 BIT_C = 1 << 2 BIT_G = 1 << 3 EQUAL = Array.new(256, 0) EQUAL['A'.ord] = BIT_A EQUAL['T'.ord] = BIT_T EQUAL['U'.ord] = BIT_T EQUAL['C'.ord] = BIT_C EQUAL['G'.ord] = BIT_G EQUAL['M'.ord] = (BIT_A|BIT_C) EQUAL['R'.ord] = (BIT_A|BIT_G) EQUAL['W'.ord] = (BIT_A|BIT_T) EQUAL['S'.ord] = (BIT_C|BIT_G) EQUAL['Y'.ord] = (BIT_C|BIT_T) EQUAL['K'.ord] = (BIT_G|BIT_T) EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T) EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T) EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T) EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G) EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T) # Module containing code to locate nucleotide patterns in sequences allowing for # ambiguity codes and a given maximum edit distance. # Insertions are nucleotides found in the pattern but not in the sequence. # Deletions are nucleotides found in the sequence but not in the pattern. # # Inspired by the paper by Bruno Woltzenlogel Paleo (page 197): # http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf module PatternMatcher # ------------------------------------------------------------------------------ # str.match(pattern[, pos[, max_edit_distance]]) # -> Match or nil # # ------------------------------------------------------------------------------ # Method to locate the next pattern match starting from a given position. A match # is allowed to contain a given maximum edit distance. If a match is located a # Match object will be returned otherwise nil. def match(pattern, pos = 0, max_edit_distance = 0) @pattern = pattern @pos = pos @max_edit_distance = max_edit_distance @vector = vector_init while @pos < @seq.length vector_update return match_new if match_found? @pos += 1 end end # ------------------------------------------------------------------------------ # str.scan(pattern[, pos[, max_edit_distance]]) # -> Array # str.scan(pattern[, pos[, max_edit_distance]]) { |match| # block # } # -> Match # # ------------------------------------------------------------------------------ # Method to iterate through a sequence to locate pattern matches starting # from a given position and allowing for a maximum edit distance. # Matches found in block context return the Match object. Otherwise matches are # returned in an Array. def scan(pattern, pos = 0, max_edit_distance = 0) matches = [] while match = match(pattern, pos, max_edit_distance) if block_given? yield match else matches << match end pos = match.pos + 1 end return matches unless block_given? end private # Method to initailize the score vector and return this. def vector_init vector = [] (0 ... @pattern.length + 1).each do |i| vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i) end vector end # Method to update the score vector. def vector_update new_vector = @vector.dup (0 ... @pattern.length).each do |i| if match?(@seq[@pos], @pattern[i]) new_vector[i + 1] = @vector[i].dup new_vector[i + 1].matches += 1 else mismatch = @vector[i].dup insertion = new_vector[i].dup deletion = @vector[i + 1].dup if deletion?(mismatch, insertion, deletion) deletion.deletions += 1 new_vector[i + 1] = deletion elsif mismatch?(mismatch, insertion, deletion) mismatch.mismatches += 1 new_vector[i + 1] = mismatch elsif insertion?(mismatch, insertion, deletion) insertion.insertions += 1 new_vector[i + 1] = insertion end end end @vector = new_vector end # Method to determine if a match occurred. def match?(char1, char2) (EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0 end # Method to determine if a mismatch occured. def mismatch?(mismatch, insertion, deletion) if mismatch.edit_distance <= insertion.edit_distance and mismatch.edit_distance <= deletion.edit_distance true end end # Method to determine if an insertion occured. def insertion?(mismatch, insertion, deletion) if insertion.edit_distance <= mismatch.edit_distance and insertion.edit_distance <= deletion.edit_distance true end end # Method to determine if a deletion occured. def deletion?(mismatch, insertion, deletion) if deletion.edit_distance <= mismatch.edit_distance and deletion.edit_distance <= insertion.edit_distance true end end # Method to print the score vector. def vector_print @vector.each do |s| puts s end puts end # Method that returns a Match object initialized with # information from the score vector. def match_new matches = @vector.last.matches mismatches = @vector.last.mismatches insertions = @vector.last.insertions deletions = @vector.last.deletions length = @pattern.length - insertions + deletions pos = @pos - length + 1 match = @seq[pos ... pos + length] Match.new(pos, match, matches, mismatches, insertions, deletions, length) end # Method that determines if a match was found by analyzing the score vector. def match_found? if @vector.last.edit_distance <= @max_edit_distance true end end # Class to instantiate Score objects that holds score information. class Score attr_accessor :matches, :mismatches, :insertions, :deletions def initialize(matches = 0, mismatches = 0, insertions = 0, deletions = 0) @matches = matches @mismatches = mismatches @insertions = insertions @deletions = deletions end # Method to calculate and return the edit distance. def edit_distance self.mismatches + self.insertions + self.deletions end private def to_s "(#{[self.matches, self.mismatches, self.insertions, self.deletions].join(',')})" end end # Class for creating Match objects which contain the description of a # match between a nucleotide sequence and a pattern. class Match attr_reader :pos, :match, :matches, :mismatches, :insertions, :deletions, :edit_distance, :length def initialize(pos, match, matches, mismatches, insertions, deletions, length) @pos = pos @match = match @matches = matches @mismatches = mismatches @insertions = insertions @deletions = deletions @edit_distance = mismatches + insertions + deletions @length = length end end end -- Posted via http://www.ruby-forum.com/.