Path: csiph.com!usenet.pasdenom.info!gegeweb.org!de-l.enfer-du-nord.net!feeder1.enfer-du-nord.net!feeds.phibee-telecom.net!newsfeed.xs4all.nl!newsfeed5.news.xs4all.nl!xs4all!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.000 X-Spam-Evidence: '*H*': 1.00; '*S*': 0.00; 'value,': 0.03; 'args': 0.04; 'string.': 0.04; '"__main__":': 0.07; '__name__': 0.07; 'main()': 0.07; 'parser': 0.07; '*end*': 0.09; 'collections': 0.09; 'parsers': 0.09; 'received:80.91': 0.09; 'received:80.91.229': 0.09; 'received:gmane.org': 0.09; 'received:list': 0.09; 'sep': 0.09; 'def': 0.10; '"*"': 0.16; '"sample': 0.16; '(x,': 0.16; '1):': 0.16; '1.05': 0.16; '2),': 0.16; 'argparse': 0.16; 'b):': 0.16; 'combinations': 0.16; 'contextlib': 0.16; 'count)': 0.16; 'defaultdict': 0.16; 'itertools': 0.16; 'main():': 0.16; 'measured': 0.16; 'pairs': 0.16; 'received:80.91.229.3': 0.16; 'received:dip.t-dialin.net': 0.16; 'received:plane.gmane.org': 0.16; 'received:t-dialin.net': 0.16; 'statistics,': 0.16; 'wed,': 0.16; 'wrote:': 0.17; 'char': 0.17; 'comparing': 0.17; 'skip:{ 20': 0.17; 'yield': 0.17; 'skip:p 30': 0.20; 'import': 0.21; 'demonstrate': 0.23; 'random': 0.24; 'header:User-Agent:1': 0.26; 'common': 0.26; 'skip:@ 10': 0.27; 'skip:e 30': 0.27; "doesn't": 0.28; 'header:X-Complaints-To:1': 0.28; 'chris': 0.28; 'tail': 0.29; 'words': 0.29; 'this.': 0.29; 'code': 0.31; 'help,': 0.32; "skip:' 20": 0.32; 'to:addr:python- list': 0.33; 'likely': 0.33; 'compared': 0.35; 'pm,': 0.35; 'subject:?': 0.35; 'received:org': 0.36; 'characters': 0.36; 'skip:* 60': 0.36; 'skip:{ 10': 0.36; 'should': 0.36; 'skip:p 20': 0.36; 'two': 0.37; 'previous': 0.37; 'rather': 0.37; 'subject:: ': 0.38; 'to:addr:python.org': 0.39; 'skip:" 10': 0.40; 'header:Received:5': 0.40; 'skip:u 10': 0.60; 'back': 0.62; 'skip:n 10': 0.63; 'more': 0.63; 'account': 0.67; 'obvious': 0.71; '(word': 0.84; 'common,': 0.84; 'otten': 0.84; 'type=int,': 0.84; 'words)': 0.84; 'faster.': 0.91 X-Injected-Via-Gmane: http://gmane.org/ To: python-list@python.org From: Peter Otten <__peter__@web.de> Subject: Re: Comparing strings from the back? Date: Wed, 05 Sep 2012 11:48:38 +0200 Organization: None References: Mime-Version: 1.0 Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7Bit X-Gmane-NNTP-Posting-Host: p5084b17a.dip.t-dialin.net User-Agent: KNode/4.7.3 X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 133 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1346838532 news.xs4all.nl 6898 [2001:888:2000:d::a6]:55788 X-Complaints-To: abuse@xs4all.nl Xref: csiph.com comp.lang.python:28475 Chris Angelico wrote: > On Wed, Sep 5, 2012 at 6:29 PM, Peter Otten <__peter__@web.de> wrote: >> comparing every pair in a sample of 1000 8-char words >> taken from '/usr/share/dict/words' >> >> head >> 1: 477222 ************************************************************ >> 2: 18870 ** >> ... tail 1: 386633 ************************************************************ 2: 74966 *********** > Not understanding this. What are the statistics, I measured how many chars they have in common for all combinations of 1000 words taken from /usr/share/dict/words. 477222 pairs have one char in common, 18870 pairs have two chars in common when compared from the *beginning* of the string. 386633 pairs have one char in common, 74966 pairs have two chars in common when compared from the *end* of the string. and what (if it's not obvious from the previous answer) do they prove? They demonstrate that for two words from that particular corpus it is likely that a[::-1] == b[::-1] has to take more (1.34 versus 1.05 on average) characters into account than a == b, i. e. comparing from the back should be slower rather than faster. If that doesn't help, here's the code ;) import random import itertools from contextlib import contextmanager from collections import defaultdict @contextmanager def open_corpus(args): with open(args.name) as instream: words = (line.strip() for line in instream) #words = (word for word in words if not word.endswith("'s")) yield words def pick(items, count): items = list(items) return random.sample(items, count) def count_common(a, b): i = 0 for i, (x, y) in enumerate(zip(a, b), 1): if x != y: break return i def corpus(args): with open_corpus(args) as words: wordlens = (len(line.strip()) for line in words) freq = defaultdict(int) for wordlen in wordlens: freq[wordlen] += 1 show_histogram(freq) def show_histogram(freq): peak = max(freq.values()) freq_width = len(str(peak)) max_freq = max(freq) len_width = len(str(max_freq)) for i in range(min(freq), max_freq+1): value = freq[i] print("{:{}}: {:{}} {}".format( i, len_width, value, freq_width, "*" * (60 * value // peak))) def compare(args): with open_corpus(args) as words: words = pick( (word for word in words if len(word) == args.word_len), args.sample_size) n = 0 head_common = defaultdict(int) tail_common = defaultdict(int) n_tail = n_head = 0 for n, (a, b) in enumerate(itertools.combinations(words, 2), 1): cc = count_common(a, b) n_head += cc head_common[cc] += 1 ccr = count_common(a[::-1], b[::-1]) n_tail += ccr tail_common[ccr] += 1 print(n, "combinations") print("average common chars (head) {:.3}".format(n_head/n)) print("average common chars (tail) {:.3}".format(n_tail/n)) print("comparing every pair in a " "sample of {sample} {len}-char words\n" "taken from {name!r}".format( sample=args.sample_size, len=args.word_len, name=args.name)) print() print("head") show_histogram(head_common) print() print("tail") show_histogram(tail_common) def main(): import argparse parser = argparse.ArgumentParser() parsers = parser.add_subparsers() pcorpus = parsers.add_parser("corpus") pcorpus.add_argument("name") pcorpus.set_defaults(func=corpus) pcompare = parsers.add_parser("compare") pcompare.add_argument("name") pcompare.add_argument("--sample-size", type=int, default=1000) pcompare.add_argument("--word-len", type=int, default=8) pcompare.set_defaults(func=compare) args = parser.parse_args() args.func(args) if __name__ == "__main__": main()