Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!feeder.erje.net!eu.feeder.erje.net!xlned.com!feeder7.xlned.com!newsfeed.xs4all.nl!newsfeed1.news.xs4all.nl!xs4all!newsgate.cistron.nl!newsgate.news.xs4all.nl!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.008 X-Spam-Evidence: '*H*': 0.98; '*S*': 0.00; 'args': 0.07; 'parser': 0.07; 'sys': 0.07; '__name__': 0.09; 'received:209.85.219': 0.09; 'try:': 0.09; 'python': 0.11; "'%b": 0.16; "'__main__':": 0.16; 'err:': 0.16; 'ioerror,': 0.16; 'reimport': 0.16; 'skip:{ 30': 0.16; 'year)': 0.16; 'year,': 0.18; '<': 0.19; '8bit%:5': 0.22; 'import': 0.22; 'to:name:python-list@python.org': 0.22; 'month,': 0.24; 'skip:l 30': 0.24; 'skip:{ 20': 0.24; 'script': 0.25; 'message-id:@mail.gmail.com': 0.30; "skip:' 10": 0.31; '"",': 0.31; 'extract': 0.31; "skip:' 40": 0.31; 'file': 0.32; 'run': 0.32; 'skip:# 10': 0.33; 'skip:d 20': 0.34; 'received:209.85': 0.35; 'except': 0.35; 'received:google.com': 0.35; 'complete.': 0.36; 'possible': 0.36; 'received:209': 0.37; 'feedback': 0.38; 'skip:o 20': 0.38; 'skip:& 10': 0.38; '8bit%:4': 0.38; 'to:addr :python-list': 0.38; 'to:addr:python.org': 0.39; 'unable': 0.39; 'skip:p 20': 0.39; 'read': 0.60; 'skip:o 30': 0.61; 'url:co': 0.67; 'line,': 0.68; '100%': 0.77; 'faster.': 0.84; 'hour,': 0.84; 'subject:skip:o 10': 0.84 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=x-received:mime-version:from:date:message-id:subject:to :content-type; bh=92EMwGDxE4IHmfHO0Kgn8L2Ln9WpvLyy0dvhbav1WgQ=; b=m2sx+ztSFEV7WBDFHd2c3SWjd9WBmCRudz+dhmDnSZ5qlP6xhhY/4DHiZ4gsX6U4Sn 0vBVdIGTdziwlkcqMrUQ2qFJarJ0v0ihHRZtEaUtaZR1RP0T8DPuJnxjyoKG6EqNTBai vFjsVc5gVsvmWm1E0cOWZhNSkLHpkSKrIcozqm5f2G4keL7kKyyq+0I2NXXyvwoO4zeC BKz6vN5ilbXgp8fU3OAeZQ13lZiuvz9W5J16cWjgup7qzwdkLLsvcYl395OIjWd5t+is IXrQw2TXrM1q2jeqhFdaTR/8BeAX/UxRX3P8AcR1p7AVbmg6GC8ltj4RUaIcbbVhZ3Rk Y01w== X-Received: by 10.60.17.105 with SMTP 
id n9mr16210477oed.64.1366679993395; Mon, 22 Apr 2013 18:19:53 -0700 (PDT) MIME-Version: 1.0 From: Rodrick Brown Date: Mon, 22 Apr 2013 21:19:23 -0400 Subject: optomizations To: "python-list@python.org" Content-Type: multipart/alternative; boundary=089e013c682c872fbc04dafcfbb4 X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 135 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1366680414 news.xs4all.nl 2181 [2001:888:2000:d::a6]:47093 X-Complaints-To: abuse@xs4all.nl Xref: csiph.com comp.lang.python:44130 --089e013c682c872fbc04dafcfbb4 Content-Type: text/plain; charset=ISO-8859-1 I would like some feedback on possible solutions to make this script run faster. The system is pegged at 100% CPU and it takes a long time to complete. #!/usr/bin/env python import gzip import re import os import sys from datetime import datetime import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-f', dest='inputfile', type=str, help='data file to parse') parser.add_argument('-o', dest='outputdir', type=str, default=os.getcwd(), help='Output directory') args = parser.parse_args() if len(sys.argv[1:]) < 1: parser.print_usage() sys.exit(-1) print(args) if args.inputfile and os.path.exists(args.inputfile): try: with gzip.open(args.inputfile) as datafile: for line in datafile: line = line.replace('mediacdn.xxx.com', 'media.xxx.com') line = line.replace('staticcdn.xxx.co.uk', ' static.xxx.co.uk') line = line.replace('cdn.xxx', 'www.xxx') line = line.replace('cdn.xxx', 'www.xxx') line = line.replace('cdn.xx', 'www.xx') siteurl = line.split()[6].split('/')[2] line = re.sub(r'\bhttps?://%s\b' % siteurl, "", line, 1) (day, month, year, hour, minute, second) = (line.split()[3]).replace('[','').replace(':','/').split('/') datelog = '{} 
{} {}'.format(month, day, year) dateobj = datetime.strptime(datelog, '%b %d %Y') outfile = '{}{}{}_combined.log'.format(dateobj.year, dateobj.month, dateobj.day) outdir = (args.outputdir + os.sep + siteurl) if not os.path.exists(outdir): os.makedirs(outdir) with open(outdir + os.sep + outfile, 'w+') as outf: outf.write(line) except IOError, err: sys.stderr.write("Error unable to read or extract inputfile: {} {}\n".format(args.inputfile, err)) sys.exit(-1) --089e013c682c872fbc04dafcfbb4 Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable
I would like some feedback on possible solutions to make this script run faster.
The system is pegged at 100% CPU and it takes a long time to complete.

#!/usr/bin/env python3
"""Split a gzipped web access log into per-site, per-day log files.

Every input line has its CDN host names rewritten to the canonical ones and
the ``http(s)://<site>`` prefix stripped from the request URL; it is then
written to ``<outputdir>/<site>/<YYYYMMDD>_combined.log``.

Lines are handled as bytes from start to finish, so log content is copied
verbatim whatever its encoding.  Requires Python 3.
"""

import argparse
import gzip
import os
import re
import sys
from datetime import datetime
from functools import lru_cache

# Host-name rewrites applied to every line, in this order.  The order matters:
# the specific "mediacdn"/"staticcdn" names must be handled before the generic
# "cdn." prefixes, which would otherwise match inside them.
_HOST_REWRITES = (
    (b'mediacdn.xxx.com', b'media.xxx.com'),
    (b'staticcdn.xxx.co.uk', b'static.xxx.co.uk'),
    (b'cdn.xxx', b'www.xxx'),
    (b'cdn.xx', b'www.xx'),
)

# Upper bound on simultaneously open output files, kept well below the usual
# per-process file-descriptor limit.
_MAX_OPEN_FILES = 128


def rewrite_hosts(line):
    """Return *line* (bytes) with every CDN host name replaced."""
    for old, new in _HOST_REWRITES:
        line = line.replace(old, new)
    return line


@lru_cache(maxsize=1024)
def site_prefix_pattern(site):
    """Return a compiled pattern matching ``http(s)://<site>``.

    *site* comes straight from the log, so it is escaped: its dots must match
    literally and stray metacharacters must not break the pattern.
    """
    return re.compile(br'\bhttps?://' + re.escape(site) + br'\b')


@lru_cache(maxsize=1024)
def date_stamp(day, month, year):
    """Return ``YYYYMMDD`` for a day / abbreviated-month / year triple.

    ``strptime`` is expensive and a log only covers a handful of distinct
    days, so results are memoised.  The stamp is zero-padded; without padding
    11 Jan and 1 Nov of the same year would share one file name.

    Raises ValueError (including UnicodeDecodeError) when the triple is not a
    valid date.
    """
    text = '{} {} {}'.format(
        month.decode('ascii'), day.decode('ascii'), year.decode('ascii'))
    parsed = datetime.strptime(text, '%b %d %Y')
    return '{:04d}{:02d}{:02d}'.format(parsed.year, parsed.month, parsed.day)


def parse_line(line):
    """Rewrite one log line and work out where it belongs.

    *line* is a raw bytes line.  The site is the third ``/``-separated piece
    of the seventh whitespace-separated field (the host of an absolute request
    URL), and the date comes from the fourth field (``[dd/Mon/yyyy:hh:mm:ss``).

    Returns ``(site, outfile, rewritten_line)``, where *site* and *outfile*
    are ``str`` path components and *rewritten_line* is bytes, or ``None``
    when the line does not have the expected shape.
    """
    line = rewrite_hosts(line)
    # Only the first seven fields are needed; do not split the long tail.
    fields = line.split(None, 7)
    try:
        site = fields[6].split(b'/')[2]
        day, month, year, _hour, _minute, _second = (
            fields[3].replace(b'[', b'').replace(b':', b'/').split(b'/'))
        stamp = date_stamp(day, month, year)
    except (IndexError, ValueError):
        return None
    # The site becomes a directory name: refuse values that would escape the
    # output directory or that cannot be used as a path component.
    if site in (b'', b'.', b'..') or b'\0' in site:
        return None
    line = site_prefix_pattern(site).sub(b'', line, 1)
    return os.fsdecode(site), stamp + '_combined.log', line


class _OutputFiles(object):
    """Keep output files open between lines instead of reopening per line.

    A file is truncated the first time it is touched in a run and appended to
    afterwards, so every line of the input ends up in its file.
    """

    def __init__(self, max_open=_MAX_OPEN_FILES):
        self._open = {}        # path -> open binary file object
        self._started = set()  # paths already created during this run
        self._max_open = max_open

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def write(self, outdir, outfile, line):
        """Append *line* (bytes) to ``outdir/outfile``, creating it if needed."""
        path = os.path.join(outdir, outfile)
        handle = self._open.get(path)
        if handle is None:
            if len(self._open) >= self._max_open:
                # Stay under the descriptor limit; files reopen in append mode.
                self.close()
            if path in self._started:
                mode = 'ab'
            else:
                os.makedirs(outdir, exist_ok=True)
                self._started.add(path)
                mode = 'wb'
            handle = self._open[path] = open(path, mode)
        handle.write(line)

    def close(self):
        """Close every file that is currently open."""
        handles, self._open = self._open, {}
        for handle in handles.values():
            handle.close()


def split_log(inputfile, outputdir):
    """Split the gzipped log *inputfile* into files below *outputdir*.

    Returns ``(written, skipped)``: the number of lines written and the number
    of malformed lines that were left out.

    Raises OSError if the input cannot be read or an output cannot be written,
    and EOFError if the gzip stream is truncated.
    """
    written = skipped = 0
    with gzip.open(inputfile, 'rb') as datafile, _OutputFiles() as outputs:
        for raw in datafile:
            parsed = parse_line(raw)
            if parsed is None:
                skipped += 1
                continue
            site, outfile, line = parsed
            outputs.write(os.path.join(outputdir, site), outfile, line)
            written += 1
    return written, skipped


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv[1:]``."""
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', dest='inputfile', type=str,
                        help='data file to parse')
    parser.add_argument('-o', dest='outputdir', type=str,
                        default=os.getcwd(), help='Output directory')
    args = parser.parse_args(argv)

    # Nothing can be done without an input file; say so instead of exiting
    # silently with a success status.
    if not argv or not args.inputfile:
        parser.print_usage()
        sys.exit(-1)

    print(args)
    try:
        _written, skipped = split_log(args.inputfile, args.outputdir)
    except (OSError, EOFError) as err:
        sys.stderr.write(
            "Error unable to read or extract inputfile: {} {}\n".format(
                args.inputfile, err))
        sys.exit(-1)

    if skipped:
        sys.stderr.write("Skipped {} malformed line(s)\n".format(skipped))


if __name__ == '__main__':
    main()

--089e013c682c872fbc04dafcfbb4--