Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!feeder.erje.net!eu.feeder.erje.net!xlned.com!feeder1.xlned.com!newsfeed.xs4all.nl!newsfeed1a.news.xs4all.nl!xs4all!newsgate.cistron.nl!newsgate.news.xs4all.nl!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.000 X-Spam-Evidence: '*H*': 1.00; '*S*': 0.00; 'python.': 0.02; 'programmer': 0.03; 'output': 0.05; 'root': 0.05; 'tree': 0.05; "'',": 0.07; 'dev': 0.07; 'duplicate': 0.07; 'parser': 0.07; 'suppose': 0.07; 'sys': 0.07; 'ascii': 0.09; 'parameter': 0.09; 'subject:files': 0.09; 'python': 0.11; 'language.': 0.14; "'r'": 0.16; '*args': 0.16; 'file_name': 0.16; 'filesystem': 0.16; 'finds': 0.16; 'hostname': 0.16; 'md5': 0.16; 'optional': 0.16; 'prefer.': 0.16; 'reimport': 0.16; 'renders': 0.16; 'stdout': 0.16; 'subject:program': 0.16; 'track.': 0.16; 'video.': 0.16; '\xc2\xa0i': 0.16; 'size,': 0.16; 'sender:addr:gmail.com': 0.17; 'thanks,': 0.17; 'do.': 0.18; 'file,': 0.19; 'import': 0.22; 'print': 0.22; 'error': 0.23; 'specifies': 0.24; 'file.': 0.24; 'skip:" 20': 0.27; 'host': 0.29; 'mode': 0.30; 'sets': 0.30; 'message-id:@mail.gmail.com': 0.30; "i'm": 0.30; 'program,': 0.31; 'code': 0.31; 'comments': 0.31; 'easier': 0.31; 'too.': 0.31; 'own,': 0.31; 'perl': 0.31; 'sep': 0.31; 'file': 0.32; 'regular': 0.32; 'another': 0.32; 'text': 0.33; 'checking': 0.33; 'mac': 0.33; 'skip:# 10': 0.33; 'skip:& 30': 0.33; "i'd": 0.34; 'except': 0.35; 'but': 0.35; 'received:google.com': 0.35; 'there': 0.35; 'culture': 0.36; 'processed': 0.36; 'skip:f 40': 0.36; 'subject:data': 0.36; 'done': 0.36; 'doing': 0.36; 'skip:& 10': 0.38; 'skip:[ 10': 0.38; 'to:addr:python-list': 0.38; 'files': 0.38; 'skip:- 10': 0.38; 'rather': 0.38; 'does': 0.39; 'help,': 0.39; 'to:addr:python.org': 0.39; 'skip:p 20': 0.39; 'read': 0.60; 'most': 0.60; 'free': 0.61; 'skip:* 40': 0.61; 'skip:* 10': 0.61; 'show': 0.63; 'information': 0.63; 'our': 0.64; '8bit%:10': 
0.64; 'different': 0.65; 'size.': 0.65; 'world': 0.66; 'header:Reply- To:1': 0.67; 'reverse': 0.68; 'below.': 0.71; 'reply-to:no real name:2**0': 0.71; 'skip:a 40': 0.72; 'stat': 0.84; '*for': 0.91; 'visitors.': 0.91; '***': 0.95; '<>*': 0.95 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:reply-to:sender:date:message-id:subject:from:to :content-type; bh=vqEdcolvvRF3VrGwhwgl7jGgT4Hcy2bGLdMvGk0ARrc=; b=08Z59jwE9r20hJB/qjh0qBynjimzs4APmzfoMzp1f/vxoNqcCCdWQ/E1XgWFLXEOKR YGMu3yylEG7wb2Y7U7ld0+iPG5DRnOLGmXCk5w/SS2QKyIRRkydNYujynqyDsa9X0qXY IFF7+7Ep0BV7R2Nq6fhHF6BSM0lCMoKY49Iyd7ectM57iD3cvN44OUWyUjbzQzZV4j+7 WBKO5sx8MC/yvzBE/swLuYT2/ditcDZ0wEigOeCjpYjJh87o/lnT8jPVFGp0WNLGN6Og ZEEvIr5AK5nj8mj7ZKHDB3kUbaGzBPJTk/7tV5DFccxc/I3ylwjYAqMYjA9wY+Fyyn6c rUEg== MIME-Version: 1.0 X-Received: by 10.195.11.234 with SMTP id el10mr6842478wjd.95.1411063871092; Thu, 18 Sep 2014 11:11:11 -0700 (PDT) Sender: extasia@gmail.com Date: Thu, 18 Sep 2014 11:11:11 -0700 X-Google-Sender-Auth: f4Vs3C5bhPnXelajdEICHLNjkBM Subject: program to generate data helpful in finding duplicate large files From: David Alban To: python-list@python.org Content-Type: multipart/alternative; boundary=047d7b87375ccabcdf05035ae890 X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list Reply-To: extasia@extasia.org List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 240 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1411063879 news.xs4all.nl 2933 [2001:888:2000:d::a6]:34404 X-Complaints-To: abuse@xs4all.nl Xref: csiph.com comp.lang.python:78031 --047d7b87375ccabcdf05035ae890 Content-Type: text/plain; charset=UTF-8 greetings, i'm a long time perl programmer who is learning python. i'd be interested in any comments you might have on my code below. feel free to respond privately if you prefer. 
i'd like to know if i'm on the right track. the program works, and does what i want it to do. is there a different way a seasoned python programmer would have done things? i would like to learn the culture as well as the language. am i missing anything? i know i'm not doing error checking below. i suppose comments would help, too. i wanted a program to scan a tree and for each regular file, print a line of text to stdout with information about the file. this will be data for another program i want to write which finds sets of duplicate files larger than a parameter size. that is, using output from this program, the sets of files i want to find are on the same filesystem on the same host (obviously, but i include hostname in the data to be sure), and must have the same md5 sum, but different inode numbers. the output of the code below is easier for a human to read when paged through 'less', which on my mac renders the ascii nuls as "^@" in reverse video. thanks, david *usage: dupscan [-h] [--start-directory START_DIRECTORY]* *scan files in a tree and print a line of information about each regular file* *optional arguments:* * -h, --help show this help message and exit* * --start-directory START_DIRECTORY, -d START_DIRECTORY* * specifies the root of the filesystem tree to be* * processed* *#!/usr/bin/python* *import argparse* *import hashlib* *import os* *import re* *import socket* *import sys* *from stat import ** *ascii_nul = chr(0)* * # from: http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python * * # except that i use hexdigest() rather than digest()* *def md5_for_file(f, block_size=2**20):* * md5 = hashlib.md5()* * while True:* * data = f.read(block_size)* * if not data:* * break* * md5.update(data)* * return md5.hexdigest()* *thishost = socket.gethostname()* *parser = argparse.ArgumentParser(description='scan files in a tree and print a line of information about each regular file')* *parser.add_argument('--start-directory', '-d', 
default='.', help='specifies the root of the filesystem tree to be processed')* *args = parser.parse_args()* *start_directory = re.sub( '/+$', '', args.start_directory )* *for directory_path, directory_names, file_names in os.walk( start_directory ):* * for file_name in file_names:* * file_path = "%s/%s" % ( directory_path, file_name )* * lstat_info = os.lstat( file_path )* * mode = lstat_info.st_mode* * if not S_ISREG( mode ) or S_ISLNK( mode ):* * continue* * f = open( file_path, 'r' )* * md5sum = md5_for_file( f )* * dev = lstat_info.st_dev* * ino = lstat_info.st_ino* * nlink = lstat_info.st_nlink* * size = lstat_info.st_size* * sep = ascii_nul* * print "%s%c%s%c%d%c%d%c%d%c%d%c%s" % ( thishost, sep, md5sum, sep, dev, sep, ino, sep, nlink, sep, size, sep, file_path )* *exit( 0 )* -- Our decisions are the most important things in our lives. *** Live in a world of your own, but always welcome visitors. --047d7b87375ccabcdf05035ae890 Content-Type: text/html; charset=UTF-8 Content-Transfer-Encoding: quoted-printable
greetings,

i'm a long time perl programmer who is learning python. i'd be interested in any comments you might have on my code below. feel free to respond privately if you prefer. i'd like to know if i'm on the right track. the program works, and does what i want it to do. is there a different way a seasoned python programmer would have done things? i would like to learn the culture as well as the language. am i missing anything? i know i'm not doing error checking below. i suppose comments would help, too.

i wanted a program to scan a tree and for each regular file, print a line of text to stdout with information about the file. this will be data for another program i want to write which finds sets of duplicate files larger than a parameter size. that is, using output from this program, the sets of files i want to find are on the same filesystem on the same host (obviously, but i include hostname in the data to be sure), and must have the same md5 sum, but different inode numbers.

the output of the code below is easier for a human to read when paged through 'less', which on my mac renders the ascii nuls as "^@" in reverse video.

thanks,
david


usage: dupscan [-h] [--start-directory START_DIRECTORY]

scan files in a tree and print a line of information about each regular file

optional arguments:
  -h, --help            show this help message and exit
  --start-directory START_DIRECTORY, -d START_DIRECTORY
                        specifies the root of the filesystem tree to be
                        processed




#!/usr/bin/python
"""Scan files in a tree and print a line of information about each regular file."""

import argparse
import hashlib
import os
import re
import socket
import sys

# Wildcard import kept as posted: the scan loop calls S_ISREG / S_ISLNK
# unqualified.
from stat import *

# ASCII NUL, used as the field separator in every output record.
ascii_nul = chr(0)

# from: http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python
# except that i use hexdigest() rather than digest()
def md5_for_file(f, block_size=2**20):
    """Return the MD5 hex digest of everything readable from *f*.

    The data is consumed in chunks so that arbitrarily large files can be
    hashed without loading them into memory at once.

    f          -- file-like object; f.read(n) must return bytes, so open
                  real files in binary mode ('rb').  It is read from its
                  current position to end-of-file and is not closed.
    block_size -- number of bytes requested per read (default 1 MiB).

    Returns the digest as a 32-character lowercase hexadecimal string.
    """
    md5 = hashlib.md5()
    while True:
        data = f.read(block_size)
        if not data:
            # An empty read marks end-of-file.
            break
        md5.update(data)
    return md5.hexdigest()
=
def _normalize_start_directory(path):
    """Return *path* without trailing slashes, keeping a bare root as '/'.

    Stripping every trailing slash from '/' would leave the empty string,
    and os.walk('') yields nothing, so the root directory could never be
    scanned.
    """
    stripped = path.rstrip('/')
    if path and not stripped:
        return '/'
    return stripped


def main(argv=None):
    """Print one NUL-separated record for each regular file in a tree.

    Every record is written to stdout as a single line whose fields are,
    in order: host name, MD5 hex digest, device number, inode number,
    hard-link count, size and file path.

    argv -- list of command-line arguments to parse; None makes argparse
            read sys.argv[1:].

    Files that cannot be stat'ed or read are reported on stderr and
    skipped, so one bad file does not abort the whole scan.

    Returns the process exit status (always 0).
    """
    parser = argparse.ArgumentParser(
        description=('scan files in a tree and print a line of information '
                     'about each regular file'))
    parser.add_argument(
        '--start-directory', '-d', default='.',
        help='specifies the root of the filesystem tree to be processed')
    args = parser.parse_args(argv)

    thishost = socket.gethostname()
    start_directory = _normalize_start_directory(args.start_directory)

    for directory_path, _directory_names, file_names in os.walk(start_directory):
        for file_name in file_names:
            # os.path.join avoids the doubled slash that "%s/%s" would
            # produce when the directory is the filesystem root.
            file_path = os.path.join(directory_path, file_name)

            try:
                lstat_info = os.lstat(file_path)

                # lstat() does not follow symlinks, so a symlink is never
                # reported as a regular file; no separate S_ISLNK test is
                # needed.
                if not S_ISREG(lstat_info.st_mode):
                    continue

                # Binary mode: the digest must be computed over raw bytes.
                # The with-block closes the file even if hashing fails.
                with open(file_path, 'rb') as f:
                    md5sum = md5_for_file(f)
            except (IOError, OSError) as error:
                sys.stderr.write("%s: skipped: %s\n" % (file_path, error))
                continue

            fields = (
                thishost,
                md5sum,
                str(lstat_info.st_dev),
                str(lstat_info.st_ino),
                str(lstat_info.st_nlink),
                str(lstat_info.st_size),
                file_path,
            )
            # Single parenthesised argument: valid as both the Python 2
            # print statement and the Python 3 print function.
            print(ascii_nul.join(fields))

    return 0


if __name__ == '__main__':
    sys.exit(main())



--
Our decisions are the most important things in our lives.
***
Live in a world of your own, but always welcome visitors.
--047d7b87375ccabcdf05035ae890--