Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!feeder.erje.net!eu.feeder.erje.net!border3.nntp.ams.giganews.com!backlog3.nntp.ams.giganews.com!border1.nntp.ams.giganews.com!nntp.giganews.com!newsfeed.xs4all.nl!newsfeed3a.news.xs4all.nl!xs4all!newsgate.cistron.nl!newsgate.news.xs4all.nl!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.000 X-Spam-Evidence: '*H*': 1.00; '*S*': 0.00; 'python.': 0.02; 'root': 0.05; 'tree': 0.05; 'args': 0.07; 'binary': 0.07; 'duplicate': 0.07; 'parser': 0.07; 'perl,': 0.07; 'string': 0.09; '"__main__":': 0.09; "'.'": 0.09; '__name__': 0.09; 'ascii': 0.09; 'data:': 0.09; 'differently.': 0.09; 'executed': 0.09; 'exit': 0.09; 'omit': 0.09; 'subject:files': 0.09; 'terminated': 0.09; 'windows,': 0.09; 'python': 0.11; 'def': 0.12; '"\\r\\n"': 0.16; "'':": 0.16; "'rb')": 0.16; 'arguments:': 0.16; 'basename': 0.16; 'character.': 0.16; 'determines': 0.16; 'file).': 0.16; 'file_name': 0.16; 'filesystem': 0.16; 'fyi,': 0.16; 'hashlib': 0.16; 'md5': 0.16; 'optional': 0.16; 'readable': 0.16; 'responses.': 0.16; 'separator,': 0.16; 'set,': 0.16; 'subject:program': 0.16; 'true:': 0.16; '\xc2\xa0i': 0.16; '\xc2\xa0if': 0.16; 'folks': 0.16; 'sender:addr:gmail.com': 0.17; 'wrote:': 0.18; 'normally': 0.19; 'skip:p 40': 0.19; 'thu,': 0.19; 'platforms': 0.22; 'example': 0.22; 'import': 0.22; 'email addr:gmail.com>': 0.22; 'separate': 0.22; 'shell': 0.22; 'print': 0.22; 'either.': 0.24; 'script.': 0.24; 'specify': 0.24; 'stick': 0.24; 'helpful': 0.24; "i've": 0.25; '>': 0.26; 'possibly': 0.26; '----------': 0.26; 'code:': 0.26; 'appear': 0.29; 'chris': 0.29; 'am,': 0.29; 'mode': 0.30; 'statement': 0.30; 'message-id:@mail.gmail.com': 0.30; "i'm": 0.30; 'code': 0.31; 'usually': 0.31; 'apparently': 0.31; 'block,': 0.31; 'changes:': 0.31; 'directory,': 0.31; 'node': 0.31; 'own,': 0.31; 'perl': 0.31; 'question:': 0.31; 'sep': 0.31; 'universal': 0.31; 'file': 0.32; 
'probably': 0.32; 'option': 0.32; 'regular': 0.32; 'quite': 0.32; 'text': 0.33; 'fri,': 0.33; 'plain': 0.33; 'skip:# 10': 0.33; 'style': 0.33; 'date:': 0.34; 'except': 0.35; 'convert': 0.35; 'one,': 0.35; 'but': 0.35; 'received:google.com': 0.35; 'there': 0.35; 'subject:data': 0.36; 'doing': 0.36; 'thanks': 0.36; 'should': 0.36; 'error.': 0.37; 'example,': 0.37; 'email addr:python.org': 0.37; 'starting': 0.37; 'skip:& 10': 0.38; 'skip:[ 10': 0.38; 'to:addr:python-list': 0.38; 'files': 0.38; 'skip:- 10': 0.38; 'pm,': 0.38; 'rather': 0.38; 'short': 0.38; 'forwarded': 0.39; 'skip:& 20': 0.39; 'subject:': 0.39; 'to:addr:python.org': 0.39; 'skip:p 20': 0.39; 'most': 0.60; 'break': 0.61; 'skip:a 30': 0.61; 'subject:Fwd': 0.61; "you'll": 0.62; 'show': 0.63; 'information': 0.63; 're:': 0.63; 'field': 0.63; 'our': 0.64; '8bit%:10': 0.64; 'email name:python-list': 0.65; 'world': 0.66; 'here': 0.66; 'header:Reply-To:1': 0.67; 'records.': 0.68; 'default': 0.69; 'reply-to:no real name:2**0': 0.71; 'therefore': 0.72; 'skip:a 40': 0.72; '11:45': 0.84; 'otten': 0.84; 'stat': 0.84; 'visitors.': 0.91; '***': 0.95 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:reply-to:sender:date:message-id:subject:from:to :content-type; bh=ng7TOHIwbx8fn3VCB+v/hzJQYmhWIT/3i+WGyl3mt/k=; b=yyeU2wD+U2pSHyI1RdyTEyYqFVu8HGahJ0f5oaJXUOq+fJHKsJ9h0qmdmPl44CBnc4 JSxfLhSKXb0BY0WClumxRfdCs9DW6tP7DJZjWOsztq8tBFNAHflJ7cR+lbQQ9jH7uGyl TbkQx47dq2naKBpWCOHLm6IQWyLczuYuZwz6qU1oM5LpyQp8em4850ES0TrVWxmwo2A0 El3wjFjdrxnjFe3xBr+kvTbKI+gqFqLZsPNhHZRRKTF7sfv9p8Se7a8B8dKoH35GkgPC rD4gy3MVuGChbRcg+j2qnNmmkgU+XGYNCxb/XwbOXG8Sv1P03Srenk3touvOVnHq0qBe UZHA== MIME-Version: 1.0 X-Received: by 10.195.11.234 with SMTP id el10mr1060806wjd.95.1411133522443; Fri, 19 Sep 2014 06:32:02 -0700 (PDT) Sender: extasia@gmail.com Date: Fri, 19 Sep 2014 06:32:02 -0700 X-Google-Sender-Auth: qlppNiod5Kd0x4ZCsJjsBLqjke8 Subject: Fwd: program to generate data helpful in finding duplicate large files 
From: David Alban To: python-list@python.org Content-Type: multipart/alternative; boundary=047d7b87375c56023705036b2095 X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list Reply-To: extasia@extasia.org List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 264 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1411133530 news.xs4all.nl 2918 [2001:888:2000:d::a6]:52478 X-Complaints-To: abuse@xs4all.nl X-Original-Bytes: 17520 Xref: csiph.com comp.lang.python:78078 --047d7b87375c56023705036b2095 Content-Type: text/plain; charset=UTF-8 here is my reworked code in a plain text email. ---------- Forwarded message ---------- From: Date: Thu, Sep 18, 2014 at 3:58 PM Subject: Re: program to generate data helpful in finding duplicate large files To: python-list@python.org thanks for the responses. i'm having quite a good time learning python. On Thu, Sep 18, 2014 at 11:45 AM, Chris Kaynor wrote: > > Additionally, you may want to specify binary mode by using open(file_path, 'rb') to ensure platform-independence ('r' uses Universal newlines, which means on Windows, Python will convert "\r\n" to "\n" while reading the file). Additionally, some platforms will treat binary files differently. would it be good to use 'rb' all the time? On Thu, Sep 18, 2014 at 11:48 AM, Chris Angelico wrote: > > On Fri, Sep 19, 2014 at 4:11 AM, David Alban wrote: > > exit( 0 ) > > Unnecessary - if you omit this, you'll exit 0 implicitly at the end of > the script. aha. i've been doing this for years even with perl, and apparently it's not necessary in perl either. i was influenced by shell. this shell code: if [[ -n $report_mode ]] ; then do_report fi exit 0 is an example of why you want the last normally executed shell statement to be "exit 0". 
if you omit the exit statement it in this example, and $report_mode is not set, your shell program will give a non-zero return code and appear to have terminated with an error. in shell the last expression evaluated determines the return code to the os. ok, i don't need to do this in python. On Thu, Sep 18, 2014 at 1:23 PM, Peter Otten <__peter__@web.de> wrote: > > file_path may contain newlines, therefore you should probably use "\0" to > separate the records. i chose to stick with ascii nul as the default field separator, but i added a --field-separator option in case someone wants human readable output. style question: if there is only one, possibly short statement in a block, do folks usually move it up to the line starting the block? if not S_ISREG( mode ) or S_ISLNK( mode ): return vs. if not S_ISREG( mode ) or S_ISLNK( mode ): return or even: with open( file_path, 'rb' ) as f: md5sum = md5_for_file( file_path ) fyi, here are my changes: usage: dupscan [-h] [--start-directory START_DIRECTORY] [--field-separator FIELD_SEPARATOR] scan files in a tree and print a line of information about each regular file optional arguments: -h, --help show this help message and exit --start-directory START_DIRECTORY, -d START_DIRECTORY Specify the root of the filesystem tree to be processed. The default is '.' --field-separator FIELD_SEPARATOR, -s FIELD_SEPARATOR Specify the string to use as a field separator in output. The default is the ascii nul character. 
#!/usr/bin/python import argparse import hashlib import os from platform import node from stat import S_ISREG, S_ISLNK ASCII_NUL = chr(0) # from: http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python # except that i use hexdigest() rather than digest() def md5_for_file( path, block_size=2**20 ): md5 = hashlib.md5() with open( path, 'rb' ) as f: while True: data = f.read(block_size) if not data: break md5.update(data) return md5.hexdigest() def file_info( directory, basename, field_separator=ASCII_NUL ): file_path = os.path.join( directory, basename ) st = os.lstat( file_path ) mode = st.st_mode if not S_ISREG( mode ) or S_ISLNK( mode ): return with open( file_path, 'rb' ) as f: md5sum = md5_for_file( file_path ) return field_separator.join( [ thishost, md5sum, str( st.st_dev ), str( st.st_ino ), str( st.st_nlink ), str( st.st_size ), file_path ] ) if __name__ == "__main__": parser = argparse.ArgumentParser(description='scan files in a tree and print a line of information about each regular file') parser.add_argument('--start-directory', '-d', default='.', help='''Specify the root of the filesystem tree to be processed. The default is '.' ''') parser.add_argument('--field-separator', '-s', default=ASCII_NUL, help='Specify the string to use as a field separator in output. The default is the ascii nul character.') args = parser.parse_args() start_directory = args.start_directory.rstrip('/') field_separator = args.field_separator thishost = node() if thishost == '': thishost='[UNKNOWN]' for directory_path, directory_names, file_names in os.walk( start_directory ): for file_name in file_names: print file_info( directory_path, file_name, field_separator ) -- Our decisions are the most important things in our lives. *** Live in a world of your own, but always welcome visitors. --047d7b87375c56023705036b2095 Content-Type: text/html; charset=UTF-8 Content-Transfer-Encoding: quoted-printable
here is my reworked code in a plain text email.
=
---------- Forwarded message ----------
From: <bizcor@gmail.com>
Date: Thu, Sep 18, 2014 at 3:58 PM
Subject: Re: program to generate data helpful in finding duplicate large files
To: python-list@python.org


thanks for the responses.   i'm having quite a good time learning python.

On Thu, Sep 18, 2014 at 11:45 AM, Chris Kaynor <ckaynor@zindagigames.com> wrote:
>
> Additionally, you may want to specify binary mode by using open(file_path, 'rb') to ensure platform-independence ('r' uses Universal newlines, which means on Windows, Python will convert "\r\n" to "\n" while reading the file). Additionally, some platforms will treat binary files differently.
<= br>
would it be good to use 'rb' all the time?

On Thu, Sep 18, 2014 at 11:48 AM, Chris Angelico <rosuav@gmail.com> wrote:
>
> On Fri, Sep 19, 2014 at 4:11 AM, David Alban <extasia@extasia.org> wrote:
> > exit( 0 )
>
> Unnecessary - if you omit this, you'll exit 0 implicitly at the end of
> the script.


aha.  i've been doing this for years even with perl, and apparently it's not necessary in perl either.  i was influenced by shell.

this shell code:

     if [[ -n $report_mode ]] ; then
        do_report
     fi

     exit 0

is an example of why you want the last normally executed shell statement to be "exit 0".  if you omit the exit statement in this example, and $report_mode is not set, your shell program will give a non-zero return code and appear to have terminated with an error.  in shell the last expression evaluated determines the return code to the os.

ok, i don't need to do this in python.

On Thu, Sep 18, 2014 at 1:23 PM, Peter Otten <__peter__@web.de> wrote:
>
> file_path may contain newlines, therefore you should probably use "\0" to
> separate the records.


i chose to stick with ascii nul as the default field separator, but i added a --field-separator option in case someone wants human readable output.

style question:  if there is only one, possibly short statement in a block, do folks usually move it up to the line starting the block?

  if not S_ISREG( mode ) or S_ISLNK( mode ):
    return

vs.

  if not S_ISREG( mode ) or S_ISLNK( mode ): return

or even:

  with open( file_path, 'rb' ) as f: md5sum = md5_for_file( file_path )



fyi, here are my changes:
=

usage: dupscan [-h] [--start-directory START_DIRECTORY]
               [--field-separator FIELD_SEPARATOR]

scan files in a tree and print a line of information about each regular file

optional arguments:
  -h, --help            show this help message and exit
  --start-directory START_DIRECTORY, -d START_DIRECTORY
                        Specify the root of the filesystem tree to be
                        processed. The default is '.'
  --field-separator FIELD_SEPARATOR, -s FIELD_SEPARATOR
                        Specify the string to use as a field separator in
                        output. The default is the ascii nul character.



#!/usr/bin/python

# dupscan: walk a directory tree and print one record per regular file
# (host, md5, device, inode, link count, size, path) so that duplicate
# large files can be located by post-processing the output.

import argparse
import hashlib
import os

from platform import node
from stat import S_ISREG, S_ISLNK

ASCII_NUL = chr(0)

# Host name stamped on every record.  It is resolved at import time so that
# file_info() also works when this file is imported instead of being run as
# a script.  node() returns '' when the name cannot be determined; a visible
# placeholder is used in that case.
thishost = node() or '[UNKNOWN]'


# from: http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python
# except that i use hexdigest() rather than digest()
def md5_for_file(path, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and consumed in chunks of
    *block_size* bytes, so the whole file is never held in memory at once.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # read() returns an empty bytes object at end of file, which is the
        # sentinel that stops the iteration.
        for data in iter(lambda: f.read(block_size), b''):
            md5.update(data)
    return md5.hexdigest()


def file_info(directory, basename, field_separator=ASCII_NUL):
    """Return one output record for the file *basename* inside *directory*.

    The record consists of the host name, the file's md5 hex digest,
    st_dev, st_ino, st_nlink, st_size and the joined path, joined with
    *field_separator*.

    Returns None when the path is not a regular file.  os.lstat() is used,
    so a symbolic link is examined itself rather than its target.
    """
    file_path = os.path.join(directory, basename)
    st = os.lstat(file_path)

    mode = st.st_mode
    if not S_ISREG(mode) or S_ISLNK(mode):
        return None

    # md5_for_file() opens the file itself; no extra handle is needed here.
    md5sum = md5_for_file(file_path)

    return field_separator.join([
        thishost,
        md5sum,
        str(st.st_dev),
        str(st.st_ino),
        str(st.st_nlink),
        str(st.st_size),
        file_path,
    ])


def main(argv=None):
    """Parse the command line and print one record per regular file.

    *argv* is the list of command-line arguments; None makes argparse read
    sys.argv.
    """
    parser = argparse.ArgumentParser(description='scan files in a tree and print a line of information about each regular file')
    parser.add_argument('--start-directory', '-d', default='.', help='''Specify the root of the filesystem tree to be processed.  The default is '.' ''')
    parser.add_argument('--field-separator', '-s', default=ASCII_NUL, help='Specify the string to use as a field separator in output.  The default is the ascii nul character.')
    args = parser.parse_args(argv)

    start_directory = args.start_directory.rstrip('/')
    if args.start_directory and not start_directory:
        # The argument consisted only of slashes (e.g. '/'); stripping them
        # all would make os.walk() visit nothing, so keep the root itself.
        start_directory = '/'
    field_separator = args.field_separator

    for directory_path, directory_names, file_names in os.walk(start_directory):
        for file_name in file_names:
            record = file_info(directory_path, file_name, field_separator)
            # file_info() returns None for entries that are not regular
            # files; skip them instead of printing the text "None".
            if record is not None:
                print(record)


if __name__ == "__main__":
    main()

--
Our decisions are the most important things in our lives.
***
Live in a world of your own, but always welcome visitors.


--047d7b87375c56023705036b2095--