Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!news.glorb.com!news-out.readnews.com!transit4.readnews.com!news-out.news.tds.net!newsreading01.news.tds.net!53ab2750!not-for-mail
From: "qwertmonkey" <qwertmonkey@1:261/38.remove-dpk-this>
Subject: How can you make idle processors pick up java work?
Message-ID: <50181D65.55759.calajapr@time.synchro.net>
X-Comment-To: All
Newsgroups: comp.lang.java.programmer
X-FTN-AREA: COMP.LANG.JAVA.PROGRAMMER
X-FTN-MSGID: 1:261/38 5893f981
Content-Type: text/plain; charset=IBM437
Content-Transfer-Encoding: 8bit
X-Gateway: time.synchro.net [Synchronet 3.16a-Win32 NewsLink 1.98]
Lines: 138
Date: Tue, 31 Jul 2012 18:02:38 GMT
NNTP-Posting-Host: 69.21.70.65
X-Complaints-To: news@tds.net
X-Trace: newsreading01.news.tds.net 1343757758 69.21.70.65 (Tue, 31 Jul 2012 13:02:38 CDT)
NNTP-Posting-Date: Tue, 31 Jul 2012 13:02:38 CDT
Organization: tds.net
Xref: csiph.com comp.lang.java.programmer:16791

From: qwertmonkey@syberianoutpost.ru

~
> How slow is the NL processing?
~
> Does it make any sense to read lines in one thread and pass each off
to one of the iPrx-1 other threads that might run on separate processors? ~
 I don't think this would make sense. All sentences are short, and all I
need to do is basically scan them and use look-up tables to do some tinkering
with the code points. The scheduling of threads and the constant context
switching would most probably make things slower. ~
 OK, this is the piece of code I am trying to optimize, and these are the
results I get using a large enough file of sentences: ~
 http://corpora.informatik.uni-leipzig.de/download.html
~
 http://corpora.uni-leipzig.de/downloads/deu_news_2008_10M-text.tar.gz
~
 inside of the tar ball there is a file with just sentences:
~
$ ls -l deu_news_2008_10M-sentences.txt
-rw-r--r-- 1 knoppix knoppix 1235804164 May 28  2011
 deu_news_2008_10M-sentences.txt

$ md5sum -b deu_news_2008_10M-sentences.txt
23041587b6414d1a1a56c9c389d3c18f *deu_news_2008_10M-sentences.txt

$ wc -l deu_news_2008_10M-sentences.txt
10000000 deu_news_2008_10M-sentences.txt ~
 Again, do you know of any faster way to go about reading the sentences of
such large files and getting their code points?
 lbrtchx
~
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.charset.Charset;

import java.io.BufferedReader;
import java.io.IOException;

// __
public class NIO2_newBufferedReader02Test{
 private static final String aNWLn = System.getProperty("line.separator");
// __
 public static void main(String[] aArgs){

  if((aArgs != null) && (aArgs.length == 1)){
   long lTm00 = System.currentTimeMillis();
   long lLns = 0;
   int iTtlRdKdPnts = 0;
   BufferedReader BfR = null;
   Path IFlPth = FileSystems.getDefault().getPath(aArgs[0]);
   long lIFlL = IFlPth.toFile().length();
   int iKdPnt, iSxL;

   StringBuilder aBldr = new StringBuilder(1024);
// __
   try{
    BfR = Files.newBufferedReader(IFlPth, Charset.forName("UTF-8"));
    String aSx = BfR.readLine();
    while(aSx != null){
     iSxL = aSx.length();
     if(iSxL > 0){
      for(int i = 0; (i < iSxL); ++i){
       iKdPnt = aSx.codePointAt(i); ++iTtlRdKdPnts;
       aBldr.appendCodePoint(iKdPnt);
      }
// __
      aBldr.delete(0, aBldr.length());
     }// (iSxL > 0)
     ++lLns;
     aSx = BfR.readLine();
    }// (aSx != null)

    BfR.close();
// __
    System.err.println("// __ reading |" + lIFlL  + "|  bytes long text file
with |" +  lLns + "| lines took |" + (System.currentTimeMillis() - lTm00) + "| 
(ms)");
    System.err.println("// __ iTtlRdKdPnts: |" + iTtlRdKdPnts + "|");
   }catch(IOException IOX) { IOX.printStackTrace(System.err); }
  }
  else{ System.err.println("// __ usage:" + aNWLn + aNWLn +
" java NIO2_newBufferedReader02Test \"<text file>\"" + aNWLn); }
 }
}

~
$ java -version
java version "1.7.0_02"
Java(TM) SE Runtime Environment (build 1.7.0_02-b13) Java HotSpot(TM) Server VM 
(build 22.0-b10, mixed mode) ~
$ free
             total       used       free     shared    buffers     cached
Mem:       4051236     719224    3332012          0      22008     408260
-/+ buffers/cache:     288956    3762280
Swap:      3038424          0    3038424
~
$ javac -encoding utf8 NIO2_newBufferedReader02Test.java
~
$ date; java -Xms256m -Xmx1024m -Xincgc -Dfile.encoding=utf8
NIO2_newBufferedReader02Test /media/sdb1/tmp/eng_news_2006_10M-sentences.txt; 
date;
~
Tue Jul 31 02:05:04 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |41922| (ms)
Tue Jul 31 02:05:46 UTC 2012
~
Tue Jul 31 02:05:51 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |27299| (ms)
Tue Jul 31 02:06:19 UTC 2012
~
Tue Jul 31 02:06:22 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |28180| (ms)
Tue Jul 31 02:06:50 UTC 2012
~
Tue Jul 31 02:26:43 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |35388| (ms)
Tue Jul 31 02:27:18 UTC 2012
~
Tue Jul 31 02:27:21 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |38155| (ms)
Tue Jul 31 02:28:00 UTC 2012
~
Tue Jul 31 02:30:40 UTC 2012
// __ reading |1280939143|  bytes long text file with |10000000| lines took
 |41099| (ms)
Tue Jul 31 02:31:21 UTC 2012

--- BBBS/Li6 v4.10 Dada-1
 * Origin: Prism bbs (1:261/38)
--- Synchronet 3.16a-Win32 NewsLink 1.98
Time Warp of the Future BBS - telnet://time.synchro.net:24