Groups > comp.lang.python > #85879 > unrolled thread

urgent help

Started by	ismahameed@gcuf.edu.pk
First post	2015-02-19 00:35 -0800
Last post	2015-02-19 10:49 +0000
Articles	7 — 4 participants

Back to article view | Back to comp.lang.python

  urgent help ismahameed@gcuf.edu.pk - 2015-02-19 00:35 -0800
    Re: urgent help Dave Angel <davea@davea.name> - 2015-02-19 04:00 -0500
      Re: urgent help Denis McMahon <denismfmcmahon@gmail.com> - 2015-02-19 14:08 +0000
    Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:31 -0800
      Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:46 -0800
        Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:48 -0800
          Re: urgent help Mark Lawrence <breamoreboy@yahoo.co.uk> - 2015-02-19 10:49 +0000

#85879 — urgent help

From	ismahameed@gcuf.edu.pk
Date	2015-02-19 00:35 -0800
Subject	urgent help
Message-ID	<8fa27443-12f7-4ef6-ba6b-4af16abae29d@googlegroups.com>

This is the error I get from the following Python code; can anyone help me?
error{Traceback (most recent call last):
  File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
    from BeautifulSoup import BeautifulSoup
ImportError: No module named BeautifulSoup} 



"#encoding=utf8
from codecs import open
from collections import defaultdict
import re

from BeautifulSoup import BeautifulSoup
import mechanize
import cookielib
import html2text
import time


def getbr():
    """Create a fresh mechanize browser configured for scraping.

    The browser gets its own LWP cookie jar, has the equiv/gzip/redirect/
    referer handlers switched on, ignores robots.txt, and sends a desktop
    Firefox User-agent header.

    Returns the configured ``mechanize.Browser`` instance.
    """
    browser = mechanize.Browser()

    # Give every browser its own cookie jar.
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # Handler switches; robots.txt handling is deliberately turned off.
    browser.set_handle_equiv(True)
    browser.set_handle_gzip(True)
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)

    # Follow "refresh 0" but do not hang on refresh > 0.
    refresh_handler = mechanize._http.HTTPRefreshProcessor()
    browser.set_handle_refresh(refresh_handler, max_time=1)

    # Pretend to be a regular desktop browser.
    user_agent = ('Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                  'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
    browser.addheaders = [('User-agent', user_agent)]
    return browser

def logthis(text):
    """Append ``text`` followed by a newline to ``log.txt`` (UTF-8 encoded).

    ``open`` here is ``codecs.open`` (imported at the top of the file), so
    the third positional argument is the encoding. The file is opened in a
    ``with`` block so the handle is flushed and closed on every call instead
    of being left for the garbage collector.
    """
    with open("log.txt", "a", "utf8") as log_file:
        log_file.write(text + "\n")

def getCommunity(community,url,out=""):
    # Browser
    
    # The site we will navigate into, handling it's session
    i = 1
    
    flag = True
    discussions = []
    baseDiscussion = []
    
    while flag:
        print i
        currurl = url+"/"+str(i)
        try:
            br = getbr()
            br.open(currurl)
            #br.follow_link(text='link')
            html = br.response().read()
            soup = BeautifulSoup(html)
            if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
                print "done at ",i,community
                logthis("done at "+str(i)+" "+community)
                return True
            hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
            print currurl
            #print hrefList
            for link in hrefList:
                #print str(link)
                #continue
                span = link.find('div',{"class":"MsgUsr"})
                
                if "frm_mngr" in str(span):
                    mgr = span.find("span",{"class":"frm_mngr"}).string
                    if not "''" in mgr:
                        continue
                    mgr = mgr.replace("'","")
                    date =  link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
                    #out.write(community+"\t"+mgr+"\t"+date+"\n")
                    print community.rstrip(),date,mgr
                    #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
                    ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
                    print "bla"
                    ans = fixHtml2(str(ansDiv))
                    print "bla"
                    print ans
                    #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
                    #fout.close()
                    questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
                    print "bla",questionDiv
                    quesiton = fixHtml2(str(questionDiv))
                    print question
                span = None
                
                
            
            soup = None
            br = None
        except:
            
            time.sleep(60)
        i+=1
    return list(set(discussions))
    
def fixHtml(page):
    """Turn an HTML fragment into rough plain text.

    Paragraph ends and line breaks become newlines, ``&quot;`` becomes an
    apostrophe, and every remaining ``<...>`` tag is dropped: the text is
    cut at each ``<`` and only what follows the last ``>`` of each piece is
    kept, with the pieces joined by single spaces. Finally the sequence
    CR LF TAB TAB TAB is collapsed to one newline.
    """
    for line_break in ("</p>", "</P>", "<br />", "<BR />", "<br>", "<BR>"):
        page = page.replace(line_break, "\n")
    page = page.replace("&quot;", "'")

    # Strip the remaining tags: keep only the text after each tag's ">".
    kept = []
    for piece in page.split("<"):
        kept.append(piece.split(">")[-1])
    stripped = " ".join(kept)

    return stripped.replace("\r\n\t\t\t", "\n")

def fixHtml2(page):
    """Extract the message body from a serialized div and flatten its breaks.

    ``page`` is the string form of a div. The text between the first
    ``ner">`` marker (the end of the opening tag) and the next ``<div`` is
    taken as the body. Paragraph ends and line breaks are then turned into
    newlines and ``&quot;`` into an apostrophe.

    If the marker is missing -- for example when the caller passes
    ``str(None)`` because no matching div was found -- an empty string is
    returned instead of raising IndexError.

    The extracted body is echoed to stdout before conversion, as before.
    """
    sections = page.split('ner">')
    page = sections[1].split("<div")[0] if len(sections) > 1 else ""
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(page)
    for line_break in ("</p>", "</P>", "<br />", "<BR />", "<br>", "<BR>"):
        page = page.replace(line_break, "\n")
    return page.replace("&quot;", "'")
        
def getText(br,url):
    """Fetch an article page and return its title followed by its text.

    ``br`` is a browser object providing ``open`` and ``response``; ``url``
    is the page to load. The title comes from the first child of the
    ``h1.articleName`` element. The text is the cleaned content of the first
    ``div.article`` followed by the cleaned content of the second one
    wrapped in ``<EXPERT>...</EXPERT>``, decoded from UTF-8.
    """
    br.open(url)
    soup = BeautifulSoup(br.response().read())

    heading = soup.find('h1',{'class':"articleName"})
    title = fixHtml(heading.contents[0])

    def article_body(div):
        # Raw markup between the opening tag and the first closing </div>.
        return str(div).split('"article">')[1].split('</div>')[0]

    artics = soup.findAll('div',{'class':"article"})
    body = "\n" + fixHtml(article_body(artics[0]))
    body = body + "\n<EXPERT>" + fixHtml(article_body(artics[1])) + "</EXPERT>"

    return title + body.decode("utf-8")

def getForums(file = "links.htm"):
    """Crawl every community linked from a saved index page.

    ``file`` is the path of an HTML file; each ``<a class="MainList">`` in
    it is treated as one community, and ``getCommunity`` is called with the
    link text and its ``href``.

    The index file is read inside a ``with`` block so its handle is closed
    before the crawl starts.
    """
    with open(file,"r") as links_page:
        soup = BeautifulSoup(links_page.read())
    for comm in soup.findAll("a",{"class":"MainList"}):
        getCommunity(comm.string,comm["href"])
        
# Script entry point: crawl every community linked from links.htm.
getForums()    
#links = getQALinks()
# NOTE(review): the statements below only bind module-level names and are
# never used afterwards; they look like leftover interactive debugging --
# confirm before relying on them.
file = "links.htm"  # NOTE: shadows the Python 2 builtin `file`
soup = BeautifulSoup(open(file,"r").read())
# First community link on the index page.
comm = soup.findAll("a",{"class":"MainList"})[0]
br = getbr()
# Fetch the URL formed by appending "/3" to that community's href.
currurl = comm["href"]+"/3"
br.open(currurl)
html = br.response().read()
soup = BeautifulSoup(html)
# Despite the name, this is a single element: the first MsgTtlChildRow div.
hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
"

[toc] | [next] | [standalone]

#85882

From	Dave Angel <davea@davea.name>
Date	2015-02-19 04:00 -0500
Message-ID	<mailman.18871.1424336461.18130.python-list@python.org>
In reply to	#85879

On 02/19/2015 03:35 AM, ismahameed@gcuf.edu.pk wrote:
> this is the error in the following python code, can any one help me
> error{Traceback (most recent call last):
>    File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
>      from BeautifulSoup import BeautifulSoup
> ImportError: No module named BeautifulSoup}
>
>
>
> "#encoding=utf8
> from codecs import open
> from collections import defaultdict
> import re
>
> from BeautifulSoup import BeautifulSoup

When you can demonstrate a problem in a couple of lines of source code, 
why would you waste our bandwidth showing us dozens of unrelated  lines?

Since the error says there's no module named BeautifulSoup, perhaps 
that's because you haven't installed BeautifulSoup.  it's not in the 
standard library.

I've never used it, but a quick web search found me the page:

http://www.crummy.com/software/BeautifulSoup/bs4/doc/

And that seems to say the module is called bs4.

Anyway, if you did install it, and read the directions, and are still 
stumped, you probably need to supply many other details:

1) what version of Python are you using, and do you have multiple 
versions installed
2) what OS
3) where did you download it from, and what commands did you use to 
actually install it?  How did you specify which Python version it would 
install to?
4) what your import line looks like (which you did specify)
5) and of course, what the exception is (which you did include)

Other things people may need to know include what directory the bs4.pyc 
file is installed to, what your sys.path is, and so on.  But just 
answering the first questions might let you figure it out for yourself.

-- 
DaveA

[toc] | [prev] | [next] | [standalone]

#85902

From	Denis McMahon <denismfmcmahon@gmail.com>
Date	2015-02-19 14:08 +0000
Message-ID	<mc4qp0$cm7$1@dont-email.me>
In reply to	#85882

On Thu, 19 Feb 2015 04:00:50 -0500, Dave Angel wrote:

> On 02/19/2015 03:35 AM, ismahameed@gcuf.edu.pk wrote:
>> this is the error in the following python code, can any one help me
>> error{Traceback (most recent call last):
>>    File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in
>>    <module>
>>      from BeautifulSoup import BeautifulSoup
>> ImportError: No module named BeautifulSoup}
>>
>> "#encoding=utf8 from codecs import open from collections import
>> defaultdict import re
>>
>> from BeautifulSoup import BeautifulSoup

> When you can demonstrate a problem in a couple of lines of source code,
> why would you waste our bandwidth showing us dozens of unrelated  lines?
> 
> Since the error says there's no module named BeautifulSoup, perhaps
> that's because you haven't installed BeautifulSoup.  it's not in the
> standard library.
> 
> I've never used it, but a quick web search found me the page:
> 
> http://www.crummy.com/software/BeautifulSoup/bs4/doc/

  *********************************************************

  *********************************************************

  ****                                                 ****

> **** And that seems to say the module is called bs4. ****

  ****                                                 ****

  *********************************************************

  *********************************************************

It seems that the OP has failed to read your post, the documentation or 
the examples for the code he is using.

As a very strong hint, I have highlighted your fix for his main problem 
above with a few (ok, several) asterisks. Let's see if he can find it now.

If he can't, I don't understand why he bothered to ask for help, because 
I'm pretty sure you nailed the issue right there, and unless he's going 
to read the responses to his post to see the answers that are provided 
it's a bit stupid to post asking for help in the first place.

-- 
Denis McMahon, denismfmcmahon@gmail.com

[toc] | [prev] | [next] | [standalone]

#85884

From	ismahameed@gcuf.edu.pk
Date	2015-02-19 01:31 -0800
Message-ID	<11ec9cc1-0eb4-4981-8a67-2cabf26554f1@googlegroups.com>
In reply to	#85879

On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> this is the error in the following python code, can any one help me
> error{Traceback (most recent call last):
>   File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
>     from BeautifulSoup import BeautifulSoup
> ImportError: No module named BeautifulSoup} 
> 
> 
> 
> "#encoding=utf8
> from codecs import open
> from collections import defaultdict
> import re
> 
> from BeautifulSoup import BeautifulSoup
> import mechanize
> import cookielib
> import html2text
> import time
> 
> 
> def getbr():
>     br = mechanize.Browser()
> 
>     # Cookie Jar
>     cj = cookielib.LWPCookieJar()
>     br.set_cookiejar(cj)
> 
>     # Browser options
>     br.set_handle_equiv(True)
>     br.set_handle_gzip(True)
>     br.set_handle_redirect(True)
>     br.set_handle_referer(True)
>     br.set_handle_robots(False)
> 
>     # Follows refresh 0 but not hangs on refresh > 0
>     br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
> 
>     # User-Agent (this is cheating, ok?)
>     br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
>     return br
> 
> def logthis(text):
>     open("log.txt","a","utf8").write(text+"\n")
> 
> def getCommunity(community,url,out=""):
>     # Browser
>     
>     # The site we will navigate into, handling it's session
>     i = 1
>     
>     flag = True
>     discussions = []
>     baseDiscussion = []
>     
>     while flag:
>         print i
>         currurl = url+"/"+str(i)
>         try:
>             br = getbr()
>             br.open(currurl)
>             #br.follow_link(text='link')
>             html = br.response().read()
>             soup = BeautifulSoup(html)
>             if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
>                 print "done at ",i,community
>                 logthis("done at "+str(i)+" "+community)
>                 return True
>             hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
>             print currurl
>             #print hrefList
>             for link in hrefList:
>                 #print str(link)
>                 #continue
>                 span = link.find('div',{"class":"MsgUsr"})
>                 
>                 if "frm_mngr" in str(span):
>                     mgr = span.find("span",{"class":"frm_mngr"}).string
>                     if not "''" in mgr:
>                         continue
>                     mgr = mgr.replace("'","")
>                     date =  link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
>                     #out.write(community+"\t"+mgr+"\t"+date+"\n")
>                     print community.rstrip(),date,mgr
>                     #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
>                     ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
>                     print "bla"
>                     ans = fixHtml2(str(ansDiv))
>                     print "bla"
>                     print ans
>                     #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
>                     #fout.close()
>                     questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
>                     print "bla",questionDiv
>                     quesiton = fixHtml2(str(questionDiv))
>                     print question
>                 span = None
>                 
>                 
>             
>             soup = None
>             br = None
>         except:
>             
>             time.sleep(60)
>         i+=1
>     return list(set(discussions))
>     
> def fixHtml(page):
>     page = page.replace("</p>","\n")
>     page = page.replace("</P>","\n")
>     page = page.replace("<br />","\n")
>     page = page.replace("<BR />","\n")
>     page = page.replace("<br>","\n")
>     page = page.replace("<BR>","\n")
>     page = page.replace("&quot;","'")
>     reg = re.compile("<")
>     reg2 = re.compile(">")
>     page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
>     page = page.replace("\r\n\t\t\t","\n")
>     return page
> 
> def fixHtml2(page):
>     page = page.split('ner">')[1].split("<div")[0]
>     print page
>     page = page.replace("</p>","\n")
>     page = page.replace("</P>","\n")
>     page = page.replace("<br />","\n")
>     page = page.replace("<BR />","\n")
>     page = page.replace("<br>","\n")
>     page = page.replace("<BR>","\n")
>     page = page.replace("&quot;","'")
>     return page
>         
> def getText(br,url):
>     br.open(url)
>     html = br.response().read()
>     soup = BeautifulSoup(html)
>     title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
>     #print title
>     artics = soup.findAll('div',{'class':"article"})
>     text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
>     text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
>     text = text.decode("utf-8")
>     #text = artics[0] +
>     #print type(title),type(text)
>     
>     return title+text    
> 
> def getForums(file = "links.htm"):
>     #out = open("beokDates","w","utf8")
>     soup = BeautifulSoup(open(file,"r").read())
>     communities = soup.findAll("a",{"class":"MainList"})
>     for comm in communities:
>         #print comm["href"]
>         getCommunity(comm.string,comm["href"])
>         
> getForums()    
> #links = getQALinks()
> file = "links.htm"
> soup = BeautifulSoup(open(file,"r").read())
> comm = soup.findAll("a",{"class":"MainList"})[0]
> br = getbr()
> currurl = comm["href"]+"/3"
> br.open(currurl)
> html = br.response().read()
> soup = BeautifulSoup(html)
> hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> "




Yes, I have installed the BeautifulSoup module in the Python library.

[toc] | [prev] | [next] | [standalone]

#85886

From	ismahameed@gcuf.edu.pk
Date	2015-02-19 01:46 -0800
Message-ID	<6cce1037-1d2e-4e90-97c8-92ff7f1d677d@googlegroups.com>
In reply to	#85884

On Thursday, February 19, 2015 at 5:31:49 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > this is the error in the following python code, can any one help me
> > error{Traceback (most recent call last):
> >   File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> >     from BeautifulSoup import BeautifulSoup
> > ImportError: No module named BeautifulSoup} 
> > 
> > 
> > 
> > "#encoding=utf8
> > from codecs import open
> > from collections import defaultdict
> > import re
> > 
> > from BeautifulSoup import BeautifulSoup
> > import mechanize
> > import cookielib
> > import html2text
> > import time
> > 
> > 
> > def getbr():
> >     br = mechanize.Browser()
> > 
> >     # Cookie Jar
> >     cj = cookielib.LWPCookieJar()
> >     br.set_cookiejar(cj)
> > 
> >     # Browser options
> >     br.set_handle_equiv(True)
> >     br.set_handle_gzip(True)
> >     br.set_handle_redirect(True)
> >     br.set_handle_referer(True)
> >     br.set_handle_robots(False)
> > 
> >     # Follows refresh 0 but not hangs on refresh > 0
> >     br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
> > 
> >     # User-Agent (this is cheating, ok?)
> >     br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
> >     return br
> > 
> > def logthis(text):
> >     open("log.txt","a","utf8").write(text+"\n")
> > 
> > def getCommunity(community,url,out=""):
> >     # Browser
> >     
> >     # The site we will navigate into, handling it's session
> >     i = 1
> >     
> >     flag = True
> >     discussions = []
> >     baseDiscussion = []
> >     
> >     while flag:
> >         print i
> >         currurl = url+"/"+str(i)
> >         try:
> >             br = getbr()
> >             br.open(currurl)
> >             #br.follow_link(text='link')
> >             html = br.response().read()
> >             soup = BeautifulSoup(html)
> >             if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
> >                 print "done at ",i,community
> >                 logthis("done at "+str(i)+" "+community)
> >                 return True
> >             hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
> >             print currurl
> >             #print hrefList
> >             for link in hrefList:
> >                 #print str(link)
> >                 #continue
> >                 span = link.find('div',{"class":"MsgUsr"})
> >                 
> >                 if "frm_mngr" in str(span):
> >                     mgr = span.find("span",{"class":"frm_mngr"}).string
> >                     if not "''" in mgr:
> >                         continue
> >                     mgr = mgr.replace("'","")
> >                     date =  link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
> >                     #out.write(community+"\t"+mgr+"\t"+date+"\n")
> >                     print community.rstrip(),date,mgr
> >                     #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
> >                     ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
> >                     print "bla"
> >                     ans = fixHtml2(str(ansDiv))
> >                     print "bla"
> >                     print ans
> >                     #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
> >                     #fout.close()
> >                     questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
> >                     print "bla",questionDiv
> >                     quesiton = fixHtml2(str(questionDiv))
> >                     print question
> >                 span = None
> >                 
> >                 
> >             
> >             soup = None
> >             br = None
> >         except:
> >             
> >             time.sleep(60)
> >         i+=1
> >     return list(set(discussions))
> >     
> > def fixHtml(page):
> >     page = page.replace("</p>","\n")
> >     page = page.replace("</P>","\n")
> >     page = page.replace("<br />","\n")
> >     page = page.replace("<BR />","\n")
> >     page = page.replace("<br>","\n")
> >     page = page.replace("<BR>","\n")
> >     page = page.replace("&quot;","'")
> >     reg = re.compile("<")
> >     reg2 = re.compile(">")
> >     page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
> >     page = page.replace("\r\n\t\t\t","\n")
> >     return page
> > 
> > def fixHtml2(page):
> >     page = page.split('ner">')[1].split("<div")[0]
> >     print page
> >     page = page.replace("</p>","\n")
> >     page = page.replace("</P>","\n")
> >     page = page.replace("<br />","\n")
> >     page = page.replace("<BR />","\n")
> >     page = page.replace("<br>","\n")
> >     page = page.replace("<BR>","\n")
> >     page = page.replace("&quot;","'")
> >     return page
> >         
> > def getText(br,url):
> >     br.open(url)
> >     html = br.response().read()
> >     soup = BeautifulSoup(html)
> >     title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
> >     #print title
> >     artics = soup.findAll('div',{'class':"article"})
> >     text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
> >     text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
> >     text = text.decode("utf-8")
> >     #text = artics[0] +
> >     #print type(title),type(text)
> >     
> >     return title+text    
> > 
> > def getForums(file = "links.htm"):
> >     #out = open("beokDates","w","utf8")
> >     soup = BeautifulSoup(open(file,"r").read())
> >     communities = soup.findAll("a",{"class":"MainList"})
> >     for comm in communities:
> >         #print comm["href"]
> >         getCommunity(comm.string,comm["href"])
> >         
> > getForums()    
> > #links = getQALinks()
> > file = "links.htm"
> > soup = BeautifulSoup(open(file,"r").read())
> > comm = soup.findAll("a",{"class":"MainList"})[0]
> > br = getbr()
> > currurl = comm["href"]+"/3"
> > br.open(currurl)
> > html = br.response().read()
> > soup = BeautifulSoup(html)
> > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> > "
> 
> 
> 
> 
> yes i have install the beautifulsoup module in python library .

When I checked in cmd whether the module works, it shows that it is installed, but when I run my program it still shows the error I posted before.

[toc] | [prev] | [next] | [standalone]

#85887

From	ismahameed@gcuf.edu.pk
Date	2015-02-19 01:48 -0800
Message-ID	<9cbcc65d-72ce-423e-9a72-1c81c9c1a7c3@googlegroups.com>
In reply to	#85886

On Thursday, February 19, 2015 at 5:46:42 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> On Thursday, February 19, 2015 at 5:31:49 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > > this is the error in the following python code, can any one help me
> > > error{Traceback (most recent call last):
> > >   File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> > >     from BeautifulSoup import BeautifulSoup
> > > ImportError: No module named BeautifulSoup} 
> > > 
> > > 
> > > 
> > > "#encoding=utf8
> > > from codecs import open
> > > from collections import defaultdict
> > > import re
> > > 
> > > from BeautifulSoup import BeautifulSoup
> > > import mechanize
> > > import cookielib
> > > import html2text
> > > import time
> > > 
> > > 
> > > def getbr():
> > >     br = mechanize.Browser()
> > > 
> > >     # Cookie Jar
> > >     cj = cookielib.LWPCookieJar()
> > >     br.set_cookiejar(cj)
> > > 
> > >     # Browser options
> > >     br.set_handle_equiv(True)
> > >     br.set_handle_gzip(True)
> > >     br.set_handle_redirect(True)
> > >     br.set_handle_referer(True)
> > >     br.set_handle_robots(False)
> > > 
> > >     # Follows refresh 0 but not hangs on refresh > 0
> > >     br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
> > > 
> > >     # User-Agent (this is cheating, ok?)
> > >     br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
> > >     return br
> > > 
> > > def logthis(text):
> > >     open("log.txt","a","utf8").write(text+"\n")
> > > 
> > > def getCommunity(community,url,out=""):
> > >     # Browser
> > >     
> > >     # The site we will navigate into, handling it's session
> > >     i = 1
> > >     
> > >     flag = True
> > >     discussions = []
> > >     baseDiscussion = []
> > >     
> > >     while flag:
> > >         print i
> > >         currurl = url+"/"+str(i)
> > >         try:
> > >             br = getbr()
> > >             br.open(currurl)
> > >             #br.follow_link(text='link')
> > >             html = br.response().read()
> > >             soup = BeautifulSoup(html)
> > >             if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
> > >                 print "done at ",i,community
> > >                 logthis("done at "+str(i)+" "+community)
> > >                 return True
> > >             hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
> > >             print currurl
> > >             #print hrefList
> > >             for link in hrefList:
> > >                 #print str(link)
> > >                 #continue
> > >                 span = link.find('div',{"class":"MsgUsr"})
> > >                 
> > >                 if "frm_mngr" in str(span):
> > >                     mgr = span.find("span",{"class":"frm_mngr"}).string
> > >                     if not "''" in mgr:
> > >                         continue
> > >                     mgr = mgr.replace("'","")
> > >                     date =  link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
> > >                     #out.write(community+"\t"+mgr+"\t"+date+"\n")
> > >                     print community.rstrip(),date,mgr
> > >                     #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
> > >                     ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
> > >                     print "bla"
> > >                     ans = fixHtml2(str(ansDiv))
> > >                     print "bla"
> > >                     print ans
> > >                     #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
> > >                     #fout.close()
> > >                     questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
> > >                     print "bla",questionDiv
> > >                     quesiton = fixHtml2(str(questionDiv))
> > >                     print question
> > >                 span = None
> > >                 
> > >                 
> > >             
> > >             soup = None
> > >             br = None
> > >         except:
> > >             
> > >             time.sleep(60)
> > >         i+=1
> > >     return list(set(discussions))
> > >     
> > > def fixHtml(page):
> > >     page = page.replace("</p>","\n")
> > >     page = page.replace("</P>","\n")
> > >     page = page.replace("<br />","\n")
> > >     page = page.replace("<BR />","\n")
> > >     page = page.replace("<br>","\n")
> > >     page = page.replace("<BR>","\n")
> > >     page = page.replace("&quot;","'")
> > >     reg = re.compile("<")
> > >     reg2 = re.compile(">")
> > >     page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
> > >     page = page.replace("\r\n\t\t\t","\n")
> > >     return page
> > > 
> > > def fixHtml2(page):
> > >     page = page.split('ner">')[1].split("<div")[0]
> > >     print page
> > >     page = page.replace("</p>","\n")
> > >     page = page.replace("</P>","\n")
> > >     page = page.replace("<br />","\n")
> > >     page = page.replace("<BR />","\n")
> > >     page = page.replace("<br>","\n")
> > >     page = page.replace("<BR>","\n")
> > >     page = page.replace("&quot;","'")
> > >     return page
> > >         
> > > def getText(br,url):
> > >     br.open(url)
> > >     html = br.response().read()
> > >     soup = BeautifulSoup(html)
> > >     title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
> > >     #print title
> > >     artics = soup.findAll('div',{'class':"article"})
> > >     text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
> > >     text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
> > >     text = text.decode("utf-8")
> > >     #text = artics[0] +
> > >     #print type(title),type(text)
> > >     
> > >     return title+text    
> > > 
> > > def getForums(file = "links.htm"):
> > >     #out = open("beokDates","w","utf8")
> > >     soup = BeautifulSoup(open(file,"r").read())
> > >     communities = soup.findAll("a",{"class":"MainList"})
> > >     for comm in communities:
> > >         #print comm["href"]
> > >         getCommunity(comm.string,comm["href"])
> > >         
> > > getForums()    
> > > #links = getQALinks()
> > > file = "links.htm"
> > > soup = BeautifulSoup(open(file,"r").read())
> > > comm = soup.findAll("a",{"class":"MainList"})[0]
> > > br = getbr()
> > > currurl = comm["href"]+"/3"
> > > br.open(currurl)
> > > html = br.response().read()
> > > soup = BeautifulSoup(html)
> > > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> > > "
> > 
> > 
> > 
> > 
> > yes i have install the beautifulsoup module in python library .
> 
> when i checked that the module is working or not then in cmd its show that it is install but when i run my program code then its show that error which i have written before

I am using Windows 8 with Python 2.7, and I installed the BeautifulSoup module from this website: http://www.crummy.com/software/BeautifulSoup/bs4

[toc] | [prev] | [next] | [standalone]

#85895

From	Mark Lawrence <breamoreboy@yahoo.co.uk>
Date	2015-02-19 10:49 +0000
Message-ID	<mailman.18879.1424343016.18130.python-list@python.org>
In reply to	#85887

On 19/02/2015 09:48, ismahameed@gcuf.edu.pk wrote:

As Dave Angel said nearly two hours ago the module is called bs4, so the 
command you need is:-

from bs4 import BeautifulSoup

In future please don't repeat the entire email just to add a sentence or 
two, particularly when you do so three times, as some people pay for 
bandwidth.  Thanks.

-- 
My fellow Pythonistas, ask not what our language can do for you, ask
what you can do for our language.

Mark Lawrence

[toc] | [prev] | [standalone]

csiph-web

urgent help

Contents

#85879 — urgent help

#85882

#85902

#85884

#85886

#85887

#85895