Groups | Search | Server Info | Keyboard shortcuts | Login | Register [http] [https] [nntp] [nntps]
Groups > comp.lang.python > #85879 > unrolled thread
| Started by | ismahameed@gcuf.edu.pk |
|---|---|
| First post | 2015-02-19 00:35 -0800 |
| Last post | 2015-02-19 10:49 +0000 |
| Articles | 7 — 4 participants |
Back to article view | Back to comp.lang.python
urgent help ismahameed@gcuf.edu.pk - 2015-02-19 00:35 -0800
Re: urgent help Dave Angel <davea@davea.name> - 2015-02-19 04:00 -0500
Re: urgent help Denis McMahon <denismfmcmahon@gmail.com> - 2015-02-19 14:08 +0000
Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:31 -0800
Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:46 -0800
Re: urgent help ismahameed@gcuf.edu.pk - 2015-02-19 01:48 -0800
Re: urgent help Mark Lawrence <breamoreboy@yahoo.co.uk> - 2015-02-19 10:49 +0000
| From | ismahameed@gcuf.edu.pk |
|---|---|
| Date | 2015-02-19 00:35 -0800 |
| Subject | urgent help |
| Message-ID | <8fa27443-12f7-4ef6-ba6b-4af16abae29d@googlegroups.com> |
This is the error raised by the following Python code; can anyone help me?
error{Traceback (most recent call last):
File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
from BeautifulSoup import BeautifulSoup
ImportError: No module named BeautifulSoup}
"#encoding=utf8
from codecs import open
from collections import defaultdict
import re
from BeautifulSoup import BeautifulSoup
import mechanize
import cookielib
import html2text
import time
def getbr():
    """Build a mechanize browser set up to look like a desktop Firefox.

    Returns a new ``mechanize.Browser`` that has its own LWP cookie jar,
    does not honour robots.txt, and sends a spoofed User-agent header.
    """
    browser = mechanize.Browser()

    # Every browser gets a cookie jar of its own.
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # Handlers that are switched on (same order as they were listed before).
    for enable in (
        browser.set_handle_equiv,
        browser.set_handle_gzip,
        browser.set_handle_redirect,
        browser.set_handle_referer,
    ):
        enable(True)
    # robots.txt is deliberately ignored.
    browser.set_handle_robots(False)

    # Follows refresh 0 but does not hang on refresh > 0.
    refresh_processor = mechanize._http.HTTPRefreshProcessor()
    browser.set_handle_refresh(refresh_processor, max_time=1)

    # Pretend to be a regular browser rather than a script.
    user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    browser.addheaders = [('User-agent', user_agent)]
    return browser
def logthis(text):
    """Append *text* followed by a newline to ``log.txt``.

    The file lives in the current working directory, is opened in append
    mode as UTF-8, and is closed before the function returns (the previous
    version never closed the handle explicitly).

    text -- the message to record.
    """
    # Local import: codecs.open is named explicitly so this function does not
    # depend on a module-level name that shadows the builtin open().
    import codecs

    with codecs.open("log.txt", "a", "utf8") as log_file:
        log_file.write(text + "\n")
def getCommunity(community,url,out=""):
    """Walk the numbered pages of one forum and print its expert Q&A pairs.

    Pages are fetched as ``url + "/" + str(i)`` for i = 1, 2, ... until a
    page whose <title> equals the site's end-of-forum title comes back.

    On each page, every ``MsgTtlChildRow`` div whose ``MsgUsr`` block holds a
    ``frm_mngr`` span containing ``''`` is processed: the ``BodyMesInner``
    div of its next sibling (answer) and of its previous sibling (question)
    are run through fixHtml2 and printed.

    Parameters:
        community -- forum name; only used in console and log messages.
        url       -- base URL of the forum; the page number is appended.
        out       -- unused, kept so existing callers keep working.

    Returns True once the end-of-forum page has been seen.

    A page that raises any ordinary exception is logged, followed by a
    60 second pause, and the crawl then continues with the next page.

    NOTE(review): the posted source had lost its indentation, so the nesting
    used here (what sits inside the ``frm_mngr`` branch, and the try block
    wrapping one whole page) is a reconstruction -- confirm against the
    original file.
    """
    # Title served once the page number runs past the end of the forum
    # (Hebrew; presumably the site's "system message" page -- confirm).
    end_title = u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n'
    # Seconds to wait after a failed page before trying the next one.
    failure_delay = 60

    i = 1
    while True:
        print(i)
        currurl = url+"/"+str(i)
        try:
            br = getbr()
            br.open(currurl)
            html = br.response().read()
            soup = BeautifulSoup(html)
            if soup.find("title").string == end_title:
                print(" ".join(["done at ",str(i),community]))
                logthis("done at "+str(i)+" "+community)
                return True
            print(currurl)
            for link in soup.findAll('div',{"class":"MsgTtlChildRow"}):
                span = link.find('div',{"class":"MsgUsr"})
                if "frm_mngr" not in str(span):
                    continue
                mgr = span.find("span",{"class":"frm_mngr"}).string
                # Only names containing '' are kept (presumably an
                # abbreviated title -- verify against the site's markup).
                if "''" not in mgr:
                    continue
                mgr = mgr.replace("'","")
                date = link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
                print(" ".join([community.rstrip(),date,mgr]))
                ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
                ans = fixHtml2(str(ansDiv))
                print(ans)
                questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
                # Was assigned to the misspelled name "quesiton", so the
                # print below raised NameError for every matching row.
                question = fixHtml2(str(questionDiv))
                print(question)
        except Exception as err:
            # Previously a bare "except:" that hid the failure entirely and
            # also swallowed Ctrl-C; record what went wrong before backing off.
            logthis("error at "+str(i)+" "+community+" "+repr(err))
            time.sleep(failure_delay)
        i += 1
def fixHtml(page):
    """Reduce an HTML fragment to plain text.

    Paragraph ends and line breaks become newlines and ``&quot;`` becomes an
    apostrophe.  Every remaining tag is then dropped: the text is cut at each
    ``<`` and, within each piece, only what follows the last ``>`` is kept;
    the pieces are re-joined with single spaces.  Finally each CRLF followed
    by three tabs is collapsed to a newline.

    page -- the markup as a string.
    Returns the cleaned string.
    """
    for tag in ("</p>", "</P>", "<br />", "<BR />", "<br>", "<BR>"):
        page = page.replace(tag, "\n")
    # The posted code read replace(""","'"), which is not valid Python: the
    # HTML entity had been decoded when the message was rendered.  &quot; is
    # the most likely original spelling -- confirm against the real file.
    page = page.replace("&quot;", "'")
    # Plain str.split does the job of the two single-character regexes.
    page = " ".join([piece.split(">")[-1] for piece in page.split("<")])
    page = page.replace("\r\n\t\t\t", "\n")
    return page
def fixHtml2(page):
    """Pull the message text out of a serialized ``BodyMesInner`` div.

    Keeps what follows the first ``ner">`` up to the next ``ner">`` or
    ``<div`` (whichever comes first), prints that raw slice, then turns
    paragraph ends and line breaks into newlines and ``&quot;`` into an
    apostrophe.

    page -- the div's markup as a string.
    Returns the cleaned text.
    Raises IndexError when ``ner">`` does not occur in page.
    """
    page = page.split('ner">')[1].split("<div")[0]
    print(page)
    for tag in ("</p>", "</P>", "<br />", "<BR />", "<br>", "<BR>"):
        page = page.replace(tag, "\n")
    # The posted code read replace(""","'"), which is not valid Python: the
    # HTML entity had been decoded when the message was rendered.  &quot; is
    # the most likely original spelling -- confirm against the real file.
    page = page.replace("&quot;", "'")
    return page
def getText(br,url):
    """Fetch one article page and return its title followed by its two bodies.

    br  -- browser object used to open the page (its open() and response()
           methods are called).
    url -- address of the page.

    The result is the cleaned ``articleName`` heading, then a newline and the
    first ``article`` div, then a newline and the second ``article`` div
    wrapped in <EXPERT>...</EXPERT>.  Both divs are cleaned with fixHtml.
    """
    def inner_markup(tag):
        # Text between the opening '"article">' and the first '</div>'.
        serialized = str(tag)
        return serialized.split('"article">')[1].split('</div>')[0]

    br.open(url)
    document = BeautifulSoup(br.response().read())

    heading = document.find('h1',{'class':"articleName"})
    title = fixHtml(heading.contents[0])

    articles = document.findAll('div',{'class':"article"})
    first_part = "\n"+fixHtml(inner_markup(articles[0]))
    expert_part = "\n<EXPERT>"+ fixHtml(inner_markup(articles[1]))+"</EXPERT>"

    # The pieces are byte strings at this point; decode them in one go.
    body = (first_part + expert_part).decode("utf-8")
    return title+body
def getForums(file = "links.htm"):
    """Crawl every forum linked from a saved index page.

    file -- path of an HTML file; each ``<a class="MainList">`` element in it
            is handed to getCommunity as (link text, href).

    Returns None.
    """
    # Read the index inside a with-block so the handle is closed right away
    # (it used to be left open for the whole crawl).
    with open(file,"r") as index_file:
        markup = index_file.read()
    soup = BeautifulSoup(markup)
    for comm in soup.findAll("a",{"class":"MainList"}):
        getCommunity(comm.string,comm["href"])
# Entry point: crawl every forum listed in links.htm.  This runs as soon as
# the module is executed or imported (there is no __main__ guard).
getForums()
#links = getQALinks()
# NOTE(review): everything below looks like leftover exploration code -- it
# re-reads the index, takes the first forum link, fetches that forum's page 3
# and keeps the first MsgTtlChildRow div in hrefList, which is not used again.
file = "links.htm"
soup = BeautifulSoup(open(file,"r").read())
comm = soup.findAll("a",{"class":"MainList"})[0]
br = getbr()
currurl = comm["href"]+"/3"
br.open(currurl)
html = br.response().read()
soup = BeautifulSoup(html)
hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
"
[toc] | [next] | [standalone]
| From | Dave Angel <davea@davea.name> |
|---|---|
| Date | 2015-02-19 04:00 -0500 |
| Message-ID | <mailman.18871.1424336461.18130.python-list@python.org> |
| In reply to | #85879 |
On 02/19/2015 03:35 AM, ismahameed@gcuf.edu.pk wrote:
> this is the error in the following python code, can any one help me
> error{Traceback (most recent call last):
> File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> from BeautifulSoup import BeautifulSoup
> ImportError: No module named BeautifulSoup}
>
>
>
> "#encoding=utf8
> from codecs import open
> from collections import defaultdict
> import re
>
> from BeautifulSoup import BeautifulSoup
When you can demonstrate a problem in a couple of lines of source code,
why would you waste our bandwidth showing us dozens of unrelated lines?
Since the error says there's no module named BeautifulSoup, perhaps
that's because you haven't installed BeautifulSoup. it's not in the
standard library.
I've never used it, but a quick web search found me the page:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
And that seems to say the module is called bs4.
Anyway, if you did install it, and read the directions, and are still
stumped, you probably need to supply many other details:
1) what version of Python are you using, and do you have multiple
versions installed
2) what OS
3) where did you download it from, and what commands did you use to
actually install it? How did you specify which Python version it would
install to?
4) what your import line looks like (which you did specify)
5) and of course, what the exception is (which you did include)
Other things people may need to know include what directory the bs4.pyc
file is installed to, what your sys.path is, and so on. But just
answering the first questions might let you figure it out for yourself.
--
DaveA
[toc] | [prev] | [next] | [standalone]
| From | Denis McMahon <denismfmcmahon@gmail.com> |
|---|---|
| Date | 2015-02-19 14:08 +0000 |
| Message-ID | <mc4qp0$cm7$1@dont-email.me> |
| In reply to | #85882 |
On Thu, 19 Feb 2015 04:00:50 -0500, Dave Angel wrote:
> On 02/19/2015 03:35 AM, ismahameed@gcuf.edu.pk wrote:
>> this is the error in the following python code, can any one help me
>> error{Traceback (most recent call last):
>> File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in
>> <module>
>> from BeautifulSoup import BeautifulSoup
>> ImportError: No module named BeautifulSoup}
>>
>> "#encoding=utf8 from codecs import open from collections import
>> defaultdict import re
>>
>> from BeautifulSoup import BeautifulSoup
> When you can demonstrate a problem in a couple of lines of source code,
> why would you waste our bandwidth showing us dozens of unrelated lines?
>
> Since the error says there's no module named BeautifulSoup, perhaps
> that's because you haven't installed BeautifulSoup. it's not in the
> standard library.
>
> I've never used it, but a quick web search found me the page:
>
> http://www.crummy.com/software/BeautifulSoup/bs4/doc/
*********************************************************
*********************************************************
**** ****
> **** And that seems to say the module is called bs4. ****
**** ****
*********************************************************
*********************************************************
It seems that the OP has failed to read your post, the documentation or
the examples for the code he is using.
As a very strong hint, I have highlighted your fix for his main problem
above with a few (ok, several) asterisks. Let's see if he can find it now.
If he can't, I don't understand why he bothered to ask for help, because
I'm pretty sure you nailed the issue right there, and unless he's going
to read the responses to his post to see the answers that are provided
it's a bit stupid to post asking for help in the first place.
--
Denis McMahon, denismfmcmahon@gmail.com
[toc] | [prev] | [next] | [standalone]
| From | ismahameed@gcuf.edu.pk |
|---|---|
| Date | 2015-02-19 01:31 -0800 |
| Message-ID | <11ec9cc1-0eb4-4981-8a67-2cabf26554f1@googlegroups.com> |
| In reply to | #85879 |
On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> this is the error in the following python code, can any one help me
> error{Traceback (most recent call last):
> File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> from BeautifulSoup import BeautifulSoup
> ImportError: No module named BeautifulSoup}
>
>
>
> "#encoding=utf8
> from codecs import open
> from collections import defaultdict
> import re
>
> from BeautifulSoup import BeautifulSoup
> import mechanize
> import cookielib
> import html2text
> import time
>
>
> def getbr():
> br = mechanize.Browser()
>
> # Cookie Jar
> cj = cookielib.LWPCookieJar()
> br.set_cookiejar(cj)
>
> # Browser options
> br.set_handle_equiv(True)
> br.set_handle_gzip(True)
> br.set_handle_redirect(True)
> br.set_handle_referer(True)
> br.set_handle_robots(False)
>
> # Follows refresh 0 but not hangs on refresh > 0
> br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
>
> # User-Agent (this is cheating, ok?)
> br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
> return br
>
> def logthis(text):
> open("log.txt","a","utf8").write(text+"\n")
>
> def getCommunity(community,url,out=""):
> # Browser
>
> # The site we will navigate into, handling it's session
> i = 1
>
> flag = True
> discussions = []
> baseDiscussion = []
>
> while flag:
> print i
> currurl = url+"/"+str(i)
> try:
> br = getbr()
> br.open(currurl)
> #br.follow_link(text='link')
> html = br.response().read()
> soup = BeautifulSoup(html)
> if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
> print "done at ",i,community
> logthis("done at "+str(i)+" "+community)
> return True
> hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
> print currurl
> #print hrefList
> for link in hrefList:
> #print str(link)
> #continue
> span = link.find('div',{"class":"MsgUsr"})
>
> if "frm_mngr" in str(span):
> mgr = span.find("span",{"class":"frm_mngr"}).string
> if not "''" in mgr:
> continue
> mgr = mgr.replace("'","")
> date = link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
> #out.write(community+"\t"+mgr+"\t"+date+"\n")
> print community.rstrip(),date,mgr
> #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
> ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
> print "bla"
> ans = fixHtml2(str(ansDiv))
> print "bla"
> print ans
> #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
> #fout.close()
> questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
> print "bla",questionDiv
> quesiton = fixHtml2(str(questionDiv))
> print question
> span = None
>
>
>
> soup = None
> br = None
> except:
>
> time.sleep(60)
> i+=1
> return list(set(discussions))
>
> def fixHtml(page):
> page = page.replace("</p>","\n")
> page = page.replace("</P>","\n")
> page = page.replace("<br />","\n")
> page = page.replace("<BR />","\n")
> page = page.replace("<br>","\n")
> page = page.replace("<BR>","\n")
> page = page.replace(""","'")
> reg = re.compile("<")
> reg2 = re.compile(">")
> page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
> page = page.replace("\r\n\t\t\t","\n")
> return page
>
> def fixHtml2(page):
> page = page.split('ner">')[1].split("<div")[0]
> print page
> page = page.replace("</p>","\n")
> page = page.replace("</P>","\n")
> page = page.replace("<br />","\n")
> page = page.replace("<BR />","\n")
> page = page.replace("<br>","\n")
> page = page.replace("<BR>","\n")
> page = page.replace(""","'")
> return page
>
> def getText(br,url):
> br.open(url)
> html = br.response().read()
> soup = BeautifulSoup(html)
> title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
> #print title
> artics = soup.findAll('div',{'class':"article"})
> text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
> text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
> text = text.decode("utf-8")
> #text = artics[0] +
> #print type(title),type(text)
>
> return title+text
>
> def getForums(file = "links.htm"):
> #out = open("beokDates","w","utf8")
> soup = BeautifulSoup(open(file,"r").read())
> communities = soup.findAll("a",{"class":"MainList"})
> for comm in communities:
> #print comm["href"]
> getCommunity(comm.string,comm["href"])
>
> getForums()
> #links = getQALinks()
> file = "links.htm"
> soup = BeautifulSoup(open(file,"r").read())
> comm = soup.findAll("a",{"class":"MainList"})[0]
> br = getbr()
> currurl = comm["href"]+"/3"
> br.open(currurl)
> html = br.response().read()
> soup = BeautifulSoup(html)
> hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> "
Yes, I have installed the BeautifulSoup module in the Python library.
[toc] | [prev] | [next] | [standalone]
| From | ismahameed@gcuf.edu.pk |
|---|---|
| Date | 2015-02-19 01:46 -0800 |
| Message-ID | <6cce1037-1d2e-4e90-97c8-92ff7f1d677d@googlegroups.com> |
| In reply to | #85884 |
On Thursday, February 19, 2015 at 5:31:49 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > this is the error in the following python code, can any one help me
> > error{Traceback (most recent call last):
> > File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> > from BeautifulSoup import BeautifulSoup
> > ImportError: No module named BeautifulSoup}
> >
> >
> >
> > "#encoding=utf8
> > from codecs import open
> > from collections import defaultdict
> > import re
> >
> > from BeautifulSoup import BeautifulSoup
> > import mechanize
> > import cookielib
> > import html2text
> > import time
> >
> >
> > def getbr():
> > br = mechanize.Browser()
> >
> > # Cookie Jar
> > cj = cookielib.LWPCookieJar()
> > br.set_cookiejar(cj)
> >
> > # Browser options
> > br.set_handle_equiv(True)
> > br.set_handle_gzip(True)
> > br.set_handle_redirect(True)
> > br.set_handle_referer(True)
> > br.set_handle_robots(False)
> >
> > # Follows refresh 0 but not hangs on refresh > 0
> > br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
> >
> > # User-Agent (this is cheating, ok?)
> > br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
> > return br
> >
> > def logthis(text):
> > open("log.txt","a","utf8").write(text+"\n")
> >
> > def getCommunity(community,url,out=""):
> > # Browser
> >
> > # The site we will navigate into, handling it's session
> > i = 1
> >
> > flag = True
> > discussions = []
> > baseDiscussion = []
> >
> > while flag:
> > print i
> > currurl = url+"/"+str(i)
> > try:
> > br = getbr()
> > br.open(currurl)
> > #br.follow_link(text='link')
> > html = br.response().read()
> > soup = BeautifulSoup(html)
> > if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
> > print "done at ",i,community
> > logthis("done at "+str(i)+" "+community)
> > return True
> > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
> > print currurl
> > #print hrefList
> > for link in hrefList:
> > #print str(link)
> > #continue
> > span = link.find('div',{"class":"MsgUsr"})
> >
> > if "frm_mngr" in str(span):
> > mgr = span.find("span",{"class":"frm_mngr"}).string
> > if not "''" in mgr:
> > continue
> > mgr = mgr.replace("'","")
> > date = link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
> > #out.write(community+"\t"+mgr+"\t"+date+"\n")
> > print community.rstrip(),date,mgr
> > #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
> > ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
> > print "bla"
> > ans = fixHtml2(str(ansDiv))
> > print "bla"
> > print ans
> > #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
> > #fout.close()
> > questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
> > print "bla",questionDiv
> > quesiton = fixHtml2(str(questionDiv))
> > print question
> > span = None
> >
> >
> >
> > soup = None
> > br = None
> > except:
> >
> > time.sleep(60)
> > i+=1
> > return list(set(discussions))
> >
> > def fixHtml(page):
> > page = page.replace("</p>","\n")
> > page = page.replace("</P>","\n")
> > page = page.replace("<br />","\n")
> > page = page.replace("<BR />","\n")
> > page = page.replace("<br>","\n")
> > page = page.replace("<BR>","\n")
> > page = page.replace(""","'")
> > reg = re.compile("<")
> > reg2 = re.compile(">")
> > page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
> > page = page.replace("\r\n\t\t\t","\n")
> > return page
> >
> > def fixHtml2(page):
> > page = page.split('ner">')[1].split("<div")[0]
> > print page
> > page = page.replace("</p>","\n")
> > page = page.replace("</P>","\n")
> > page = page.replace("<br />","\n")
> > page = page.replace("<BR />","\n")
> > page = page.replace("<br>","\n")
> > page = page.replace("<BR>","\n")
> > page = page.replace(""","'")
> > return page
> >
> > def getText(br,url):
> > br.open(url)
> > html = br.response().read()
> > soup = BeautifulSoup(html)
> > title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
> > #print title
> > artics = soup.findAll('div',{'class':"article"})
> > text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
> > text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
> > text = text.decode("utf-8")
> > #text = artics[0] +
> > #print type(title),type(text)
> >
> > return title+text
> >
> > def getForums(file = "links.htm"):
> > #out = open("beokDates","w","utf8")
> > soup = BeautifulSoup(open(file,"r").read())
> > communities = soup.findAll("a",{"class":"MainList"})
> > for comm in communities:
> > #print comm["href"]
> > getCommunity(comm.string,comm["href"])
> >
> > getForums()
> > #links = getQALinks()
> > file = "links.htm"
> > soup = BeautifulSoup(open(file,"r").read())
> > comm = soup.findAll("a",{"class":"MainList"})[0]
> > br = getbr()
> > currurl = comm["href"]+"/3"
> > br.open(currurl)
> > html = br.response().read()
> > soup = BeautifulSoup(html)
> > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> > "
>
>
>
>
> yes i have install the beautifulsoup module in python library .
When I checked in cmd whether the module is working, it showed that it is installed, but when I run my program it shows the error I posted before.
[toc] | [prev] | [next] | [standalone]
| From | ismahameed@gcuf.edu.pk |
|---|---|
| Date | 2015-02-19 01:48 -0800 |
| Message-ID | <9cbcc65d-72ce-423e-9a72-1c81c9c1a7c3@googlegroups.com> |
| In reply to | #85886 |
On Thursday, February 19, 2015 at 5:46:42 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> On Thursday, February 19, 2015 at 5:31:49 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > On Thursday, February 19, 2015 at 4:35:18 PM UTC+8, ismah...@gcuf.edu.pk wrote:
> > > this is the error in the following python code, can any one help me
> > > error{Traceback (most recent call last):
> > > File "C:\Python27\Scripts\BeOk\getBeOKExperts.py", line 6, in <module>
> > > from BeautifulSoup import BeautifulSoup
> > > ImportError: No module named BeautifulSoup}
> > >
> > >
> > >
> > > "#encoding=utf8
> > > from codecs import open
> > > from collections import defaultdict
> > > import re
> > >
> > > from BeautifulSoup import BeautifulSoup
> > > import mechanize
> > > import cookielib
> > > import html2text
> > > import time
> > >
> > >
> > > def getbr():
> > > br = mechanize.Browser()
> > >
> > > # Cookie Jar
> > > cj = cookielib.LWPCookieJar()
> > > br.set_cookiejar(cj)
> > >
> > > # Browser options
> > > br.set_handle_equiv(True)
> > > br.set_handle_gzip(True)
> > > br.set_handle_redirect(True)
> > > br.set_handle_referer(True)
> > > br.set_handle_robots(False)
> > >
> > > # Follows refresh 0 but not hangs on refresh > 0
> > > br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
> > >
> > > # User-Agent (this is cheating, ok?)
> > > br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
> > > return br
> > >
> > > def logthis(text):
> > > open("log.txt","a","utf8").write(text+"\n")
> > >
> > > def getCommunity(community,url,out=""):
> > > # Browser
> > >
> > > # The site we will navigate into, handling it's session
> > > i = 1
> > >
> > > flag = True
> > > discussions = []
> > > baseDiscussion = []
> > >
> > > while flag:
> > > print i
> > > currurl = url+"/"+str(i)
> > > try:
> > > br = getbr()
> > > br.open(currurl)
> > > #br.follow_link(text='link')
> > > html = br.response().read()
> > > soup = BeautifulSoup(html)
> > > if soup.find("title").string == u'\r\n\t\u05d4\u05d5\u05d3\u05e2\u05ea \u05de\u05e2\u05e8\u05db\u05ea - BeOK\r\n':
> > > print "done at ",i,community
> > > logthis("done at "+str(i)+" "+community)
> > > return True
> > > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})
> > > print currurl
> > > #print hrefList
> > > for link in hrefList:
> > > #print str(link)
> > > #continue
> > > span = link.find('div',{"class":"MsgUsr"})
> > >
> > > if "frm_mngr" in str(span):
> > > mgr = span.find("span",{"class":"frm_mngr"}).string
> > > if not "''" in mgr:
> > > continue
> > > mgr = mgr.replace("'","")
> > > date = link.find('span',{"class":"MsgDate"}).string.split(" ")[1]
> > > #out.write(community+"\t"+mgr+"\t"+date+"\n")
> > > print community.rstrip(),date,mgr
> > > #fout = open("corpus\\"+community+"-"+date+"-"+mgr,"w","utf8")
> > > ansDiv = link.nextSibling.find('div',{"class":"BodyMesInner"})
> > > print "bla"
> > > ans = fixHtml2(str(ansDiv))
> > > print "bla"
> > > print ans
> > > #fout.write(fixHtml(link.find('div',{"class":"BodyMesInner"}).string)+"\n")
> > > #fout.close()
> > > questionDiv = link.previousSibling.find('div',{"class":"BodyMesInner"})
> > > print "bla",questionDiv
> > > quesiton = fixHtml2(str(questionDiv))
> > > print question
> > > span = None
> > >
> > >
> > >
> > > soup = None
> > > br = None
> > > except:
> > >
> > > time.sleep(60)
> > > i+=1
> > > return list(set(discussions))
> > >
> > > def fixHtml(page):
> > > page = page.replace("</p>","\n")
> > > page = page.replace("</P>","\n")
> > > page = page.replace("<br />","\n")
> > > page = page.replace("<BR />","\n")
> > > page = page.replace("<br>","\n")
> > > page = page.replace("<BR>","\n")
> > > page = page.replace(""","'")
> > > reg = re.compile("<")
> > > reg2 = re.compile(">")
> > > page = " ".join([x[-1] for x in map(reg2.split,reg.split(page))])
> > > page = page.replace("\r\n\t\t\t","\n")
> > > return page
> > >
> > > def fixHtml2(page):
> > > page = page.split('ner">')[1].split("<div")[0]
> > > print page
> > > page = page.replace("</p>","\n")
> > > page = page.replace("</P>","\n")
> > > page = page.replace("<br />","\n")
> > > page = page.replace("<BR />","\n")
> > > page = page.replace("<br>","\n")
> > > page = page.replace("<BR>","\n")
> > > page = page.replace(""","'")
> > > return page
> > >
> > > def getText(br,url):
> > > br.open(url)
> > > html = br.response().read()
> > > soup = BeautifulSoup(html)
> > > title = fixHtml(soup.find('h1',{'class':"articleName"}).contents[0])
> > > #print title
> > > artics = soup.findAll('div',{'class':"article"})
> > > text = "\n"+fixHtml(str(artics[0]).split('"article">')[1].split('</div>')[0])
> > > text += "\n<EXPERT>"+ fixHtml(str(artics[1]).split('"article">')[1].split('</div>')[0])+"</EXPERT>"
> > > text = text.decode("utf-8")
> > > #text = artics[0] +
> > > #print type(title),type(text)
> > >
> > > return title+text
> > >
> > > def getForums(file = "links.htm"):
> > > #out = open("beokDates","w","utf8")
> > > soup = BeautifulSoup(open(file,"r").read())
> > > communities = soup.findAll("a",{"class":"MainList"})
> > > for comm in communities:
> > > #print comm["href"]
> > > getCommunity(comm.string,comm["href"])
> > >
> > > getForums()
> > > #links = getQALinks()
> > > file = "links.htm"
> > > soup = BeautifulSoup(open(file,"r").read())
> > > comm = soup.findAll("a",{"class":"MainList"})[0]
> > > br = getbr()
> > > currurl = comm["href"]+"/3"
> > > br.open(currurl)
> > > html = br.response().read()
> > > soup = BeautifulSoup(html)
> > > hrefList = soup.findAll('div',{"class":"MsgTtlChildRow"})[0]
> > > "
> >
> >
> >
> >
> > yes i have install the beautifulsoup module in python library .
>
> when i checked that the module is working or not then in cmd its show that it is install but when i run my program code then its show that error which i have written before
I am using Windows 8 with Python 2.7 installed, and I installed the BeautifulSoup module from this website: http://www.crummy.com/software/BeautifulSoup/bs4
[toc] | [prev] | [next] | [standalone]
| From | Mark Lawrence <breamoreboy@yahoo.co.uk> |
|---|---|
| Date | 2015-02-19 10:49 +0000 |
| Message-ID | <mailman.18879.1424343016.18130.python-list@python.org> |
| In reply to | #85887 |
On 19/02/2015 09:48, ismahameed@gcuf.edu.pk wrote:
As Dave Angel said nearly two hours ago the module is called bs4, so the command you need is:-
from bs4 import BeautifulSoup
In future please don't repeat the entire email just to add a sentence or two, particularly when you do so three times, as some people pay for bandwidth. Thanks.
--
My fellow Pythonistas, ask not what our language can do for you, ask what you can do for our language.
Mark Lawrence
[toc] | [prev] | [standalone]
Back to top | Article view | comp.lang.python
csiph-web