Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!goblin1!goblin2!goblin.stu.neva.ru!newsfeed.xs4all.nl!newsfeed3a.news.xs4all.nl!xs4all!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.009 X-Spam-Evidence: '*H*': 0.98; '*S*': 0.00; 'output': 0.05; 'subsequent': 0.05; 'assign': 0.07; 'context': 0.07; 'sys': 0.07; 'subject:help': 0.08; 'append': 0.09; 'req': 0.09; 'run,': 0.09; 'try:': 0.09; 'def': 0.12; 'here.\xa0': 0.16; 'skip:j 30': 0.16; 'soup': 0.16; 'urllib2,': 0.16; '8bit%:5': 0.22; 'code,': 0.22; 'programming': 0.22; 'import': 0.22; 'skip:\xa0 20': 0.24; 'earlier': 0.24; 'fine': 0.24; 'script': 0.25; 'appreciated.': 0.29; 'skip:p 30': 0.29; 'absolute': 0.30; 'message- id:@mail.gmail.com': 0.30; 'skip:( 20': 0.30; "i'm": 0.30; 'lines': 0.31; "skip:' 10": 0.31; 'extract': 0.31; 'stuff': 0.32; 'run': 0.32; 'skip:d 20': 0.34; 'display': 0.35; 'except': 0.35; 'skip:u 20': 0.35; 'but': 0.35; 'received:google.com': 0.35; '8bit%:9': 0.36; 'skip:j 20': 0.36; 'subject:data': 0.36; 'url:jobs': 0.36; 'next': 0.36; 'two': 0.37; 'list.': 0.37; 'skip:& 10': 0.38; '8bit%:4': 0.38; 'to:addr:python-list': 0.38; 'rather': 0.38; '\xa0\xa0\xa0': 0.39; 'to:addr:python.org': 0.39; 'unable': 0.39; 'skip:p 20': 0.39; '8bit%:6': 0.40; 'skip:u 10': 0.60; 'tag': 0.61; 'new': 0.61; 'first': 0.61; 'save': 0.62; 'name': 0.63; '8bit%:10': 0.64; 'jobs': 0.68; 'date,': 0.68; 'skip:r 30': 0.69; 'url:portal': 0.74; 'subject:want': 0.91; 'url:n': 0.93 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:date:message-id:subject:from:to:content-type; bh=BpWG9CiYTb/KQ1et/LE+LkxSXFwEih7DRspMlo5VFPY=; b=xiIiiquB24ZywEsC4h3rD/caYS6ZC7em2QGidV7+bsQSE+JuQNH4ZBUrFuHZ7CTM+o j7UFGvkFfNyb2BDAme87RUNgy+EJ487Ruyv+JOXLsP+5nYdMVhxkd9ynZdq8O3S1Zvkp Dv6EfWpZ4Fxmhbl9gXLdwqXqXpXEO0XcBIW9F25mubgUG2YG6JhoxQdOeKID9zp9v+sj 
2gYmh/oX8XFubcIeXFuc8uDDOprD7asHKS96yPAfKGB+2rJG5FveeIi26bjeIRAifpGH Rc+I+zbHt0zJEN9luo5+vLGbHJUBeOUfxOBKyeoLJ1z3P2oyJkvhsVo58NElyq4o+b1q pf5A== MIME-Version: 1.0 X-Received: by 10.68.20.1 with SMTP id j1mr21458927pbe.148.1390676441116; Sat, 25 Jan 2014 11:00:41 -0800 (PST) Date: Sat, 25 Jan 2014 11:00:41 -0800 Subject: Pls help me...I want to save data to my database but I am unable to From: Max Cuban To: python-list@python.org Content-Type: multipart/alternative; boundary=bcaec520e87d4535bb04f0d017bc X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 195 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1390676453 news.xs4all.nl 2867 [2001:888:2000:d::a6]:42357 X-Complaints-To: abuse@xs4all.nl Xref: csiph.com comp.lang.python:64749 --bcaec520e87d4535bb04f0d017bc Content-Type: text/plain; charset=ISO-8859-1 This is my first programming pet project. I have the following script that extracts links from specific sites and displays them on the web (via Django). The script works fine, but I'm unable to save anything to my database. Hence, if I run the code, I get the output I want, but then it always extracts only new content. I would rather have the content scraped earlier saved to the database so that, on subsequent runs, it scrapes and appends ONLY new links to the list. [ ] Any help will be appreciated. [] # Create your views here. 
from django.template.loader import get_template from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger from django.shortcuts import render_to_response from django.template import Context from bs4 import BeautifulSoup import urllib2, sys import urlparse import re from datetime import date, datetime from listing.models import jobLinks def businessghana(): site = "http://www.businessghana.com/portal/jobs" hdr = {'User-Agent' : 'Mozilla/5.0'} req = urllib2.Request(site, headers=hdr) jobpass = urllib2.urlopen(req) soup = BeautifulSoup(jobpass) for tag in soup.find_all('a', href = True): tag['href'] = urlparse.urljoin(' http://www.businessghana.com/portal/', tag['href']) return map(str, soup.find_all('a', href = re.compile('.getJobInfo'))) def tonaton(): site = "http://tonaton.com/en/job-vacancies-in-ghana" hdr = {'User-Agent' : 'Mozilla/5.0'} req = urllib2.Request(site, headers=hdr) jobpass = urllib2.urlopen(req) soup = BeautifulSoup(jobpass) result = [] # next two lines make all the links in the soup absolute for tag in soup.find_all('a', href=True): tag['href'] = urlparse.urljoin('http://www.tonaton.com', tag['href']) # assign all 'h2' tags to 'jobs'. 
The 'h2'tag contains the required links jobs = soup.find_all('h2') # Loop through the 'h2' tags and extract all the links for h2 in soup.find_all('h2'): n = h2.next_element if n.name == 'a': result.append(str(n)) return result def jobscomgh(): site = "http://jobs.com.gh" hdr = {'User-Agent' : 'Mozilla/5.0'} req = urllib2.Request(site, headers=hdr) jobpass = urllib2.urlopen(req) soup = BeautifulSoup(jobpass) return map(str, soup.find_all('a', href = re.compile('.display-job'))) businessghana_links = businessghana() tonaton_links = tonaton() jobscomgh_links = jobscomgh() def all_links(): return (businessghana_links + tonaton_links + jobscomgh_links) def save_new_links(all_links): current_links = jobLinks.objects.all() for i in all_links: if i not in current_links: jobLinks.objects.create(url=i) def this_week_links(all_links): return jobLinks.objects.filter(date__gte = datetime.timedelta(days=-7)) save_new_links(all_links) this_week_links(all_links) def display_links(request): name = all_links() paginator = Paginator(name, 25) page = request.GET.get('page') try: name = paginator.page(page) except PageNotAnInteger: name = paginator.page(1) except EmptyPage: name = paginator.page(paginator.num_pages) return render_to_response('jobs.html', {'name' : name}) --bcaec520e87d4535bb04f0d017bc Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable
This is my first programming pet project. I have the following script that extracts links from specific sites and displays them on the web (via Django). The script works fine, but I'm unable to save anything to my database.

Hence, if I run the code, I get the output I want, but then it always extracts only new content. I would rather have the content scraped earlier saved to the database so that, on subsequent runs, it scrapes and appends ONLY new links to the list.
[ ]
Any help will be appreciated.
[]
=A0= =A0 # Create your views here.
=A0 =A0 from django.template.loade= r import get_template
=A0 =A0 from django.core.paginator import P= aginator, EmptyPage, PageNotAnInteger
=A0 =A0 from django.shortcuts import render_to_response=A0
= =A0 =A0 from django.template import Context
=A0 =A0 from bs4 impo= rt BeautifulSoup
=A0 =A0 import urllib2, sys
=A0 =A0 im= port urlparse
=A0 =A0 import re
=A0 =A0 from datetime import date, datetim= e
=A0 =A0 from listing.models import jobLinks


=A0 =A0 def businessghana():
=A0 =A0 =A0 =A0= site =3D "http:/= /www.businessghana.com/portal/jobs"
=A0 =A0 =A0 =A0 hdr =3D {'User-Agent' : 'Mozilla/5.0'}=
=A0 =A0 =A0 =A0 req =3D urllib2.Request(site, headers=3Dhdr)
=A0 =A0 =A0 =A0 jobpass =3D urllib2.urlopen(req)
=A0 =A0 = =A0 =A0 soup =3D BeautifulSoup(jobpass)
=A0 =A0 =A0 =A0 for tag in soup.find_all('a', href =3D True):<= /div>
=A0 =A0 =A0 =A0 =A0 =A0 tag['href'] =3D urlparse.urljoin(= 'http://www.businessgh= ana.com/portal/', tag['href'])
=A0 =A0 =A0 =A0 return map(str, soup.find_all('a', href =3D re= .compile('.getJobInfo')))

=A0 =A0 def tona= ton():
=A0 =A0 =A0 =A0 hdr =3D {'User-Agent' : 'Mozilla/5.0'}=
=A0 =A0 =A0 =A0 req =3D urllib2.Request(site, headers=3Dhdr)
=A0 =A0 =A0 =A0 jobpass =3D urllib2.urlopen(req)
=A0 =A0 = =A0 =A0 soup =3D BeautifulSoup(jobpass)
=A0 =A0 =A0 =A0 result =3D []
=A0 =A0 =A0 =A0 # next two lin= es make all the links in the soup absolute =A0 =A0
=A0 =A0 =A0 = =A0 for tag in soup.find_all('a', href=3DTrue):
=A0 =A0 =A0 =A0 # assign all 'h2' tags to 'jobs'. The = 'h2'tag contains the required links =A0=A0
=A0 =A0 =A0 = =A0 jobs =3D soup.find_all('h2')
=A0 =A0 =A0 =A0 # Loop t= hrough the 'h2' tags and extract all the links
=A0 =A0 =A0 =A0 for h2 in soup.find_all('h2'):
=A0 = =A0 =A0 =A0 =A0 =A0 n =3D h2.next_element
=A0 =A0 =A0 =A0 =A0 =A0= if n.name =3D=3D 'a': =A0result.appe= nd(str(n))
=A0 =A0 =A0 =A0 return result

=A0 =A0 def jobscomgh():
=A0 =A0 =A0 =A0 site= =3D "http://jobs.com.gh"
=A0 =A0 =A0 =A0 hdr =3D {'User-Agent' : 'Mozilla/5.0'= }
=A0 =A0 =A0 =A0 req =3D urllib2.Request(site, headers=3Dhdr)
=A0 =A0 =A0 =A0 jobpass =3D urllib2.urlopen(req)
=A0 =A0 =A0= =A0 soup =3D BeautifulSoup(jobpass)
=A0 =A0 =A0 =A0 return map(s= tr, soup.find_all('a', href =3D re.compile('.display-job'))= )

=A0 =A0 businessghana_links =3D businessghana()
=A0 =A0 tonaton_l= inks =3D tonaton()
=A0 =A0 jobscomgh_links =3D jobscomgh()
<= div>
=A0 =A0 def all_links():
=A0 =A0 =A0 =A0 retur= n (businessghana_links + tonaton_links + jobscomgh_links)

=A0 =A0 def save_new_links(all_links):
=A0 = =A0 =A0 =A0 current_links =3D jobLinks.objects.all()
=A0 =A0 =A0 = =A0 for i in all_links:
=A0 =A0 =A0 =A0 =A0 =A0 if i not in curre= nt_links:
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 jobLinks.objects.create= (url=3Di)
=A0 =A0=A0
=A0 =A0 def this_week_links(all_links):
=A0 =A0 =A0 =A0 return jobLinks.objects.filter(date__gte =3D datetime.time= delta(days=3D-7))

=A0 =A0 save_new_links(all_links= )
=A0 =A0 this_week_links(all_links) =A0 =A0 =A0 =A0 =A0 =A0 =A0 = =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0

=A0 =A0 def display_links(request):
=A0 =A0 = =A0 =A0 name =3D all_links()
=A0 =A0 =A0 =A0 paginator =3D Pagina= tor(name, 25)
=A0 =A0 =A0 =A0 page =3D request.GET.get('page&= #39;)
=A0 =A0 =A0 =A0 try:
=A0 =A0 =A0 =A0 =A0 =A0 name =3D paginator.page(page)
=A0 = =A0 =A0 =A0 except PageNotAnInteger:
=A0 =A0 =A0 =A0 =A0 =A0 name= =3D paginator.page(1)
=A0 =A0 =A0 =A0 except EmptyPage:
=A0 =A0 =A0 =A0 =A0 =A0 name =3D paginator.page(paginator.num_pages)
=A0 =A0=A0
=A0 =A0 =A0 =A0=A0
=A0 =A0 =A0 =A0 retu= rn render_to_response('jobs.html', {'name' : name}) =A0 =A0=
=A0 =A0=A0

--bcaec520e87d4535bb04f0d017bc--