Path: csiph.com!v102.xanadu-bbs.net!xanadu-bbs.net!feeder.erje.net!eu.feeder.erje.net!newsfeed.xs4all.nl!newsfeed4a.news.xs4all.nl!xs4all!newsgate.cistron.nl!newsgate.news.xs4all.nl!post.news.xs4all.nl!not-for-mail Return-Path: X-Original-To: python-list@python.org Delivered-To: python-list@mail.python.org X-Spam-Status: OK 0.008 X-Spam-Evidence: '*H*': 0.98; '*S*': 0.00; 'subsequent': 0.05; 'sys': 0.07; 'advance': 0.07; 'subject:help': 0.08; 'append': 0.09; 'mess': 0.09; 'req': 0.09; 'run,': 0.09; 'subject:skip:a 10': 0.09; 'subject:version': 0.09; 'django': 0.11; 'def': 0.12; 'changes': 0.15; '#this': 0.16; 'cleaner': 0.16; 'django.db': 0.16; 'empty.': 0.16; 'iterable': 0.16; 'skip:j 30': 0.16; 'soup': 0.16; 'subject: \n ': 0.16; 'urllib2,': 0.16; 'variable': 0.18; '8bit%:5': 0.22; 'import': 0.22; 'this?': 0.23; 'earlier': 0.24; 'initial': 0.24; 'looks': 0.24; 'question': 0.24; 'this:': 0.26; 'function': 0.29; '8bit%:3': 0.30; 'message-id:@mail.gmail.com': 0.30; 'asked': 0.31; 'code': 0.31; "skip:' 10": 0.31; 'extract': 0.31; 'anyone': 0.31; 'class': 0.32; 'updated': 0.34; 'sense': 0.34; 'skip:_ 10': 0.34; 'skip:d 20': 0.34; 'could': 0.34; 'skip:u 20': 0.35; 'but': 0.35; 'received:google.com': 0.35; 'version': 0.36; 'skip:j 20': 0.36; 'subject:data': 0.36; 'url:jobs': 0.36; 'thanks': 0.36; 'should': 0.36; 'list': 0.37; 'list.': 0.37; 'skip:& 10': 0.38; '8bit%:4': 0.38; 'to:addr:python-list': 0.38; 'to:addr:python.org': 0.39; 'called': 0.40; '8bit%:6': 0.40; 'even': 0.60; 'skip:u 10': 0.60; 'tag': 0.61; 'new': 0.61; 'save': 0.62; 'name': 0.63; '8bit%:10': 0.64; 'more': 0.64; 'here': 0.66; 'reply': 0.66; 'saving': 0.69; 'skip:r 30': 0.69; 'overcome': 0.74; 'url:portal': 0.74; 'hoping': 0.75; 'day': 0.76; 'potentially': 0.81; '\xa0at': 0.84; 'subject:want': 0.91 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=mime-version:date:message-id:subject:from:to:content-type; bh=wrNM0aczKALBq8jDL+Z6W1j4yD/HBTLrIhlAdlzmlcc=; b=mFGRIEY6Ael4MJgAJAVD6fJmEc9G8MELCQHbRwreTyOR8ZEijlYS57VSa/V78i0pih JlKJYbZn/JDkv+2Hg4DFyk2nUtOp3RnNMsmtycwn0tSjHmujXSM0Ox0Wx/wLpFcpkaI2 hsC5LP9PO+QWkN/j/yhcqjTgdyQqG7grfOQlGGDMfLsw3DRVsXCzrG4RaZiQ99iqIMhi GW2FKPSyFpRHBQaX81zAnlEYjQsj8sp05TXyFkPcgy1KPCm61tlvWEFHiLHvCQHi4eJf xUxY0RBlmwjWSX+bmEk4n4Bya9iValyWshVoT5UDFjg4HHKbDEAgpGcDy3BFyQkJ5jVI m5nA== MIME-Version: 1.0 X-Received: by 10.66.129.169 with SMTP id nx9mr21489173pab.130.1390684934113; Sat, 25 Jan 2014 13:22:14 -0800 (PST) Date: Sat, 25 Jan 2014 13:22:14 -0800 Subject: Pls help me...I want to save scraped data automatically to my database(cleaner version) From: Max Cuban To: python-list@python.org Content-Type: multipart/alternative; boundary=001a113653707e0eeb04f0d2119c X-BeenThere: python-list@python.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: General discussion list for the Python programming language List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Newsgroups: comp.lang.python Message-ID: Lines: 126 NNTP-Posting-Host: 2001:888:2000:d::a6 X-Trace: 1390684944 news.xs4all.nl 2939 [2001:888:2000:d::a6]:37856 X-Complaints-To: abuse@xs4all.nl Xref: csiph.com comp.lang.python:64753 --001a113653707e0eeb04f0d2119c Content-Type: text/plain; charset=ISO-8859-1 I have asked this question earlier but this should make more sense than the earlier version and I don't want anyone who could potentially helped to be put off by the initial mess even if I updated it with my cleaner version as a reply I want to save the links scraped to be save in my database so that on subsequent run, it only scrapes and append only new links to the list. This is my code below but at the end of the day my database is empty. What changes can I make to overcome this? Thanks in advance from django.template.loader import get_template from django.shortcuts import render_to_response from bs4 import BeautifulSoup import urllib2, sys import urlparse import re from listing.models import jobLinks #this function extract the links def businessghana(): site = "http://www.businessghana.com/portal/jobs" hdr = {'User-Agent' : 'Mozilla/5.0'} req = urllib2.Request(site, headers=hdr) jobpass = urllib2.urlopen(req) soup = BeautifulSoup(jobpass) for tag in soup.find_all('a', href = True): tag['href'] = urlparse.urljoin(' http://www.businessghana.com/portal/', tag['href']) return map(str, soup.find_all('a', href = re.compile('.getJobInfo'))) # result from businssghana() saved to a variable to make them iterable as a list all_links = businessghana() #this function should be saving the links to the database unless the link already exist def save_new_links(all_links): current_links = jobLinks.objects.all() for i in all_links: if i not in current_links: jobLinks.objects.create(url=i) # I called the above function here hoping that it will save to database save_new_links(all_links) # return my httpResponse with this function def display_links(request): name = all_links() return render_to_response('jobs.html', {'name' : name}) My django models.py looks like this: from django.db import models class jobLinks(models.Model): links = models.URLField() pub_date = models.DateTimeField('date retrieved') def __unicode__(self): return self.links --001a113653707e0eeb04f0d2119c Content-Type: text/html; charset=ISO-8859-1 Content-Transfer-Encoding: quoted-printable

I have asked this question earlier but= this should make more sense than the earlier version and I don't want = anyone who could potentially helped to be put off by the initial mess even = if I updated it with my cleaner version as a reply=A0

I want to save the links scraped to be save in my datab= ase so that on subsequent run, it only scrapes and append only new links to= the list.

This is my code below but =A0at the end= of the day my database is empty. What changes can I make to overcome this?= Thanks in advance

=A0 =A0
=A0 =A0 from django.template.loader i= mport get_template
=A0 =A0 from django.shortcuts import render_to= _response=A0
=A0 =A0 from bs4 import BeautifulSoup
=A0 = =A0 import urllib2, sys
=A0 =A0 import urlparse
=A0 =A0 import re
=A0 =A0 = from listing.models import jobLinks
#this function extract the links
=A0 =A0 def businessghana():
=A0 =A0 =A0 =A0 site =3D "= http://www.businessgha= na.com/portal/jobs"
=A0 =A0 =A0 =A0 hdr =3D {'User-A= gent' : 'Mozilla/5.0'}
=A0 =A0 =A0 =A0 req =3D urllib2.Request(site, headers=3Dhdr)
=A0 =A0 =A0 =A0 jobpass =3D urllib2.urlopen(req)
=A0 =A0 =A0 =A0= soup =3D BeautifulSoup(jobpass)
=A0 =A0 =A0 =A0 for tag in soup.= find_all('a', href =3D True):
=A0 =A0 =A0 =A0 =A0 =A0 tag['href'] =3D urlparse.urljoin('= http://www.businessghana.c= om/portal/', tag['href'])
=A0 =A0 =A0 =A0 return = map(str, soup.find_all('a', href =3D re.compile('.getJobInfo= 9;)))
# result from businssghana() sav= ed to a variable to make them iterable as a list
=A0 =A0 all_link= s =A0=3D businessghana()

#thi= s function should be saving the links to the database unless the link alrea= dy exist
=A0 =A0 def save_new_links(all_links):
=A0 =A0= =A0 =A0 current_links =3D jobLinks.objects.all()
=A0 =A0 =A0 =A0 for i in all_links:
=A0 =A0 =A0 =A0 =A0 =A0 = if i not in current_links:
=A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 jobLin= ks.objects.create(url=3Di)
=A0 =A0=A0
# I called the above function here hoping= that it will save to database
=A0 =A0 save_new_links(all_links)

# return my httpResponse with this = function
=A0 =A0 def display_links(request):
=A0 =A0 = =A0 =A0 name =3D all_links() =A0 =A0=A0
=A0 =A0 =A0 =A0 return render_to_response('jobs.html', {'n= ame' : name})
My django models.py l= ooks like this:
from django.db import models class jobLinks(models.Model): links =3D models.URLField() pub_date =3D models.DateTimeField('date retrieved') =20 def __unicode__(self): return self.links
= =A0

--001a113653707e0eeb04f0d2119c--