If it won't be simple, it simply won't be. [source code] by Miki Tebeka, CEO, 353Solutions

Wednesday, July 12, 2006

Easy Web Scraping

This is a little script, run from a cron job, that emails me the latest Reality Check comic.

#!/usr/bin/python
# Send new "Reality Check" image in email

from urllib import urlopen
import re
from smtplib import SMTP
from email.MIMEImage import MIMEImage
from email.MIMEMultipart import MIMEMultipart
from time import ctime

# Address used as both the sender and the recipient of the comic email
MYMAIL = "miki.tebeka@gmail.com"
# Search function that locates the comic image file name (e.g.
# "reality123.gif") in the page text; returns a match object or None.
# Raw string so that \d and \. reach the regex engine untouched.
find_image = re.compile(r"reality\d+\.gif").search
# Comic home page; the image itself is fetched from below this URL
BASE_URL = "http://www.unitedmedia.com/comics/reality"

def _fetch(url):
    '''Return the raw content found at url, always closing the connection'''
    conn = urlopen(url)
    try:
        return conn.read()
    finally:
        conn.close()


def send_new():
    '''Send new image in email

    Looks up the current image file name in the page at BASE_URL, downloads
    that image and mails it as an attachment from MYMAIL to MYMAIL.

    Raises ValueError when no image file name is found in the page.
    Download and SMTP errors are not handled here and propagate to the caller.
    '''
    # Find the image file name in the comic page
    match = find_image(_fetch(BASE_URL))
    if not match:
        # No "error: " prefix here: the script entry point already adds one
        raise ValueError("can't find image file in web page")
    image = match.group()

    # Full image URL
    url = BASE_URL + "/archive/images/" + image
    # Read image data
    image_data = _fetch(url)

    # Build the email with the image attached under its original file name
    msg = MIMEMultipart()
    msg["Subject"] = "Reality check for %s " % ctime()
    msg["To"] = MYMAIL
    msg["From"] = MYMAIL
    att = MIMEImage(image_data)
    att.add_header("Content-Disposition", "attachment", filename=image)
    msg.attach(att)

    # Send it; close the connection even if sending fails
    server = SMTP("my-mail-host")
    try:
        server.sendmail(MYMAIL, [MYMAIL], msg.as_string())
    finally:
        server.close()

if __name__ == "__main__":
    # Report any failure as a single "error: ..." line via SystemExit
    # instead of a traceback.
    try:
        send_new()
    except Exception as e:  # "as" form: valid on Python 2.6+ and Python 3
        raise SystemExit("error: %s" % e)
Post a Comment

Blog Archive