#!/usr/bin/python

#
# gpr.py generates RSS from a greenpeace.org web page
# Copyright 2003 Bill de hÓra
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of version 2.1 of
# the GNU Lesser General Public License as published by the
# Free Software Foundation.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# You can contact the author by sending e-mail to
# bill@dehora.net?subject=greenpeace-rss.
#

import os
from time import *
import urllib, re
from xml.sax.saxutils import escape


def _fetch(address):
    """Return the raw body of the document at *address*.

    Uses urllib.urlopen and closes the connection even when the read fails.
    """
    handle = urllib.urlopen(address)
    try:
        return handle.read()
    finally:
        handle.close()


def _writeIfChanged(filename, content):
    """Write *content* to *filename* unless the file already holds it.

    A missing or unreadable file counts as "changed", so the very first run
    creates the feed instead of failing with IOError.
    """
    try:
        fh = open(filename)
        try:
            previous = fh.read()
        finally:
            fh.close()
    except IOError:
        previous = None

    if content != previous:
        fh = open(filename, 'w')
        try:
            fh.write(content)
        finally:
            fh.close()


def makeRSS(url='http://www.greenpeace.org/news',
            file91='greenpeace-rss91.xml',
            file10='greenpeace-rss10.xml',
            fetch=None):
    """Scrape the news index at *url* and write RSS 0.91 and RSS 1.0 feeds.

    Every "details?item%5fid=N" link found on the index page becomes one
    feed item.  The item title is the link text; the item description is
    built from up to three "bodyhighlightbold" table cells of the linked
    details page, joined with '; '.

    Parameters:
      url    -- address of the news index page; item links are resolved
                against url + '/'.
      file91 -- path of the RSS 0.91 output file.
      file10 -- path of the RSS 1.0 output file.
      fetch  -- callable taking an address and returning the page body as
                a string; defaults to a urllib.urlopen based fetcher.

    Returns None.  Errors raised by *fetch* or while writing the output
    files propagate to the caller.

    NOTE: each output file is only rewritten when its content differs from
    what is on disk, but both feeds embed the build time (to the second),
    so in practice the content differs on almost every run.
    """
    if fetch is None:
        fetch = _fetch

    urlroot = url + '/'
    xml = '<?xml version="1.0" encoding="iso-8859-1"?>\n'
    title = 'Greenpeace News'
    description = 'An RSS feed for Greenpeace News'
    desctag = '<description>' + description + '</description>\n'
    titletag = '<title>' + title + '</title>\n'
    linktag = '<link>' + urlroot + '</link>\n'

    # Both stamps advertise a zero UTC offset, so they have to be formatted
    # from UTC rather than from the server's local time.
    now = gmtime()
    tstamp = strftime("%Y-%m-%dT%H:%M:%S+00:00", now)
    rfc822stamp = strftime("%a, %d %b %Y %H:%M:%S +0000", now)

    rss91 = xml
    rss91 += ' <rss version="0.91">\n'
    rss91 += '    <channel>\n'
    rss91 += '      ' + titletag
    rss91 += '      ' + linktag
    rss91 += '      ' + desctag
    rss91 += '      <lastBuildDate>' + rfc822stamp + '</lastBuildDate>\n'
    rss91 += '      <language>en-us</language>\n'

    rss10 = xml
    rss10 += '<rdf:RDF\n'
    rss10 += '  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"\n'
    rss10 += '  xmlns:dc="http://purl.org/dc/elements/1.1/"\n'
    rss10 += '  xmlns="http://purl.org/rss/1.0/">\n'
    rss10 += '   <channel rdf:about="' + urlroot + '">\n'
    rss10 += '     ' + titletag
    rss10 += '     ' + linktag
    rss10 += '     ' + desctag
    rss10 += '     <dc:creator>Greenpeace</dc:creator>\n'
    rss10 += '     <dc:publisher>dehora FD85 1117 1888 1681 7689  B5DF E696 885C 20D8 21F8</dc:publisher>\n'
    rss10 += '     <dc:date>' + tstamp + '</dc:date>\n'
    rss10 += '     <items>\n'
    rss10 += '       <rdf:Seq>\n'

    # Typical links on the index page:
    #
    #     <A
    #      href="details?item%5fid=177484"
    #      target=_self>Seizing rainbows and stopping wars</A>
    #
    #     <A
    #      href="details?item%5fid=172591"
    #      target=_self>More<IMG class=imagelink height=12 alt=more
    #      src="Latest news_files/gpi_more.gif" width=12 align=middle
    #      border=0> </A>
    #
    # All this regex scraping should go once pyana is installed on the server.
    #
    # The link text is captured up to the first '/' rather than up to '</A>'
    # because closing on '</A>' might blow the re stack.  The capture
    # therefore ends with the '<' of the closing tag.  The quotes around
    # _self are optional so that both quoted and unquoted markup (as in the
    # sample above) are recognised.
    linkPattern = re.compile(
        r'href="(details\?item%5fid=[0-9]+)"\s*target="?_self"?\s*>(.*?)/',
        re.S)
    highlightPattern = re.compile(
        r'<\s*td\s*class="bodyhighlightbold"\s*>(.*?)</\s*td\s*>', re.S)

    rss91Items = []
    rss10Seq = []
    rss10Items = []
    for (link, linktext) in linkPattern.findall(fetch(url)):
        desc = escape(linktext).replace('\n', '').replace('\r', '')
        # Each story is linked a second time as "More<IMG ...>"; skip those.
        if 'more&lt;' in desc.lower():
            continue

        # Pull the details page for a long description made of the first
        # three highlight cells (or fewer, when the page has fewer).
        highlights = highlightPattern.findall(fetch(urlroot + link))
        longdesc = escape('; '.join(highlights[:3]))

        # Drop the escaped '<' left over from the truncated closing tag.
        desc = desc.replace('&lt;', '')

        itemurl = urlroot + link
        rss91Items.append(
            '\n<item>\n<title>%s</title>\n<link>%s</link>\n'
            '<description>%s</description>\n</item>\n'
            % (desc, itemurl, longdesc))
        rss10Seq.append('\n<rdf:li resource="%s" />\n' % itemurl)
        rss10Items.append(
            '\n<item rdf:about="%s">\n<title>%s</title><link>%s</link>\n'
            '<description>%s</description>\n</item>'
            % (itemurl, desc, itemurl, longdesc))

    rss91 += ''.join(rss91Items)
    rss91 += '  </channel>\n'
    rss91 += '</rss>\n'

    rss10 += ''.join(rss10Seq)
    rss10 += '       </rdf:Seq>\n'
    rss10 += '     </items>\n'
    # RSS 1.0 items are siblings of <channel>, so close the channel first.
    rss10 += '   </channel>\n'
    rss10 += ''.join(rss10Items)
    rss10 += '\n</rdf:RDF>\n'

    _writeIfChanged(file91, rss91)
    _writeIfChanged(file10, rss10)




# Build the feeds only when this file is executed as a script, not on import.
if __name__ == '__main__':
    makeRSS()
