From e2773dd1cd2621bdc2e0514ff66ea0a07aeea3e9 Mon Sep 17 00:00:00 2001 From: Arun Persaud Date: Mon, 15 Oct 2012 10:33:19 -0700 Subject: [PATCH 1/2] removed trailing whitespaces --- feedparser.py | 210 ++++++++++++++++++++-------------------- html2text.py | 96 +++++++++---------- readme.html | 8 +- rss2email.py | 240 +++++++++++++++++++++++----------------------- test_rss2email.py | 4 +- 5 files changed, 279 insertions(+), 279 deletions(-) diff --git a/feedparser.py b/feedparser.py index b9144a9..229e5d7 100644 --- a/feedparser.py +++ b/feedparser.py @@ -89,7 +89,7 @@ except (NameError, AttributeError): import string _maketrans = string.maketrans - + # base64 support for Atom feeds that contain embedded binary data try: import base64, binascii @@ -240,7 +240,7 @@ def search(self,string,index=0): if match is not None: # Returning a new object in the calling thread's context # resolves a thread-safety. - return EndBracketMatch(match) + return EndBracketMatch(match) return None class EndBracketMatch: def __init__(self, match): @@ -334,7 +334,7 @@ def setdefault(self, key, value): if not self.has_key(key): self[key] = value return self[key] - + def has_key(self, key): try: return hasattr(self, key) or UserDict.__contains__(self, key) @@ -343,7 +343,7 @@ def has_key(self, key): # This alias prevents the 2to3 tool from changing the semantics of the # __contains__ function below and exhausting the maximum recursion depth __has_key = has_key - + def __getattr__(self, key): try: return self.__dict__[key] @@ -398,7 +398,7 @@ def _ebcdic_to_ascii(s): _ebcdic_to_ascii_map = _maketrans( \ _l2bytes(range(256)), _l2bytes(emap)) return s.translate(_ebcdic_to_ascii_map) - + _cp1252 = { unichr(128): unichr(8364), # euro sign unichr(130): unichr(8218), # single low-9 quotation mark @@ -451,7 +451,7 @@ class _FeedParserMixin: 'http://purl.org/atom/ns#': '', 'http://www.w3.org/2005/Atom': '', 'http://purl.org/rss/1.0/modules/rss091#': '', - + 'http://webns.net/mvcb/': 'admin', 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', @@ -508,7 +508,7 @@ class _FeedParserMixin: can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] html_types = ['text/html', 'application/xhtml+xml'] - + def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): if _debug: sys.stderr.write('initializing FeedParser\n') if not self._matchnamespaces: @@ -554,7 +554,7 @@ def unknown_starttag(self, tag, attrs): # strict xml parsers do -- account for this difference if isinstance(self, _LooseFeedParser): attrs = [(k, v.replace('&', '&')) for k, v in attrs] - + # track xml:base and xml:lang attrsD = dict(attrs) baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri @@ -582,7 +582,7 @@ def unknown_starttag(self, tag, attrs): self.lang = lang self.basestack.append(self.baseuri) self.langstack.append(lang) - + # track namespaces for prefix, uri in attrs: if prefix.startswith('xmlns:'): @@ -620,7 +620,7 @@ def unknown_starttag(self, tag, attrs): self.intextinput = 0 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): self.inimage = 0 - + # call special handler (if defined) or default handler methodname = '_start_' + prefix + suffix try: @@ -754,7 +754,7 @@ def mapContentType(self, contentType): elif contentType == 
'xhtml': contentType = 'application/xhtml+xml' return contentType - + def trackNamespace(self, prefix, uri): loweruri = uri.lower() if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: @@ -775,7 +775,7 @@ def trackNamespace(self, prefix, uri): def resolveURI(self, uri): return _urljoin(self.baseuri or '', uri) - + def decodeEntities(self, element, data): return data @@ -788,7 +788,7 @@ def push(self, element, expectingText): def pop(self, element, stripWhitespace=1): if not self.elementstack: return if self.elementstack[-1][0] != element: return - + element, expectingText, pieces = self.elementstack.pop() if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': @@ -833,11 +833,11 @@ def pop(self, element, stripWhitespace=1): # In Python 3, base64 takes and outputs bytes, not str # This may not be the most correct way to accomplish this output = _base64decode(output.encode('utf-8')).decode('utf-8') - + # resolve relative URIs if (element in self.can_be_relative_uri) and output: output = self.resolveURI(output) - + # decode entities within embedded markup if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) @@ -860,7 +860,7 @@ def pop(self, element, stripWhitespace=1): if is_htmlish and RESOLVE_RELATIVE_URIS: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) - + # parse microformats # (must do this before sanitizing because some microformats # rely on elements that we sanitize) @@ -876,7 +876,7 @@ def pop(self, element, stripWhitespace=1): vcard = mfresults.get('vcard') if vcard: self._getContext()['vcard'] = vcard - + # sanitize embedded markup if is_htmlish and SANITIZE_HTML: if element in self.can_contain_dangerous_markup: @@ -906,7 +906,7 @@ def pop(self, element, stripWhitespace=1): if element == 'title' and self.hasTitle: return output - + # store output in appropriate place(s) if self.inentry and not self.insource: if element == 'content': @@ -962,7 +962,7 @@ def popContent(self, tag): self.incontent -= 1 self.contentparams.clear() return value - + # a number of elements in a number of RSS variants are nominally plain # text, but this is routinely ignored. This is an attempt to detect # the most common cases. 
As false positives often result in silent @@ -993,7 +993,7 @@ def _mapToStandardPrefix(self, name): prefix = self.namespacemap.get(prefix, prefix) name = prefix + ':' + suffix return name - + def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) @@ -1021,7 +1021,7 @@ def _itsAnHrefDamnIt(self, attrsD): pass attrsD['href'] = href return attrsD - + def _save(self, key, value, overwrite=False): context = self._getContext() if overwrite: @@ -1046,7 +1046,7 @@ def _start_rss(self, attrsD): self.version = 'rss20' else: self.version = 'rss' - + def _start_dlhottitles(self, attrsD): self.version = 'hotrss' @@ -1064,7 +1064,7 @@ def _cdf_common(self, attrsD): self._start_link({}) self.elementstack[-1][-1] = attrsD['href'] self._end_link() - + def _start_feed(self, attrsD): self.infeed = 1 versionmap = {'0.1': 'atom01', @@ -1081,7 +1081,7 @@ def _start_feed(self, attrsD): def _end_channel(self): self.infeed = 0 _end_feed = _end_channel - + def _start_image(self, attrsD): context = self._getContext() if not self.inentry: @@ -1089,7 +1089,7 @@ def _start_image(self, attrsD): self.inimage = 1 self.hasTitle = 0 self.push('image', 0) - + def _end_image(self): self.pop('image') self.inimage = 0 @@ -1101,7 +1101,7 @@ def _start_textinput(self, attrsD): self.hasTitle = 0 self.push('textinput', 0) _start_textInput = _start_textinput - + def _end_textinput(self): self.pop('textinput') self.intextinput = 0 @@ -1301,7 +1301,7 @@ def _end_subtitle(self): self.popContent('subtitle') _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - + def _start_rights(self, attrsD): self.pushContent('rights', attrsD, 'text/plain', 1) _start_dc_rights = _start_rights @@ -1399,7 +1399,7 @@ def _start_cc_license(self, attrsD): attrsD['rel']='license' if value: attrsD['href']=value context.setdefault('links', []).append(attrsD) - + def _start_creativecommons_license(self, attrsD): self.push('license', 1) _start_creativeCommons_license = _start_creativecommons_license @@ -1420,7 +1420,7 @@ def _addXFN(self, relationships, href, name): value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) if value not in xfn: xfn.append(value) - + def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) @@ -1438,7 +1438,7 @@ def _start_category(self, attrsD): self.push('category', 1) _start_dc_subject = _start_category _start_keywords = _start_category - + def _start_media_category(self, attrsD): attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') self._start_category(attrsD) @@ -1446,11 +1446,11 @@ def _start_media_category(self, attrsD): def _end_itunes_keywords(self): for term in self.pop('itunes_keywords').split(): self._addTag(term, 'http://www.itunes.com/', None) - + def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) self.push('category', 1) - + def _end_category(self): value = self.pop('category') if not value: return @@ -1467,7 +1467,7 @@ def _end_category(self): def _start_cloud(self, attrsD): self._getContext()['cloud'] = FeedParserDict(attrsD) - + def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') if attrsD['rel'] == 'self': @@ -1568,7 +1568,7 @@ def _end_generator(self): context = self._getContext() if context.has_key('generator_detail'): context['generator_detail']['name'] = value - + def _start_admin_generatoragent(self, attrsD): self.push('generator', 1) value = self._getAttribute(attrsD, 'rdf:resource') @@ 
-1583,7 +1583,7 @@ def _start_admin_errorreportsto(self, attrsD): if value: self.elementstack[-1][2].append(value) self.pop('errorreportsto') - + def _start_summary(self, attrsD): context = self._getContext() if context.has_key('summary'): @@ -1601,13 +1601,13 @@ def _end_summary(self): self.popContent(self._summaryKey or 'summary') self._summaryKey = None _end_itunes_summary = _end_summary - + def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) context = self._getContext() attrsD['rel']='enclosure' context.setdefault('links', []).append(FeedParserDict(attrsD)) - + def _start_source(self, attrsD): if 'url' in attrsD: # This means that we're processing a source element from an RSS 2.0 feed @@ -1659,7 +1659,7 @@ def _start_itunes_image(self, attrsD): if attrsD.get('href'): self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) _start_itunes_link = _start_itunes_image - + def _end_itunes_block(self): value = self.pop('itunes_block', 0) self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 @@ -1718,12 +1718,12 @@ def __init__(self, baseuri, baselang, encoding): self.bozo = 0 self.exc = None self.decls = {} - + def startPrefixMapping(self, prefix, uri): self.trackNamespace(prefix, uri) if uri == 'http://www.w3.org/1999/xlink': self.decls['xmlns:'+prefix] = uri - + def startElementNS(self, name, qname, attrs): namespace, localname = name lowernamespace = str(namespace or '').lower() @@ -1805,7 +1805,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): special = re.compile('''[<>'"]''') bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") elements_no_end_tag = [ - 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', + 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' ] @@ -1837,7 +1837,7 @@ def parse_starttag(self,i): def feed(self, data): data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace - data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) + data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') try: @@ -1910,7 +1910,7 @@ def handle_charref(self, ref): self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) else: self.pieces.append('&#%(ref)s;' % locals()) - + def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. @@ -1925,12 +1925,12 @@ def handle_data(self, text): # Store the original text verbatim. if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text) self.pieces.append(text) - + def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. self.pieces.append('' % locals()) - + def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. 
@@ -1942,7 +1942,7 @@ def handle_decl(self, text): # "http://www.w3.org/TR/html4/loose.dtd"> # Reconstruct original DOCTYPE self.pieces.append('' % locals()) - + _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata @@ -2006,7 +2006,7 @@ def decodeEntities(self, element, data): data = data.replace('"', '"') data = data.replace(''', "'") return data - + def strattrs(self, attrs): return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) @@ -2030,12 +2030,12 @@ def __init__(self, data, baseuri, encoding): self.enclosures = [] self.xfn = [] self.vcard = None - + def vcardEscape(self, s): if type(s) in (type(''), type(u'')): s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') return s - + def vcardFold(self, s): s = re.sub(';+$', '', s) sFolded = '' @@ -2051,14 +2051,14 @@ def vcardFold(self, s): def normalize(self, s): return re.sub(r'\s+', ' ', s).strip() - + def unique(self, aList): results = [] for element in aList: if element not in results: results.append(element) return results - + def toISO8601(self, dt): return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) @@ -2148,21 +2148,21 @@ def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0 def findVCards(self, elmRoot, bAgentParsing=0): sVCards = '' - + if not bAgentParsing: arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) else: arCards = [elmRoot] - + for elmCard in arCards: arLines = [] - + def processSingleString(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) return sValue or u'' - + def processSingleURI(sProperty): sValue = self.getPropertyValue(elmCard, sProperty, self.URI) if sValue: @@ -2185,7 +2185,7 @@ def processSingleURI(sProperty): if sContentType: sContentType = ';TYPE=' + sContentType.upper() arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) - + def processTypeValue(sProperty, arDefaultType, arForceType=None): arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) for elmResult in arResults: @@ -2197,7 +2197,7 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) if sValue: arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) - + # AGENT # must do this before all other properties because it is destructive # (removes nested class="vcard" nodes so they don't interfere with @@ -2216,10 +2216,10 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); if sAgentValue: arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) - + # FN (full name) sFN = processSingleString('fn') - + # N (name) elmName = self.getPropertyValue(elmCard, 'n') if elmName: @@ -2228,7 +2228,7 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + 
self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) - arLines.append(self.vcardFold('N:' + sFamilyName + ';' + + arLines.append(self.vcardFold('N:' + sFamilyName + ';' + sGivenName + ';' + ','.join(arAdditionalNames) + ';' + ','.join(arHonorificPrefixes) + ';' + @@ -2245,25 +2245,25 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) else: arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) - + # SORT-STRING sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) if sSortString: arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) - + # NICKNAME arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) if arNickname: arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) - + # PHOTO processSingleURI('photo') - + # BDAY dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) if dtBday: arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) - + # ADR (address) arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) for elmAdr in arAdr: @@ -2285,38 +2285,38 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): sRegion + ';' + sPostalCode + ';' + sCountryName)) - + # LABEL processTypeValue('label', ['intl','postal','parcel','work']) - + # TEL (phone number) processTypeValue('tel', ['voice']) - + # EMAIL processTypeValue('email', ['internet'], ['internet']) - + # MAILER processSingleString('mailer') - + # TZ (timezone) processSingleString('tz') - + # GEO (geographical information) elmGeo = self.getPropertyValue(elmCard, 'geo') if elmGeo: sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) - + # TITLE processSingleString('title') - + # ROLE processSingleString('role') # LOGO processSingleURI('logo') - + # ORG (organization) elmOrg = self.getPropertyValue(elmCard, 'org') if elmOrg: @@ -2330,39 +2330,39 @@ def processTypeValue(sProperty, arDefaultType, arForceType=None): else: arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) - + # CATEGORY arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) if arCategory: arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) - + # NOTE processSingleString('note') - + # REV processSingleString('rev') - + # SOUND processSingleURI('sound') - + # UID processSingleString('uid') - + # URL processSingleURI('url') - + # CLASS processSingleString('class') - + # KEY processSingleURI('key') - + if arLines: arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] sVCards += u'\n'.join(arLines) + u'\n' - + return sVCards.strip() - + def isProbablyDownloadable(self, elm): attrsD = elm.attrMap if not attrsD.has_key('href'): return 0 @@ -2461,7 +2461,7 @@ def __init__(self, baseuri, encoding, _type): def resolveURI(self, uri): return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) - + def unknown_starttag(self, tag, attrs): if _debug: sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs))) @@ -2575,7 +2575,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): # svgtiny - foreignObject + linearGradient + radialGradient + 
stop svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', - 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', + 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] @@ -2621,7 +2621,7 @@ def reset(self): self.unacceptablestack = 0 self.mathmlOK = 0 self.svgOK = 0 - + def unknown_starttag(self, tag, attrs): acceptable_attributes = self.acceptable_attributes keymap = {} @@ -2683,7 +2683,7 @@ def unknown_starttag(self, tag, attrs): clean_value = self.sanitize_style(value) if clean_value: clean_attrs.append((key,clean_value)) _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) - + def unknown_endtag(self, tag): if not tag in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: @@ -2815,7 +2815,7 @@ def http_error_301(self, req, fp, code, msg, headers): http_error_300 = http_error_302 http_error_303 = http_error_302 http_error_307 = http_error_302 - + def http_error_401(self, req, fp, code, msg, headers): # Check if # - server requires digest auth, AND @@ -2914,7 +2914,7 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h return opener.open(request) finally: opener.close() # JohnD - + # try to open with native open function (if url_file_stream_or_string is a filename) try: return open(url_file_stream_or_string, 'rb') @@ -2966,7 +2966,7 @@ def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_h def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' _date_handlers.insert(0, func) - + # ISO-8601 date parsing routines written by Fazal Majid. # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 # parser is beyond the scope of feedparser and would be a worthwhile addition @@ -2977,7 +2977,7 @@ def registerDateHandler(func): # Please note the order in templates is significant because we need a # greedy match. _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO', - 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', + 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', '-YY-?MM', '-OOO', '-YY', '--MM-?DD', '--MM', '---DD', @@ -3079,7 +3079,7 @@ def _parse_date_iso8601(dateString): # Many implementations have bugs, but we'll pretend they don't. return time.localtime(time.mktime(tuple(tm))) registerDateHandler(_parse_date_iso8601) - + # 8-bit date handling routines written by ytrewq1. _korean_year = u'\ub144' # b3e2 in euc-kr _korean_month = u'\uc6d4' # bff9 in euc-kr @@ -3170,7 +3170,7 @@ def _parse_date_mssql(dateString): u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 - u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 + u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 } _greek_date_format_re = \ @@ -3360,7 +3360,7 @@ def _parse_date_rfc822(dateString): # 'ET' is equivalent to 'EST', etc. 
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800} rfc822._timezones.update(_additional_timezones) -registerDateHandler(_parse_date_rfc822) +registerDateHandler(_parse_date_rfc822) def _parse_date_perforce(aDateString): """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" @@ -3398,7 +3398,7 @@ def _getCharacterEncoding(http_headers, xml_data): http_headers is a dictionary xml_data is a raw string (not Unicode) - + This is so much trickier than it sounds, it's not even funny. According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type is application/xml, application/*+xml, @@ -3417,12 +3417,12 @@ def _getCharacterEncoding(http_headers, xml_data): served with a Content-Type of text/* and no charset parameter must be treated as us-ascii. (We now do this.) And also that it must always be flagged as non-well-formed. (We now do this too.) - + If Content-Type is unspecified (input was local file or non-HTTP source) or unrecognized (server just got it totally wrong), then go by the encoding given in the XML prefix of the document and default to 'iso-8859-1' as per the HTTP specification (RFC 2616). - + Then, assuming we didn't find a character encoding in the HTTP headers (and the HTTP Content-type allowed us to look in the body), we need to sniff the first few bytes of the XML data and try to determine @@ -3532,7 +3532,7 @@ def _parseHTTPContentType(content_type): if true_encoding.lower() == 'gb2312': true_encoding = 'gb18030' return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type - + def _toUTF8(data, encoding): '''Changes an XML data stream on the fly to specify a new encoding @@ -3595,7 +3595,7 @@ def _stripDoctype(data): start = re.search(_s2bytes('<\w'), data) start = start and start.start() or -1 head,data = data[:start+1], data[start+1:] - + entity_pattern = re.compile(_s2bytes(r'^\s*]*?)>'), re.MULTILINE) entity_results=entity_pattern.findall(head) head = entity_pattern.sub(_s2bytes(''), head) @@ -3617,10 +3617,10 @@ def _stripDoctype(data): data = doctype_pattern.sub(replacement, head) + data return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)]) - + def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}): '''Parse a feed from a URL, file, stream, or string. - + request_headers, if given, is a dict from http header name to value to add to the request; this overrides internally generated values. 
''' @@ -3861,7 +3861,7 @@ def _writer(self, stream, node, prefix): stream.write('\n') except: pass - + class PprintSerializer(Serializer): def write(self, stream=sys.stdout): if self.results.has_key('href'): @@ -3869,7 +3869,7 @@ def write(self, stream=sys.stdout): from pprint import pprint pprint(self.results, stream) stream.write('\n') - + if __name__ == '__main__': try: from optparse import OptionParser diff --git a/html2text.py b/html2text.py index 0ed4cec..97caa8c 100644 --- a/html2text.py +++ b/html2text.py @@ -59,13 +59,13 @@ def name2cp(k): if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 return ord(codecs.latin_1_decode(k)[0]) -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', +unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', 'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', +'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', +'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', +'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} unifiable_n = {} @@ -78,7 +78,7 @@ def charref(name): c = int(name[1:], 16) else: c = int(name) - + if not UNICODE_SNOB and c in unifiable_n.keys(): return unifiable_n[c] else: @@ -101,7 +101,7 @@ def entityref(c): def replaceEntities(s): s = s.group(1) - if s[0] == "#": + if s[0] == "#": return charref(s[1:]) else: return entityref(s) @@ -122,7 +122,7 @@ def optwrap(text): """Wrap all paragraphs in the provided text.""" if not BODY_WIDTH: return text - + assert wrap, "Requires Python 2.3." 
result = '' newlines = 0 @@ -153,7 +153,7 @@ def hn(tag): class _html2text(HTMLParser.HTMLParser): def __init__(self, out=None, baseurl=''): HTMLParser.HTMLParser.__init__(self) - + if out is None: self.out = self.outtextf else: self.out = out try: @@ -177,43 +177,43 @@ def __init__(self, out=None, baseurl=''): self.abbr_data = None # last inner HTML (for abbr being defined) self.abbr_list = {} # stack of abbreviations to write later self.baseurl = baseurl - - def outtextf(self, s): + + def outtextf(self, s): self.outtext += s - + def close(self): HTMLParser.HTMLParser.close(self) - + self.pbr() self.o('', 0, 'end') - + return self.outtext - + def handle_charref(self, c): self.o(charref(c)) def handle_entityref(self, c): self.o(entityref(c)) - + def handle_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) - + def handle_endtag(self, tag): self.handle_tag(tag, None, 0) - + def previousIndex(self, attrs): """ returns the index of certain set of attributes (of a link) in the self.a list - + If the set of attributes is not found, returns None """ if not has_key(attrs, 'href'): return None - + i = -1 for a in self.a: i += 1 match = 0 - + if has_key(a, 'href') and a['href'] == attrs['href']: if has_key(a, 'title') or has_key(attrs, 'title'): if (has_key(a, 'title') and has_key(attrs, 'title') and @@ -226,13 +226,13 @@ def previousIndex(self, attrs): def handle_tag(self, tag, attrs, start): #attrs = fixattrs(attrs) - + if hn(tag): self.p() if start: self.o(hn(tag)*"#" + ' ') if tag in ['p', 'div']: self.p() - + if tag == "br" and start: self.o(" \n") if tag == "hr" and start: @@ -240,21 +240,21 @@ def handle_tag(self, tag, attrs, start): self.o("* * *") self.p() - if tag in ["head", "style", 'script']: + if tag in ["head", "style", 'script']: if start: self.quiet += 1 else: self.quiet -= 1 if tag in ["body"]: self.quiet = 0 # sites like 9rules.com never close - + if tag == "blockquote": - if start: + if start: self.p(); self.o('> ', 0, 1); self.start = 1 self.blockquote += 1 else: self.blockquote -= 1 self.p() - + if tag in ['em', 'i', 'u']: self.o("_") if tag in ['strong', 'b']: self.o("**") if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` @@ -263,7 +263,7 @@ def handle_tag(self, tag, attrs, start): attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD - + self.abbr_title = None self.abbr_data = '' if has_key(attrs, 'title'): @@ -273,13 +273,13 @@ def handle_tag(self, tag, attrs, start): self.abbr_list[self.abbr_data] = self.abbr_title self.abbr_title = None self.abbr_data = '' - + if tag == "a": if start: attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD - if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): + if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): self.astack.append(attrs) self.o("[") else: @@ -297,7 +297,7 @@ def handle_tag(self, tag, attrs, start): a['outcount'] = self.outcount self.a.append(a) self.o("][" + str(a['count']) + "]") - + if tag == "img" and start: attrsD = {} for (x, y) in attrs: attrsD[x] = y @@ -316,20 +316,20 @@ def handle_tag(self, tag, attrs, start): self.o("![") self.o(alt) self.o("]["+ str(attrs['count']) +"]") - + if tag == 'dl' and start: self.p() if tag == 'dt' and not start: self.pbr() if tag == 'dd' and start: self.o(' ') if tag == 'dd' and not start: self.pbr() - + if tag in ["ol", "ul"]: if start: self.list.append({'name':tag, 'num':0}) else: if self.list: self.list.pop() - + self.p() - + if tag == 'li': if start: self.pbr() @@ 
-343,10 +343,10 @@ def handle_tag(self, tag, attrs, start): self.start = 1 else: self.pbr() - + if tag in ["table", "tr"] and start: self.p() if tag == 'td': self.pbr() - + if tag == "pre": if start: self.startpre = 1 @@ -354,34 +354,34 @@ def handle_tag(self, tag, attrs, start): else: self.pre = 0 self.p() - + def pbr(self): if self.p_p == 0: self.p_p = 1 def p(self): self.p_p = 2 - + def o(self, data, puredata=0, force=0): if self.abbr_data is not None: self.abbr_data += data - - if not self.quiet: + + if not self.quiet: if puredata and not self.pre: data = re.sub('\s+', ' ', data) if data and data[0] == ' ': self.space = 1 data = data[1:] if not data and not force: return - + if self.startpre: #self.out(" :") #TODO: not output when already one there self.startpre = 0 - + bq = (">" * self.blockquote) if not (force and data and data[0] == ">") and self.blockquote: bq += " " - + if self.pre: bq += " " data = data.replace("\n", "\n"+bq) - + if self.start: self.space = 0 self.p_p = 0 @@ -397,7 +397,7 @@ def o(self, data, puredata=0, force=0): if self.p_p: self.out(('\n'+bq)*self.p_p) self.space = 0 - + if self.space: if not self.lastWasNL: self.out(' ') self.space = 0 @@ -408,7 +408,7 @@ def o(self, data, puredata=0, force=0): newa = [] for link in self.a: if self.outcount > link['outcount']: - self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) + self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) if has_key(link, 'title'): self.out(" ("+link['title']+")") self.out("\n") else: @@ -417,7 +417,7 @@ def o(self, data, puredata=0, force=0): if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. self.a = newa - + if self.abbr_list and force == "end": for abbr, definition in self.abbr_list.items(): self.out(" *[" + abbr + "]: " + definition + "\n") @@ -430,7 +430,7 @@ def o(self, data, puredata=0, force=0): def handle_data(self, data): if r'\/script>' in data: self.quiet -= 1 self.o(data, 1) - + def unknown_decl(self, data): pass def wrapwrite(text): sys.stdout.write(text) diff --git a/readme.html b/readme.html index 224fbbe..e00a253 100644 --- a/readme.html +++ b/readme.html @@ -23,7 +23,7 @@

Download

  • Create a new folder
  • Download the latest rss2email .ZIP file and unzip to the new folder
-
+

    Configure

    Edit the config.py file and fill in your outgoing email server's details. If your server requires you to log in, change "AUTHREQUIRED = 0" to "AUTHREQUIRED = 1" and enter your email username and password.

    @@ -157,7 +157,7 @@

    Customize rss2email

    DATE_HEADER = 1

    - + - - \ No newline at end of file + + diff --git a/rss2email.py b/rss2email.py index 9fd2426..9d08eaa 100644 --- a/rss2email.py +++ b/rss2email.py @@ -18,9 +18,9 @@ __version__ = "2.72" __author__ = "Lindsey Smith (lindsey@allthingsrss.com)" __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3." -___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", - "Matej Cepl", "Martin 'Joey' Schulze", - "Marcel Ackermann (http://www.DreamFlasher.de)", +___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess", + "Matej Cepl", "Martin 'Joey' Schulze", + "Marcel Ackermann (http://www.DreamFlasher.de)", "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ] import urllib2 @@ -50,17 +50,17 @@ # A tuple consisting of some combination of # ('issued', 'created', 'modified', 'expired') -# expressing ordered list of preference in dates +# expressing ordered list of preference in dates # to use for the Date header of the email. DATE_HEADER_ORDER = ('modified', 'issued', 'created') # 1: Apply Q-P conversion (required for some MUAs). # 0: Send message in 8-bits. # http://cr.yp.to/smtp/8bitmime.html -#DEPRECATED +#DEPRECATED QP_REQUIRED = 0 -#DEPRECATED - +#DEPRECATED + # 1: Name feeds as they're being processed. # 0: Keep quiet. VERBOSE = 0 @@ -114,20 +114,20 @@ from email.MIMEText import MIMEText from email.Header import Header from email.Utils import parseaddr, formataddr - + # Note: You can also override the send function. def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None): """Send an email. - + All arguments should be Unicode strings (plain ASCII works as well). - + Only the real name part of sender and recipient addresses may contain non-ASCII characters. - + The email will be properly MIME encoded and delivered though SMTP to localhost port 25. This is easy to change if you want something different. - + The charset of the email will be the first one out of the list that can represent all the characters occurring in the email. """ @@ -135,7 +135,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps # Header class is smart enough to try US-ASCII, then the charset we # provide, then fall back to UTF-8. header_charset = 'ISO-8859-1' - + # We must choose the body charset manually for body_charset in CHARSET_LIST: try: @@ -148,16 +148,16 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps # Split real name (which is optional) and email address parts sender_name, sender_addr = parseaddr(sender) recipient_name, recipient_addr = parseaddr(recipient) - + # We must always pass Unicode strings to Header, otherwise it will # use RFC 2047 encoding even on plain ASCII strings. 
sender_name = str(Header(unicode(sender_name), header_charset)) recipient_name = str(Header(unicode(recipient_name), header_charset)) - + # Make sure email addresses do not contain non-ASCII characters sender_addr = sender_addr.encode('ascii') recipient_addr = recipient_addr.encode('ascii') - + # Create the message ('plain' stands for Content-Type: text/plain) msg = MIMEText(body.encode(body_charset), contenttype, body_charset) msg['To'] = formataddr((recipient_name, recipient_addr)) @@ -167,7 +167,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps msg[hdr] = Header(unicode(extraheaders[hdr], header_charset)) except: msg[hdr] = Header(extraheaders[hdr]) - + fromhdr = formataddr((sender_name, sender_addr)) msg['From'] = fromhdr @@ -178,9 +178,9 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps #DEPRECATED msg_as_string = outs.getvalue() if SMTP_SEND: - if not smtpserver: + if not smtpserver: import smtplib - + try: if SMTP_SSL: smtpserver = smtplib.SMTP_SSL() @@ -196,7 +196,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps if hasattr(e, 'reason'): print >>warn, "Reason:", e.reason sys.exit(1) - + if AUTHREQUIRED: try: smtpserver.ehlo() @@ -212,7 +212,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps if hasattr(e, 'reason'): print >>warn, "Reason:", e.reason sys.exit(1) - + smtpserver.sendmail(sender, recipient, msg_as_string) return smtpserver @@ -262,7 +262,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps pass warn = sys.stderr - + if QP_REQUIRED: print >>warn, "QP_REQUIRED has been deprecated in rss2email." @@ -280,18 +280,18 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps unix = 0 try: import fcntl -# A pox on SunOS file locking methods - if (sys.platform.find('sunos') == -1): +# A pox on SunOS file locking methods + if (sys.platform.find('sunos') == -1): unix = 1 except: pass - + import socket; socket_errors = [] for e in ['error', 'gaierror']: if hasattr(socket, e): socket_errors.append(getattr(socket, e)) -#DEPRECATED import mimify -#DEPRECATED from StringIO import StringIO as SIO +#DEPRECATED import mimify +#DEPRECATED from StringIO import StringIO as SIO #DEPRECATED mimify.CHARSET = 'utf-8' import feedparser @@ -325,13 +325,13 @@ def __init__(self): threading.Thread.__init__(self) self.result = None self.error = None - + def run(self): try: self.result = function(*args, **kw) except: self.error = sys.exc_info() - + c = Calculator() c.setDaemon(True) # don't hold up exiting c.start() @@ -343,7 +343,7 @@ def run(self): return c.result return internal2 # return internal - + def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u'')) def ishtml(t): return type(t) is type(()) @@ -357,10 +357,10 @@ def unu(s): # I / freakin' hate / that unicode def getContent(entry, HTMLOK=0): """Select the best content from an entry, deHTMLizing if necessary. If raw HTML is best, an ('HTML', best) tuple is returned. """ - + # How this works: - # * We have a bunch of potential contents. - # * We go thru looking for our first choice. + # * We have a bunch of potential contents. + # * We go thru looking for our first choice. # (HTML or text, depending on HTMLOK) # * If that doesn't work, we go thru looking for our second choice. # * If that still doesn't work, we just take the first one. 
@@ -369,35 +369,35 @@ def getContent(entry, HTMLOK=0): # * Instead of just taking the first one # pick the one in the "best" language. # * HACK: hardcoded HTMLOK, should take a tuple of media types - + conts = entry.get('content', []) - + if entry.get('summary_detail', {}): conts += [entry.summary_detail] - + if conts: if HTMLOK: for c in conts: if contains(c.type, 'html'): return ('HTML', c.value) - + if not HTMLOK: # Only need to convert to text if HTML isn't OK for c in conts: if contains(c.type, 'html'): cleanerhtml = BeautifulSoup.BeautifulSoup(c.value) return html2text(unicode(cleanerhtml)) - + for c in conts: if c.type == 'text/plain': return c.value - - return conts[0].value - + + return conts[0].value + return "" def getID(entry): """Get best ID from an entry. NEEDS UNIT TESTS""" if TRUST_GUID: - if 'id' in entry and entry.id: + if 'id' in entry and entry.id: # Newer versions of feedparser could return a dictionary if type(entry.id) is DictType: return entry.id.values()[0] @@ -418,7 +418,7 @@ def getName(fullfeed, entry): feedinfo = fullfeed.feed if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys(): return OVERRIDE_FROM[fullfeed.url] - + name = feedinfo.get('title', '') if 'name' in entry.get('author_detail', []): # normally {} but py2.1 @@ -434,7 +434,7 @@ def getName(fullfeed, entry): if feedinfo.author_detail.name: if name: name += ", " name += feedinfo.author_detail.name - + return name def validateEmail(email, planb): @@ -443,31 +443,31 @@ def validateEmail(email, planb): if (len(email_parts) != 2) or not email_parts[0] or not email_parts[1]: return planb return email - + def getEmail(r, entry): """Get the best email_address. If the best guess isn't well-formed (something@somthing.com), use DEFAULT_FROM instead. NEEDS UNIT TESTS""" - + feed = r.feed - + if FORCE_FROM: return DEFAULT_FROM - + if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys(): return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM) - + if 'email' in entry.get('author_detail', []): return validateEmail(entry.author_detail.email, DEFAULT_FROM) - + if 'email' in feed.get('author_detail', []): return validateEmail(feed.author_detail.email, DEFAULT_FROM) - + if USE_PUBLISHER_EMAIL: if 'email' in feed.get('publisher_detail', []): return validateEmail(feed.publisher_detail.email, DEFAULT_FROM) - + if feed.get("errorreportsto", ''): return validateEmail(feed.errorreportsto, DEFAULT_FROM) - + if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys(): return DEFAULT_EMAIL[r.url] return DEFAULT_FROM @@ -485,7 +485,7 @@ def getTags(entry): tagline = ",".join(taglist) return tagline - + ### Simple Database of Feeds ### @@ -493,7 +493,7 @@ class Feed: def __init__(self, url, to): self.url, self.etag, self.modified, self.seen = url, None, None, {} self.active = True - self.to = to + self.to = to def load(lock=1): if not os.path.exists(feedfile): @@ -506,7 +506,7 @@ def load(lock=1): print "Feedfile could not be opened: %s" % e sys.exit(1) feeds = pickle.load(feedfileObject) - + if lock: locktype = 0 if unix: @@ -515,19 +515,19 @@ def load(lock=1): #HACK: to deal with lock caching feedfileObject = open(feedfile, 'r') feeds = pickle.load(feedfileObject) - if unix: + if unix: fcntl.flock(feedfileObject.fileno(), locktype) - if feeds: + if feeds: for feed in feeds[1:]: - if not hasattr(feed, 'active'): + if not hasattr(feed, 'active'): feed.active = True - + return feeds, feedfileObject def unlock(feeds, feedfileObject): - if not unix: + if not unix: pickle.dump(feeds, open(feedfile, 'w')) - else: + else: fd 
= open(feedfile+'.tmp', 'w') pickle.dump(feeds, fd) fd.flush() @@ -536,15 +536,15 @@ def unlock(feeds, feedfileObject): os.rename(feedfile+'.tmp', feedfile) fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN) -#@timelimit(FEED_TIMEOUT) +#@timelimit(FEED_TIMEOUT) def parse(url, etag, modified): if PROXY == '': return feedparser.parse(url, etag, modified) else: proxy = urllib2.ProxyHandler( {"http":PROXY} ) - return feedparser.parse(url, etag, modified, handlers = [proxy]) - - + return feedparser.parse(url, etag, modified, handlers = [proxy]) + + ### Program Functions ### def add(*args): @@ -552,7 +552,7 @@ def add(*args): urls, to = [args[0]], args[1] else: urls, to = args, None - + feeds, feedfileObject = load() if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None): print "No email address has been defined. Please run 'r2e email emailaddress' or" @@ -568,17 +568,17 @@ def run(num=None): # We store the default to address as the first item in the feeds list. # Here we take it out and save it for later. default_to = "" - if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] + if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] else: ifeeds = feeds - + if num: ifeeds = [feeds[num]] feednum = 0 - + for f in ifeeds: - try: + try: feednum += 1 if not f.active: continue - + if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url) r = {} try: @@ -586,7 +586,7 @@ def run(num=None): except TimeoutError: print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url) continue - + # Handle various status conditions, as required if 'status' in r: if r.status == 301: f.url = r['url'] @@ -594,15 +594,15 @@ def run(num=None): print >>warn, "W: feed gone; deleting", f.url feeds.remove(f) continue - + http_status = r.get('status', 200) if VERBOSE > 1: print >>warn, "I: http status", http_status http_headers = r.get('headers', { - 'content-type': 'application/rss+xml', + 'content-type': 'application/rss+xml', 'content-length':'1'}) exc_type = r.get("bozo_exception", Exception()).__class__ if http_status != 304 and not r.entries and not r.get('version', ''): - if http_status not in [200, 302]: + if http_status not in [200, 302]: print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url) elif contains(http_headers.get('content-type', 'rss'), 'html'): @@ -613,13 +613,13 @@ def run(num=None): elif hasattr(socket, 'timeout') and exc_type == socket.timeout: print >>warn, "W: timed out on [%d] %s" % (feednum, f.url) - + elif exc_type == IOError: print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url) - + elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error: print >>warn, "W: broken compression [%d] %s" % (feednum, f.url) - + elif exc_type in socket_errors: exc_reason = r.bozo_exception.args[1] print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url) @@ -630,13 +630,13 @@ def run(num=None): else: exc_reason = r.bozo_exception.reason print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url) - + elif exc_type == AttributeError: print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url) - + elif exc_type == KeyboardInterrupt: raise r.bozo_exception - + elif r.bozo: print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process")) @@ -652,25 +652,25 @@ def run(num=None): print >>warn, "Python", sys.version print >>warn, "=== END HERE ===" continue - + r.entries.reverse() - + for entry in r.entries: id = getID(entry) - + # If TRUST_GUID isn't 
set, we get back hashes of the content. # Instead of letting these run wild, we put them in context # by associating them with the actual ID (if it exists). - + frameid = entry.get('id') if not(frameid): frameid = id if type(frameid) is DictType: frameid = frameid.values()[0] - + # If this item's ID is in our database # then it's already been sent # and we don't need to do anything more. - + if frameid in f.seen: if f.seen[frameid] == id: continue @@ -678,7 +678,7 @@ def run(num=None): print "No default email address defined. Please run 'r2e email emailaddress'" print "Ignoring feed %s" % f.url break - + if 'title_detail' in entry and entry.title_detail: title = entry.title_detail.value if contains(entry.title_detail.type, 'html'): @@ -687,25 +687,25 @@ def run(num=None): title = getContent(entry)[:70] title = title.replace("\n", " ").strip() - + datetime = time.gmtime() if DATE_HEADER: for datetype in DATE_HEADER_ORDER: kind = datetype+"_parsed" if kind in entry and entry[kind]: datetime = entry[kind] - + link = entry.get('link', "") - + from_addr = getEmail(r, entry) - + name = h2t.unescape(getName(r, entry)) fromhdr = formataddr((name, from_addr,)) tohdr = (f.to or default_to) subjecthdr = title datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime) useragenthdr = "rss2email" - + # Add post tags, if available tagline = getTags(entry) @@ -716,14 +716,14 @@ def run(num=None): if pos > 0: extraheaders[hdr[:pos]] = hdr[pos+1:].strip() else: - print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER - + print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER + entrycontent = getContent(entry, HTMLOK=HTML_MAIL) contenttype = 'plain' content = '' if USE_CSS_STYLING and HTML_MAIL: contenttype = 'html' - content = "\n" + content = "\n" content += '\n' content += '\n' content += '
    \n' @@ -734,7 +734,7 @@ def run(num=None): body = entrycontent[1].strip() else: body = entrycontent.strip() - if body != '': + if body != '': content += '
    \n' + body + '
    \n' content += '\n
    \n' content += "\n\n" - else: + else: if ishtml(entrycontent): contenttype = 'html' - content = "\n" - content = ("\n\n" + + content = "\n" + content = ("\n\n" + '

    '+subjecthdr+'

    \n\n' + entrycontent[1].strip() + # drop type tag (HACK: bad abstraction) '

    URL: '+link+'

    ' ) - + if hasattr(entry,'enclosures'): for enclosure in entry.enclosures: if enclosure.url != "": @@ -771,7 +771,7 @@ def run(num=None): for extralink in entry.links: if ('rel' in extralink) and extralink['rel'] == u'via': content += 'Via: '+extralink['title']+'
    \n' - + content += ("\n") else: content = entrycontent.strip() + "\n\nURL: "+link @@ -785,9 +785,9 @@ def run(num=None): content += 'Via: '+extralink['title']+'\n' smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver) - + f.seen[frameid] = id - + f.etag, f.modified = r.get('etag', None), r.get('modified', None) except (KeyboardInterrupt, SystemExit): raise @@ -804,7 +804,7 @@ def run(num=None): print >>warn, "=== END HERE ===" continue - finally: + finally: unlock(feeds, feedfileObject) if smtpserver: smtpserver.quit() @@ -812,7 +812,7 @@ def run(num=None): def list(): feeds, feedfileObject = load(lock=0) default_to = "" - + if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]; i=1 print "default email:", default_to @@ -827,7 +827,7 @@ def list(): def opmlexport(): import xml.sax.saxutils feeds, feedfileObject = load(lock=0) - + if feeds: print '\n\n\nrss2email OPML export\n\n' for f in feeds[1:]: @@ -854,15 +854,15 @@ def opmlimport(importfile): sys.exit(1) feeds, feedfileObject = load(lock=1) - + import xml.sax.saxutils - + for f in newfeeds: if f.hasAttribute('xmlUrl'): feedurl = f.getAttribute('xmlUrl') print 'Adding %s' % xml.sax.saxutils.unescape(feedurl) feeds.append(Feed(feedurl, None)) - + unlock(feeds, feedfileObject) def delete(n): @@ -877,7 +877,7 @@ def delete(n): if n != len(feeds): print >>warn, "W: feed IDs have changed, list before deleting again" unlock(feeds, feedfileObject) - + def toggleactive(n, active): feeds, feedfileObject = load() if (n == 0) and (feeds and isstr(feeds[0])): @@ -889,7 +889,7 @@ def toggleactive(n, active): print >>warn, "%s feed %s" % (action, feeds[n].url) feeds[n].active = active unlock(feeds, feedfileObject) - + def reset(): feeds, feedfileObject = load() if feeds and isstr(feeds[0]): @@ -900,9 +900,9 @@ def reset(): f.seen = {} f.etag = None f.modified = None - + unlock(feeds, feedfileObject) - + def email(addr): feeds, feedfileObject = load() if feeds and isstr(feeds[0]): feeds[0] = addr @@ -914,8 +914,8 @@ def email(addr): try: if len(args) < 3: raise InputError, "insufficient args" feedfile, action, args = args[1], args[2], args[3:] - - if action == "run": + + if action == "run": if args and args[0] == "--no-send": def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None): if VERBOSE: print 'Not sending:', unu(subject) @@ -931,7 +931,7 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps elif action == "add": add(*args) - elif action == "new": + elif action == "new": if len(args) == 1: d = [args[0]] else: d = [] pickle.dump(d, open(feedfile, 'w')) @@ -961,14 +961,14 @@ def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtps elif action == "opmlexport": opmlexport() - elif action == "opmlimport": + elif action == "opmlimport": if not args: raise InputError, "OPML import '%s' requires a filename argument" % action opmlimport(args[0]) else: raise InputError, "Invalid action" - + except InputError, e: print "E:", e print diff --git a/test_rss2email.py b/test_rss2email.py index 8ab532f..437db85 100644 --- a/test_rss2email.py +++ b/test_rss2email.py @@ -50,7 +50,7 @@ def test_no_friendly_name(self): name = getName(0, 0) rss2email.NO_FRIENDLY_NAME = 0 self.assertEqual(name, '') - + def test_override_from(self): # have to fake url attribute because it is only set on downloaded feeds urlToOverride = 'http://example.com/feed/' @@ -96,7 +96,7 @@ def test_multiple_tags(self): tagline = getTags(entry) 
self.assertEqual(tagline, "tag1,tag2")
-
+
if __name__ == '__main__':
    unittest.main()

From cd230d4be6513b1fc6e276079b12cde3a5685d7c Mon Sep 17 00:00:00 2001
From: Arun Persaud
Date: Mon, 15 Oct 2012 10:38:42 -0700
Subject: [PATCH 2/2] added comment about customization of CHARSET_LIST (should be done in config.py)
---
 rss2email.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rss2email.py b/rss2email.py
index 9d08eaa..73350cb 100644
--- a/rss2email.py
+++ b/rss2email.py
@@ -109,6 +109,7 @@
# To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
# Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
+# CHARSET_LIST gets customized in config.py, so any changes there will override this setting
CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'

from email.MIMEText import MIMEText
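
For reference, the fallback that this new comment documents works as follows: send() walks CHARSET_LIST in order and keeps the first charset that can encode the whole message body, catching UnicodeError and falling through to the next entry. Below is a minimal, runnable sketch of that behaviour; pick_body_charset() is an illustrative name, not a function in rss2email, and only the loop logic mirrors the code in send():

    # Sketch of the charset fallback performed in send(). The helper name
    # pick_body_charset() is illustrative only; it is not rss2email API.
    CHARSET_LIST = 'US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'

    def pick_body_charset(body, charsets=CHARSET_LIST):
        """Return the first charset able to represent every character in body."""
        for charset in charsets:
            try:
                body.encode(charset)
            except UnicodeError:
                continue  # charset cannot encode the body; try the next one
            return charset
        return 'UTF-8'    # the "catch-all failsafe" the comment above mentions

    print(pick_body_charset(u'plain ascii'))            # US-ASCII
    print(pick_body_charset(u'caf\xe9'))                # ISO-8859-1
    print(pick_body_charset(u'caf\xe9 costs \u20ac2'))  # UTF-8 (euro sign)

Because the tuple is scanned in order, redefining CHARSET_LIST in config.py (as the comment advises) changes which encoding wins; keeping UTF-8 at the end preserves the catch-all.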