# pip3 install setuptools

# Run this Python script automatically at Linux boot:
# https://www.jianshu.com/p/5cd74add11ba

# vim /etc/rc.local
# python3 /root/parser.py > /root/rss.log


# apt-get install cron
# crontab -e
# Run the script every two minutes and append its output to the log:
# */2 * * * * cd /root/rss2ifttt && python3 parser.py >> rss.log 2>&1
# service cron restart
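# confirm the entry was installed:
# crontab -l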


# Generate requirements.txt:
# pip install pipreqs
# pipreqs --force <project-path>


# apt-get install python3-setuptools
# pip3 install -r requirements.txt
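
# For reference, pipreqs would emit roughly these distributions for the
# imports below (versions omitted; pipreqs pins whatever is installed
# locally): feedparser, beautifulsoup4, simplejson, python-dateutil,
# python-telegram-bot, urllib3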

import feedparser
import urllib3
from bs4 import BeautifulSoup
import simplejson as json
import urllib.request  # the bare `import urllib` does not expose urllib.request
from datetime import datetime, timezone
from dateutil import parser
from urllib.parse import urlparse
from time import sleep
import logging
import telegram
from telegram.error import NetworkError, Unauthorized
import sqlite3
import io
import sys
import os
import hashlib

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # force stdout to UTF-8 so non-ASCII titles log cleanly

# appcoding.dev
ifttt_webhook = 'https://maker.ifttt.com/trigger/rss_update/with/key/ovrGYBKpZSYDdSymtG_mPO9lKdLd5e3abLE9Q22knsS'
# BOT_TOKEN = '500445809:AAEuNb7B4rD6XmQ0haz3Su3yaYXkMCioqHg'  # FeedsRobot
BOT_TOKEN = '520542158:AAEAznZxhW-hwg7L0-R4vPig40hpasjN78Q'  # RssFeedRobot
channel_name = '@FeedsReader'
# nwsuafer
# ifttt_webhook = 'https://maker.ifttt.com/trigger/rss_update/with/key/c-y3FuRtWbwx9iqwntbN2u'
# BOT_TOKEN = '521695283:AAGXJmTJ1qpLWTwNLCskTo-R53ZxX3sFiUk'
# channel_name = '@listenfeeds'

feed_urls = [
    'http://gank.io/feed',
    'http://ifanr.com/feed',
    'https://sspai.com/feed',
    'http://www.geekpark.net/rss',
    'https://www.ithome.com/rss/',
]

test_urls = [
    'http://cdc.tencent.com/feed/',
    'https://www.leiphone.com/feed/categoryRss/name/ai',
    'https://www.leiphone.com/feed/categoryRss/name/transportation',
    'https://www.leiphone.com/feed/categoryRss/name/arvr',
    'https://www.leiphone.com/feed/categoryRss/name/igao7',
    'https://www.leiphone.com/feed/categoryRss/name/aijuejinzhi',
    'https://www.leiphone.com/feed/categoryRss/name/qiku',
    'https://www.leiphone.com/feed/categoryRss/name/zaobaoXML',
    'http://www.techweb.com.cn/rss/people.xml',
    'http://www.techweb.com.cn/rss/focus.xml',
    'http://techcrunch.cn/feed/',
    'http://xclient.info/feed/',
    'http://next.36kr.com/feed',
    'http://www.zreading.cn/feed',
    'http://www.ixiqi.com/feed',
    'http://news.ifeng.com/rss/index.xml',
    'http://www.adaymag.com/feed/',
    'http://www.uisdc.com/feed',
    'http://cinephilia.net/feed',
    'http://www.toodaylab.com/feed',
    'https://feeds.appinn.com/appinns/',
    'http://blog.sina.com.cn/rss/1286528122.xml',
    'https://cn.engadget.com/rss.xml',
    'https://www.zhihu.com/rss',
    'http://www.gzhshoulu.wang/rssCreate.php?id=zxcx0101',
]

bot = telegram.Bot(BOT_TOKEN)
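# Note: for the bot to post into a channel such as @FeedsReader, the bot
# account must be added to that channel as an administrator.
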
def parse_feed(feed_url):
    d = feedparser.parse(feed_url)
    logging.warning("parse: %s", feed_url)
    # logging.debug(d.feed.title)

    for entry in d.entries:
        # Optional freshness filter, kept for reference: only push entries
        # published within the last 5 minutes.
        # e.g. entry.published == 'Fri, 19 Jan 2018 14:24:25 +0800'
        # present = datetime.now(timezone.utc)
        # publish_at = parser.parse(entry.published)
        # delta = present - publish_at
        # if delta.total_seconds() < 300:

        # logging.debug(entry.title)
        # Prefer the full content (e.g. ifanr feeds) and fall back to the summary.
        if hasattr(entry, 'content'):
            soup = BeautifulSoup(entry.content[0].value, "html.parser")
        else:
            soup = BeautifulSoup(entry.summary, "html.parser")

        # Placeholder image, used when the entry body contains no <img>.
        img_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1516723011841&di=e525c3ba6d533f30d25e08a0a6f3d5d4&imgtype=0&src=http%3A%2F%2Fimg5.cache.netease.com%2F2008%2F2013%2F3%2F20%2F2013032021273601186.gif'
        imgs = soup.find_all('img')
        if imgs:
            img_url = imgs[0].get('src')

        url = remove_params(entry.links[0].href)
        # Resolve root-relative image paths against the entry's own host.
        if img_url.startswith("/"):
            p = urlparse(url)
            img_url = p.scheme + "://" + p.netloc + img_url

        data = {"value1": img_url, "value2": entry.title, "value3": url}
        send(data)


def remove_params(url):
    """Strip the query string and fragment, keeping scheme://host/path."""
    p = urlparse(url)
    return "%s://%s%s" % (p.scheme, p.netloc, p.path)
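
# Usage sketch (hypothetical URL), showing the query string being dropped:
# >>> remove_params('https://example.com/post/123?utm_source=rss')
# 'https://example.com/post/123'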


def post(data):
    # Fire the IFTTT webhook; value1..value3 map to the applet's ingredients.
    req = urllib.request.Request(ifttt_webhook)
    req.add_header('Content-Type', 'application/json')
    response = urllib.request.urlopen(req, bytes(json.dumps(data), 'utf8'))
    logging.debug("%s %s", response.getcode(), datetime.now())


def send(data):
    prepare_connection()
    image_url = data['value1']
    title = data['value2']
    url = data['value3']
    # Hash the whole payload into a stable id so each item is only posted once.
    id = hashlib.sha224(json.dumps(data).encode()).hexdigest()
    # Parameterized query instead of string interpolation.
    cursor.execute("SELECT * FROM feeds WHERE id=?", (id,))
    rows = cursor.fetchall()
    if len(rows) <= 0:
        try:
            # post by ifttt
            # post(data)

            # post by telegram bot
            bot.send_photo(chat_id=channel_name, photo=image_url,
                           caption="%s %s" % (title, url))

            cursor.execute("INSERT INTO feeds(id, image_url, title, url) VALUES (?,?,?,?)",
                           (id, image_url, title, url))
            logging.warning("post success: %s" % data)
            sleep(5)
        except Exception:
            # Telegram failed to fetch the image URL directly; download it
            # locally and upload the file instead.
            try:
                cache_file_path = "test.jpg"
                urllib.request.urlretrieve(image_url, cache_file_path)
                if os.path.exists(cache_file_path):
                    bot.send_photo(chat_id=channel_name, photo=open(cache_file_path, 'rb'),
                                   caption="%s %s" % (title, url))
                    os.remove(cache_file_path)
                    cursor.execute("INSERT INTO feeds(id, image_url, title, url) VALUES (?,?,?,?)",
                                   (id, image_url, title, url))
                    logging.error("send %s by local cache" % data)
            except Exception:
                exc_type, exc_value, exc_traceback_obj = sys.exc_info()
                logging.error("post failed: %s" % data)
                logging.error("exc_type: %s" % exc_type)
                logging.error("exc_value: %s" % exc_value)
                logging.error("exc_traceback_obj: %s" % exc_traceback_obj)
    # else:
    #     logging.warning("%s already post." % data)
    close_connection()


def short_url(url):
    # Shorten a URL via the dwz.cn API (currently unused).
    http = urllib3.PoolManager()
    r = http.request('POST',
                     'http://dwz.cn/create.php',
                     headers={
                         'Host': 'dwz.cn',
                         'Referer': 'http://dwz.cn',
                         'Origin': 'http://dwz.cn',
                         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     },
                     fields={'url': url})
    result = json.loads(r.data.decode('utf-8'))
    logging.debug(result['tinyurl'])
    return result['tinyurl']
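
# Usage sketch (live network call; the dwz.cn endpoint's availability is not
# guaranteed):
# tiny = short_url('https://example.com/a/very/long/article/url')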


def prepare_connection():
    global conn, cursor
    conn = sqlite3.connect('feeds.db')
    cursor = conn.cursor()
    # id is the sha224 hex digest of the payload, hence VARCHAR(56).
    cursor.execute("CREATE TABLE IF NOT EXISTS feeds (id VARCHAR(56) PRIMARY KEY, image_url VARCHAR(1000), title VARCHAR(1000), url VARCHAR(1000))")
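
# Quick sanity check of what has been recorded so far (run from a shell in
# the directory containing feeds.db):
# sqlite3 feeds.db "SELECT count(*) FROM feeds;"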


def close_connection():
    cursor.close()
    conn.commit()
    conn.close()


try:
    logging.warning("parse start: %s", datetime.now())

    for feed_url in feed_urls:
        parse_feed(feed_url)
    for feed_url in test_urls:
        parse_feed(feed_url)

    logging.warning("parse end: %s", datetime.now())
except ValueError:
    close_connection()