Skip to content

Commit 589df34

Browse files
author
Yuanming Chen
committed
Initial Commit
0 parents  commit 589df34

File tree

2 files changed

+242
-0
lines changed

2 files changed

+242
-0
lines changed

parser.py

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
# pip3 install setuptools
2+
3+
# Run this Python script automatically at Linux boot
4+
# https://www.jianshu.com/p/5cd74add11ba
5+
6+
# vim /etc/rc.local
7+
# python3 /root/parser.py > /root/rss.log
8+
9+
10+
# apt-get install cron
11+
# crontab -e
12+
# Run the script every two minutes via cron and append its output to the log
13+
# */2 * * * * cd /root/rss2ifttt&&python3 parser.py >> rss.log 2>&1 &
14+
# service cron restart
15+
16+
17+
# generate requirements.txt
18+
# pip install pipreqs
19+
# pipreqs --force <project-path>
20+
21+
22+
# apt-get install python3-setuptools
23+
# pip3 install -r requirements.txt
24+
25+
26+
import hashlib
import io
import logging
import os
import sqlite3
import sys
import urllib
import urllib.request
from datetime import datetime, timezone
from time import sleep
from urllib.parse import urljoin, urlparse

import feedparser
import simplejson as json
import telegram
import urllib3
from bs4 import BeautifulSoup
from dateutil import parser
from telegram.error import NetworkError, Unauthorized
43+
44+
# Force UTF-8 on stdout so non-ASCII feed titles survive cron's log redirection.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# SECURITY(review): webhook key and bot token are committed in plain text.
# Move them to environment variables or an untracked config file and rotate them.

# appcoding.dev
ifttt_webhook = 'https://maker.ifttt.com/trigger/rss_update/with/key/ovrGYBKpZSYDdSymtG_mPO9lKdLd5e3abLE9Q22knsS'
# BOT_TOKEN = '500445809:AAEuNb7B4rD6XmQ0haz3Su3yaYXkMCioqHg' #FeedsRobot
BOT_TOKEN = '520542158:AAEAznZxhW-hwg7L0-R4vPig40hpasjN78Q' #RssFeedRobot
channel_name = '@FeedsReader'
# nwsuafer
# ifttt_webhook = 'https://maker.ifttt.com/trigger/rss_update/with/key/c-y3FuRtWbwx9iqwntbN2u'
# BOT_TOKEN = '521695283:AAGXJmTJ1qpLWTwNLCskTo-R53ZxX3sFiUk'
# channel_name = '@listenfeeds'

# Primary feeds, polled on every run.
feed_urls = [
    'http://gank.io/feed',
    'http://ifanr.com/feed',
    'https://sspai.com/feed',
    'http://www.geekpark.net/rss',
    'https://www.ithome.com/rss/',
]

# Additional / experimental feeds, also polled on every run (see driver below).
test_urls = [
    'http://cdc.tencent.com/feed/',
    'https://www.leiphone.com/feed/categoryRss/name/ai',
    'https://www.leiphone.com/feed/categoryRss/name/transportation',
    'https://www.leiphone.com/feed/categoryRss/name/arvr',
    'https://www.leiphone.com/feed/categoryRss/name/igao7',
    'https://www.leiphone.com/feed/categoryRss/name/aijuejinzhi',
    'https://www.leiphone.com/feed/categoryRss/name/qiku',
    'https://www.leiphone.com/feed/categoryRss/name/zaobaoXML',
    'http://www.techweb.com.cn/rss/people.xml',
    'http://www.techweb.com.cn/rss/focus.xml',
    'http://techcrunch.cn/feed/',
    'http://xclient.info/feed/',
    'http://next.36kr.com/feed',
    'http://www.zreading.cn/feed',
    'http://www.ixiqi.com/feed',
    'http://news.ifeng.com/rss/index.xml',
    'http://www.adaymag.com/feed/',
    'http://www.uisdc.com/feed',
    'http://cinephilia.net/feed',
    'http://www.toodaylab.com/feed',
    'https://feeds.appinn.com/appinns/',
    'http://blog.sina.com.cn/rss/1286528122.xml',
    'https://cn.engadget.com/rss.xml',
    'https://www.zhihu.com/rss',
    'http://www.gzhshoulu.wang/rssCreate.php?id=zxcx0101',

]

# Shared bot instance used by send().
bot = telegram.Bot(BOT_TOKEN)
# Removed: `m = hashlib.sha256()` was created but never used anywhere in the
# file (send() hashes entries with hashlib.sha224 directly).
97+
def parse_feed(feed_url):
    """Fetch one RSS/Atom feed and forward every entry to the channel.

    For each entry, extracts a preview image (first <img> in the content,
    falling back to a placeholder), strips tracking parameters from the
    article link, and hands an IFTTT-style payload to send().

    Args:
        feed_url: URL of the RSS/Atom feed to download and parse.
    """
    d = feedparser.parse(feed_url)
    logging.warning("parse: %s" % (feed_url))

    for entry in d.entries:
        # Prefer the full content body (e.g. ifanr provides one); fall back
        # to the summary when the feed has no content element.
        if hasattr(entry, 'content'):
            soup = BeautifulSoup(entry.content[0].value, "html.parser")  # ifanr
        else:
            soup = BeautifulSoup(entry.summary, "html.parser")

        # Placeholder preview used when the entry carries no usable image.
        img_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1516723011841&di=e525c3ba6d533f30d25e08a0a6f3d5d4&imgtype=0&src=http%3A%2F%2Fimg5.cache.netease.com%2F2008%2F2013%2F3%2F20%2F2013032021273601186.gif'
        imgs = soup.find_all('img')
        # Guard against <img> tags with no src attribute (get() returns None,
        # which crashed the old startswith()-based resolution).
        if imgs and imgs[0].get('src'):
            img_url = imgs[0].get('src')

        url = remove_params(entry.links[0].href)
        # Resolve relative ("/a.png") AND protocol-relative ("//cdn/a.png")
        # image URLs against the article URL.  The previous
        # scheme + "://" + netloc + img_url concatenation produced broken
        # "http://host//cdn/a.png" URLs for protocol-relative sources;
        # urljoin leaves absolute URLs untouched.
        img_url = urljoin(url, img_url)

        data = {"value1": img_url, "value2": entry.title, "value3": url}
        send(data)
130+
131+
132+
def remove_params(url):
    """Return *url* reduced to scheme://netloc/path (drops query and fragment)."""
    parts = urlparse(url)
    return "{0}://{1}{2}".format(parts.scheme, parts.netloc, parts.path)
135+
136+
137+
def post(data):
    """POST *data* as a JSON body to the IFTTT webhook.

    Args:
        data: dict with keys value1..value3 consumed by the IFTTT recipe.
    """
    req = urllib.request.Request(ifttt_webhook)
    req.add_header('Content-Type', 'application/json')
    response = urllib.request.urlopen(req, bytes(json.dumps(data), 'utf8'))
    try:
        # The old call logging.debug(response.getcode(), datetime.now()) passed
        # the datetime as a %-format argument to a non-format message, which
        # raises inside the logging module when the record is emitted.
        logging.debug("ifttt response %s at %s", response.getcode(), datetime.now())
    finally:
        # Release the HTTP connection even if logging fails.
        response.close()
142+
143+
144+
def send(data):
    """Post one feed entry to the Telegram channel, deduplicated via SQLite.

    Args:
        data: {"value1": image_url, "value2": title, "value3": article_url}.

    A SHA-224 digest of the serialized payload is the primary key in the
    `feeds` table; entries already recorded are skipped silently.  If the
    direct photo-by-URL upload fails (Telegram could not fetch the image),
    the image is downloaded to a local cache file and re-sent from disk.
    """
    prepare_connection()
    image_url = data['value1']
    title = data['value2']
    url = data['value3']
    # Renamed from `id` (shadowed the builtin).
    entry_id = hashlib.sha224(json.dumps(data).encode()).hexdigest()
    # Parameterized query instead of string interpolation: avoids quoting
    # bugs / SQL injection if the key format ever changes.
    cursor.execute("select * from feeds WHERE id=?", (entry_id,))
    rows = cursor.fetchall()
    if len(rows) <= 0:
        try:
            # post by ifttt (disabled):
            # post(data)

            # post by telegram bot
            bot.sendPhoto(chat_id=channel_name, photo=image_url,
                          caption="%s %s" % (title, url))

            cursor.execute("INSERT INTO feeds(id,image_url, title, url) VALUES (?,?,?,?)",
                           (entry_id, image_url, title, url))
            logging.warning("post success: %s" % data)
            sleep(5)  # throttle to stay under Telegram rate limits
        # `except Exception` instead of bare `except:` so Ctrl-C /
        # SystemExit still interrupt the cron run.
        except Exception:
            # Telegram failed to fetch the URL itself; retry with a local copy.
            try:
                cache_file_path = "test.jpg"
                urllib.request.urlretrieve(image_url, cache_file_path)
                if os.path.exists(cache_file_path):
                    # `with` closes the handle (the old open() leaked it).
                    with open(cache_file_path, 'rb') as photo_file:
                        bot.send_photo(chat_id=channel_name, photo=photo_file,
                                       caption="%s %s" % (title, url))
                    os.remove(cache_file_path)
                cursor.execute("INSERT INTO feeds(id,image_url, title, url) VALUES (?,?,?,?)",
                               (entry_id, image_url, title, url))
                logging.error("send %s by local cache" % data)
            except Exception:
                exc_type, exc_value, exc_traceback_obj = sys.exc_info()
                logging.error('\n\n\n')
                logging.error("post failed: %s" % data)
                logging.error("exc_type: %s" % exc_type)
                logging.error("exc_value: %s" % exc_value)
                logging.error("exc_traceback_obj: %s" % exc_traceback_obj)
                logging.error('\n\n\n')
    # else:
    #     logging.warning("%s already post." % data)
    close_connection()
191+
192+
193+
def short_url(url):
    """Shorten *url* through the dwz.cn API and return the short link."""
    request_headers = {
        'Host': 'http://dwz.cn',
        'Referer': 'http://dwz.cn',
        'Origin': 'http://dwz.cn',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    pool = urllib3.PoolManager()
    response = pool.request('post',
                            'http://dwz.cn/create.php',
                            headers=request_headers,
                            fields={'url': url})
    payload = json.loads(response.data.decode('utf-8'))
    logging.debug(payload['tinyurl'])
    return payload['tinyurl']
208+
209+
210+
def prepare_connection():
    """Open (or create) feeds.db and ensure the `feeds` dedup table exists.

    Binds the module-level `conn` and `cursor` globals used by send() and
    close_connection().
    """
    global conn, cursor
    conn = sqlite3.connect('feeds.db')
    cursor = conn.cursor()
    # "varchr" typo fixed to "varchar".  SQLite's type affinity accepted the
    # misspelling anyway, so existing databases behave identically and
    # CREATE TABLE IF NOT EXISTS leaves them untouched.
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS feeds ("
        "id varchar(56) PRIMARY KEY,"
        "image_url VARCHAR(1000),"
        "title VARCHAR(1000),"
        "url varchar(1000))"
    )
215+
216+
217+
def close_connection():
    """Flush pending inserts, then release the module-level SQLite handles."""
    conn.commit()
    cursor.close()
    conn.close()
221+
222+
223+
224+
# Script entry: poll every configured feed once.  Run periodically from cron
# (see the header comments); each send() opens and closes its own DB handle.
try:
    logging.warning("parse start: %s" % str(datetime.now()))

    for feed_url in feed_urls:
        parse_feed(feed_url)
    for feed_url in test_urls:
        parse_feed(feed_url)

    logging.warning("parse end: %s" % str(datetime.now()))
except Exception:
    # The original caught only ValueError, so any other error escaped with no
    # cleanup and no context in the cron log.  Log the traceback and make a
    # best-effort attempt to close the SQLite connection.
    logging.exception("feed run aborted")
    try:
        close_connection()
    except Exception:
        # The connection may simply not be open (send() closes it per entry).
        pass

requirements.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
urllib3==1.22
2+
python_telegram_bot==9.0.0
3+
beautifulsoup4==4.6.0
4+
python_dateutil==2.6.1
5+
feedparser==5.2.1
6+
simplejson==3.13.2
7+
# NOTE: the "telegram==0.0.1" pin generated by pipreqs was removed: that PyPI
# package is unrelated and overwrites the `telegram` import namespace that
# python_telegram_bot provides, breaking the bot at runtime.

0 commit comments

Comments
 (0)