1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
| import requests import re import time import os
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

# Root URL of the blog to export. Other URLs in this script are built by
# appending directly to it, so the trailing slash matters.
homePage = 'https://www.cnblogs.com/burnling/'

# Bound used by the page loop at the bottom of the script.
pageNum = 9

# Session cookie sent with the authenticated requests. The placeholder says:
# "fill in your own cookie, otherwise the .md file cannot be fetched".
cookie = '填自己的cookie,不然拿不到.md文件'

# Directory created at start-up for the exported files.
folderName = 'mds'

# When true, a "mathjax: true" line is added to each file's front matter.
needMathJax = True

# Request headers shared by the post/tag requests; those functions also set a
# per-request 'referer' entry on this dict.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'cookie': cookie,
}
def getBlogId():
    """Return the numeric blog id embedded in the blog's home page.

    Fetches ``homePage`` and reads the digits from the inline
    ``var currentBlogId = <digits>;`` assignment in the returned HTML.

    Returns:
        str: The blog id as a string of digits.

    Raises:
        IndexError: If the page contains no ``currentBlogId`` assignment
            with at least one digit.
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(homePage, timeout=30)
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # "\d+" rather than "\d*" so an empty id is reported as "not found"
    # instead of being returned as ''.
    return re.findall(r'var currentBlogId = (\d+);', res.text, re.S)[0]
def getUrl(text):
    """Extract the link of the first post-title anchor found in *text*.

    Args:
        text: HTML fragment to search.

    Returns:
        str: The ``href`` value of the first
        ``<a class="postTitle2 vertical-middle" href="...">`` anchor.

    Raises:
        IndexError: If *text* contains no such anchor.
    """
    anchor_pattern = re.compile(
        '<a class="postTitle2 vertical-middle" href="(.*?)">', re.S
    )
    hrefs = anchor_pattern.findall(text)
    # Plain indexing on purpose: a fragment without a title anchor must
    # surface as IndexError.
    return hrefs[0]
def getTitle(text):
    """Return the stripped content of the first ``<span>`` in *text*.

    Args:
        text: HTML fragment to search.

    Returns:
        str: Text between the first ``<span>`` and the nearest following
        ``</span>``, with surrounding whitespace removed.

    Raises:
        IndexError: If *text* contains no ``<span>...</span>`` pair.
    """
    span_pattern = re.compile('<span>(.*?)</span>', re.S)
    spans = span_pattern.findall(text)
    first_span = spans[0]
    return first_span.strip()
def getDate(text):
    """Extract a post's timestamp from *text* as ``YYYY-MM-DD HH:MM:00``.

    Looks inside the first ``<div class="postDesc">`` block (up to the next
    ``<``) for a ``YYYY-MM-DD HH:MM`` stamp and appends ``:00`` as the
    seconds field.

    Args:
        text: HTML fragment to search.

    Returns:
        str: The timestamp with ``:00`` seconds appended.

    Raises:
        IndexError: If there is no ``postDesc`` block, or it holds no
            timestamp in the expected format.
    """
    # Raw strings: "\d" inside a plain literal is an invalid escape sequence.
    desc = re.findall(r'<div class="postDesc">(.*?)<', text, re.S)[0]
    stamp = re.findall(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', desc, re.S)[0]
    # The pattern cannot capture surrounding whitespace, so no strip() is
    # needed (the previous one discarded its result anyway).
    return stamp + ':00'
def getMarkDownData(url, title):
    """Download a post's Markdown source and remove its title heading.

    Args:
        url: Address of the post page. Its final dot-suffix is replaced by
            ``md`` to form the Markdown address.
        title: Post title, matched literally in the downloaded text.

    Returns:
        str: The Markdown text, decoded as UTF-8, with every occurrence of
        one or two ``#`` characters, optional spaces and then *title*
        removed.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # The post page itself is sent as the referer of the .md request.
    headers['referer'] = url
    md_url = url.rsplit('.', 1)[0] + '.md'
    # A timeout keeps one stalled connection from hanging the whole run.
    ress = requests.get(md_url, headers=headers, timeout=30)
    ress.encoding = 'utf-8'
    # re.escape: titles routinely contain regex metacharacters such as
    # "[...]" or "(...)". Unescaped, they would change the pattern's meaning
    # (heading not removed) or make it invalid (re.error).
    heading = '#{1,2} *(' + re.escape(title) + ')'
    return re.sub(heading, '', ress.text)
def getTags(text):
    """Fetch the link texts listed by the blog's CategoriesTags endpoint.

    Args:
        text: URL of the post page. It must contain ``p/<id>.html`` where
            ``<id>`` is 7 to 9 digits; that id is sent as ``postId``.

    Returns:
        list: The text of each ``<a ...>...</a>`` element found in the
        response of ``{homePage}ajax/CategoriesTags.aspx``.

    Raises:
        IndexError: If *text* holds no post id in the expected form.
        requests.RequestException: If a request fails or times out.
    """
    params = {
        # Raw string ("\d" is an invalid escape in a plain literal) and an
        # escaped dot, so only a literal ".html" suffix is accepted.
        'postId': re.findall(r'p/(\d{7,9})\.html', text, re.S)[0],
        'blogId': getBlogId(),
        # Current time in milliseconds; presumably a cache-buster expected by
        # the endpoint. TODO confirm.
        '_': str(int(time.time() * 1000)),
    }
    url = f'{homePage}ajax/CategoriesTags.aspx'
    # The post page is sent as the referer of the tag request.
    headers['referer'] = text
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(url, params=params, headers=headers, timeout=30)
    # Step 1: keep only the anchor elements (attributes + text + closing tag).
    aElements = ','.join(re.findall('<a (.*?</a>)', res.text))
    # Step 2: the text of each anchor is what follows the closing '">'.
    return re.findall('">(.*?)</a>', aElements, re.S)
def get(text):
    """Download one post and save it as a Markdown file with front matter.

    The file starts with a ``---`` delimited header holding ``title``,
    ``date``, ``tags`` and, when ``needMathJax`` is true, ``mathjax: true``;
    the post's Markdown follows. It is written, UTF-8 encoded, to
    ``<folderName>/<title without [...] groups>.md``.

    Args:
        text: HTML fragment of one post entry from a list page.

    Raises:
        IndexError: If the fragment lacks the expected title, link or date
            markup.
        requests.RequestException: If a download fails.
        OSError: If the output file cannot be written.
    """
    url = getUrl(text)
    title = getTitle(text)
    print(f"Downloading {title}...")
    date = getDate(text)
    FileData = getMarkDownData(url, title)
    tags = getTags(url)

    mathjax = "\nmathjax: true" if needMathJax else ""
    tag_list = ", ".join(tags)
    WriteData = (
        f'---\ntitle: {title}\ndate: {date}\n'
        f'tags: [{tag_list}]{mathjax}\n---\n\n{FileData}'
    )

    # Bracketed groups such as "[category]" are dropped from the file name.
    # Raw string: "\[" inside a plain literal is an invalid escape sequence.
    FileName = re.sub(r'(\[.*?\])', '', title)
    # Write into folderName (the directory the script creates) rather than a
    # hard-coded "mds/", so changing the setting keeps working.
    out_path = os.path.join(folderName, f"{FileName}.md")
    with open(out_path, "w", encoding='utf-8') as f:
        f.write(WriteData)
def main(pages):
    """Download every post listed on one page of the blog's index.

    Fetches ``{homePage}?page={pages}``, cuts the main-content area into
    fragments (first at each day block, then at each post separator) and
    passes every fragment to ``get``. A fragment that fails is reported with
    ``print`` and skipped, so one bad entry does not stop the page.

    Args:
        pages: Value for the ``page`` query parameter.

    Raises:
        IndexError: If the page has no recognisable main-content area.
        requests.RequestException: If the list page cannot be fetched.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(f'{homePage}?page={pages}', timeout=30)
    result = re.findall(
        '<div id="mainContent">(.*?)<!--end: mainContent 主体内容容器-->',
        res.text,
        re.S,
    )[0]

    texts = []
    for day in result.split('<div class="day" ro'):
        texts.extend(day.split('class="postSeparator"'))

    for text in texts:
        try:
            get(text)
        except Exception as E:
            # Best effort: fragments that are not posts fail here and are
            # skipped. Exception (not BaseException) so that Ctrl-C and
            # SystemExit still stop the run.
            print(E)
# Create the output folder up front. An existing folder is fine; any other
# failure (e.g. no permission) is reported immediately instead of being
# silently ignored and resurfacing once per post.
os.makedirs(folderName, exist_ok=True)

# NOTE(review): range() excludes its upper bound, so page number `pageNum`
# itself is never fetched. If pageNum is meant to be the total page count,
# this should be range(1, pageNum + 1) -- confirm the intent.
for i in range(1, pageNum):
    main(i)
|