CSDN博客导出为Hexo的MD文件

一.获取所有文章地址、二.使用脚本进行采集

参考：CSDN文章导出md并迁移至博客园 - lytcreate - 博客园

部分代码修正：

if __name__ == '__main__':
    # NOTE(review): `data`, `requests`, `pd` and `html_to_md` are not defined
    # in this excerpt; they are expected to come from the rest of the script.
    url_list = [{'url': item['url'], 'title': item['title']} for item in data]
    # Service that returns the rendered HTML of an article as JSON.
    base_url = 'https://www.helloworld.net/getUrlHtml?url='
    # URLs whose download or conversion failed.
    err_list = []
    # Make sure the output directory exists before any file is written to it.
    os.makedirs('md', exist_ok=True)
    for item in url_list:
        try:
            print(item['url'])
            # A timeout keeps one stalled request from hanging the whole run.
            res = requests.get(base_url + item['url'], timeout=30)
            content = res.json().get('html')
            title = item['title']
            # Convert the HTML to Markdown and save it as md/<title>.md.
            html_to_md(content, os.path.join('md', f'{title}.md'))
            # Report success only after the file has actually been written.
            print(title+'已完成')
        except Exception as e:
            # Best effort: remember the failing URL and carry on with the rest.
            print(e)
            err_list.append(item['url'])
    # Write the failure report once, after every URL has been processed
    # (previously this ran inside the loop and rewrote err.csv each iteration).
    if err_list:
        print(err_list)
        df = pd.DataFrame([{'name': err_list}])
        df.to_csv('err.csv', index=False)

注：可能出现的问题：博客标题中含有文件名不允许的字符（标题不合规）时，对应的 md 文件无法保存，此时需要手动修改文件名；修改后 CSDN_INFO.txt 中对应的 title 也要同步修改，否则第三步的脚本按文件名匹配不到该文章。

三.批量给md文件增加hexo识别码

hexo识别文章需要添加如下信息：

---
title: "XXX"
date: 20XX-XX-XX XX:XX:XX
tags: ['AA', 'BB',...]
......(依据manu设置)
---

对于CSDN批量导出文章，采用以下方法快速增加相关信息：

创建 CSDN_INFO.txt 保存之前在浏览器console导出的文章信息
使用脚本 AddInfo_CSDN.py 将信息自动添加到对应md文件中，将需要转换的 .md 文件保存在同目录的 md 文件夹中

AddInfo_CSDN.py 内容如下：

# 为所有md文件开头添加对应CSDN_INFO.txt中的信息
# ---
# title: "标题"
# date: 2020-01-01 00:00:00
# tags: ["标签1", "标签2"...]
# ---

import os
import re
import time

# 读取CSDN_INFO.txt，提取每篇文章对应的文章标题title、日期postTime、标签tags
def readCSDNInfo():
    """Parse CSDN_INFO.txt into a mapping keyed by article title.

    The file is read from the current working directory and scanned with
    regular expressions for ``title": "..."``, ``postTime": "..."`` and
    ``"tags": [...]`` fields. The i-th match of each field is treated as
    belonging to the i-th article.
    NOTE(review): this assumes every article record lists these fields in
    the same order -- confirm against the exported data.

    Returns:
        dict: ``{title: {"postTime": str, "tags": list[str] | None}}``.
        ``tags`` is ``None`` only when the file contains fewer ``"tags"``
        arrays than titles.

    Raises:
        FileNotFoundError: if CSDN_INFO.txt does not exist.
        IndexError: if fewer ``postTime`` fields than titles are found.
    """
    # Read once and run every pattern against the same text.
    with open("CSDN_INFO.txt", "r", encoding="utf-8") as f:
        text = f.read()

    titles = re.findall(r"title\": \"(.*?)\"", text)
    postTimes = re.findall(r"postTime\": \"(.*?)\"", text)
    # `*` (not `+`) so an empty `"tags": []` still produces a match and the
    # following articles keep their own tags instead of shifting by one.
    tagGroups = re.findall(r'"tags":\s*\[([^\]]*)\]', text)
    # Pair tag groups with articles by position; looking the group up by
    # value would mis-assign articles that share an identical tag list.
    tags = [re.findall(r'"([^"]+)"', group) for group in tagGroups]

    CSDNInfo = {}
    for i, title in enumerate(titles):
        CSDNInfo[title] = {
            "postTime": postTimes[i],
            "tags": tags[i] if i < len(tags) else None,
        }
    return CSDNInfo
        
# 为md文件开头添加对应CSDN_INFO.txt中的信息
def addInfo2MD(mdPath, CSDNInfo):
    """Prepend a Hexo front-matter header to one Markdown file.

    The article title is the file name without its final extension. When
    that title is a key of ``CSDNInfo``, the file is rewritten in place with
    a ``---`` delimited header (title, date, tags) followed by the original
    content. Files that already start with ``---`` are left untouched, so
    running the script twice does not duplicate the header.

    Args:
        mdPath: Path of the ``.md`` file to update.
        CSDNInfo: Mapping ``{title: {"postTime": ..., "tags": ...}}``.

    Returns:
        None. Progress and "title not found" messages are printed.
    """
    # splitext removes only the last extension, so a title that itself
    # contains dots (e.g. "Python3.8 ...") is kept intact.
    title = os.path.splitext(os.path.basename(mdPath))[0]
    if title not in CSDNInfo:
        print(f"Error: {title} not in CSDN_INFO.txt")
        return

    with open(mdPath, "r", encoding="utf-8") as f:
        content = f.read()
    # Already has front matter; checking the whole text also copes with an
    # empty file, which has no first line to inspect.
    if content.startswith("---"):
        return

    info = CSDNInfo[title]
    header = (
        "---\n"
        f'title: "{title}"\n'
        f"date: {info['postTime']}\n"
        f"tags: {info['tags']}\n"
        "---\n"
    )
    with open(mdPath, "w", encoding="utf-8") as f:
        f.write(header + content)
    print(f"AddInfo: {title}")

# 遍历所有md文件
def traverseMDFiles(mdDir, CSDNInfo):
    """Walk ``mdDir`` recursively and call ``addInfo2MD`` on every ``.md`` file.

    Args:
        mdDir: Root directory to search.
        CSDNInfo: Article metadata passed through unchanged to ``addInfo2MD``.
    """
    for folder, _subdirs, names in os.walk(mdDir):
        for name in names:
            # Only Markdown files are processed.
            if not name.endswith(".md"):
                continue
            addInfo2MD(os.path.join(folder, name), CSDNInfo)

if __name__ == "__main__":
    # Load the exported article metadata, then add a header to every
    # Markdown file found under the "md" folder next to this script.
    CSDNInfo = readCSDNInfo()
    mdDir = "md"
    traverseMDFiles(mdDir, CSDNInfo)
    print("Done!")