PDF简历信息提取

业务场景

有一个需求,就是从大批量的简历中筛选出需要的信息。本文所需要的信息是姓名、电话和Email地址。为了提高效率,故使用程序来完成这个工作。

文件转换:word转为pdf

原始文件包含了 `.doc`、`.docx` 和 `.pdf` 三种格式。经过验证,发现先把文件统一转换为PDF再提取信息的效果更好一些,因此第一步需要把word文件转换为pdf文件。这里使用python的 `win32com` 包(由 pywin32 提供)来实现转换,需要说明的一点是,这个包需要调用windows下的word程序,因此只支持windows平台。
第一步:安装依赖库

pip install pywin32

第二步:转换文件

def word2pdf(file_dir):
    """Convert a Word document to a PDF saved next to the original.

    The conversion drives the Word application through COM
    (``Dispatch('Word.Application')``), so it only works on Windows with
    Word installed.

    Args:
        file_dir: Path of the ``.doc``/``.docx`` file to convert.  The PDF
            is written to the same path with the extension replaced by
            ``.pdf``.
    """
    w = Dispatch('Word.Application')
    try:
        w.Visible = 0
        w.DisplayAlerts = 0
        doc = w.Documents.Open(file_dir)
        try:
            new_path = os.path.splitext(file_dir)[0] + '.pdf'
            # 17 is Word's save-format code for PDF.
            doc.SaveAs(new_path, FileFormat=17)
        finally:
            doc.Close()
    finally:
        # Always shut Word down; otherwise a failed conversion leaves a
        # hidden Word process running in the background.
        w.Quit()

提取思路

基本的提取思路是先把PDF文件的文本内容提取出来,然后通过正则表达式去匹配值。这就导致了一个问题,姓名只能猜测而无法准确获取。PDF文本提取使用的是pdfplumber这个库,通过以下命令安装:

pip install pdfplumber

然后提取PDF内容只需要简单的几行代码即可搞定:

# Open the PDF in a ``with`` block so the file handle is released once
# every page has been read, then print the text of each page in order.
with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        print(text)

代码

思路清楚之后把完整的代码封装为一个类,方便使用,如下:

import os
import re
import pdfplumber as pb
from win32com.client import Dispatch
import pandas as pd
import sys

class Extractor(object):
    """Extract the name, phone number and e-mail address from one resume.

    Only PDF content is read.  A ``.doc``/``.docx`` input is first converted
    to PDF through Word automation (Windows only); when the conversion
    succeeds the original Word file is deleted and ``file_dir`` is updated
    to point at the new PDF.
    """

    # Save-format codes passed to Word's ``Document.SaveAs``.
    _WD_FORMAT_DOCX = 12
    _WD_FORMAT_PDF = 17

    # "姓名" label, optional ASCII/full-width colon or spaces, then the name.
    _NAME_FIELD = re.compile(r"姓\s*名[:\uff1a\s]*([\u4e00-\u9fa5]{2,4})")
    _CJK_WORD = re.compile(r"[\u4e00-\u9fa5]{2,4}")
    _EMAIL = re.compile(r"[A-Za-z0-9_.+\-]+@[A-Za-z0-9\-]+(?:\.[A-Za-z0-9\-]+)+")
    _DIGITS = re.compile(r"\d{11,13}")
    # Drop a leading country code only when an 11-digit number remains.
    _COUNTRY_CODE = re.compile(r"^86(?=\d{11}$)")
    # Punctuation removed before looking for digits (ASCII and full-width).
    _KEYWORD_NOISE = re.compile(r"[()\uff08\uff09:\uff1a+\-]")
    _TEXT_NOISE = re.compile(r"[()\uff08\uff09+\-]")
    _PATH_SEP = re.compile(r"/+|\\+")

    def __init__(self, file_dir):
        """Remember the file path and convert Word documents to PDF.

        Args:
            file_dir: Path of the resume file (``.pdf``, ``.doc`` or
                ``.docx``).

        A failed conversion is reported with ``print`` and otherwise
        ignored; ``file_dir`` then keeps pointing at the Word file, from
        which no text can be extracted.
        """
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1] in (".doc", ".docx"):
            try:
                self.__word2pdf()
            except Exception as e:
                # Best effort: one broken document must not stop a batch run.
                print(e)

    def __convert(self, extension, *save_args, **save_kwargs):
        """Re-save ``file_dir`` through Word and switch to the new file.

        Args:
            extension: Extension (with dot) of the file to create.
            *save_args, **save_kwargs: Forwarded to ``Document.SaveAs``
                after the target path.

        Returns:
            The path of the newly written file.

        Word is always closed, even when opening or saving fails; the
        original file is deleted only after a successful save.
        """
        new_path = os.path.splitext(self.file_dir)[0] + extension
        w = Dispatch('Word.Application')
        try:
            w.Visible = 0
            w.DisplayAlerts = 0
            doc = w.Documents.Open(self.file_dir)
            try:
                doc.SaveAs(new_path, *save_args, **save_kwargs)
            finally:
                doc.Close()
        finally:
            w.Quit()
        os.remove(self.file_dir)
        self.file_dir = new_path
        return new_path

    def __doc2docx(self):
        """Convert the ``.doc`` file to ``.docx`` and return the new path."""
        return self.__convert(
            '.docx', self._WD_FORMAT_DOCX,
            False, "", True, "", False, False, False, False,
        )

    def __word2pdf(self):
        """Convert the Word file to PDF and return the new path."""
        return self.__convert('.pdf', FileFormat=self._WD_FORMAT_PDF)

    def __extract_text(self):
        """Return the concatenated text of all pages ("" for non-PDF files)."""
        if os.path.splitext(self.file_dir)[1] != ".pdf":
            return ""
        with pb.open(self.file_dir) as pdf:
            # extract_text() may return None for a page; treat that as "".
            return "".join(page.extract_text() or "" for page in pdf.pages)

    def __extract_words(self):
        """Return the words of all pages ([] for non-PDF files)."""
        words = []
        if os.path.splitext(self.file_dir)[1] != ".pdf":
            return words
        with pb.open(self.file_dir) as pdf:
            for page in pdf.pages:
                words.extend(page.extract_words())
        return words

    @classmethod
    def _find_names(cls, full_text):
        """Return candidate names found in *full_text*.

        Names following a "姓名" label are preferred.  When no label yields
        a name, every whitespace-separated token that contains no digit and
        has 2-4 distinct characters contributes its runs of 2-4 Chinese
        characters as a guess.
        """
        names = []
        # First look for an explicit "姓名" field.
        for line in full_text.split("\n"):
            labelled = cls._NAME_FIELD.search(line)
            if labelled:
                names.append(labelled.group(1))
        if names:
            return names
        # No labelled name: guess from short tokens.
        for token in full_text.split():
            if re.search(r"\d", token):
                continue
            # Collapse repeated characters, keeping first-seen order.
            unique = "".join(dict.fromkeys(token))
            if 2 <= len(unique) <= 4:
                names.extend(cls._CJK_WORD.findall(unique))
        return names

    @classmethod
    def _find_email(cls, words):
        """Return the first e-mail address found in *words*, or "".

        Args:
            words: Iterable of strings or of dicts carrying the word under
                the ``"text"`` key.
        """
        for word in words:
            text = word["text"] if isinstance(word, dict) else word
            found = cls._EMAIL.search(text)
            if found:
                return found.group()
        return ""

    @classmethod
    def _find_phone(cls, file_name, full_text):
        """Return the phone number(s) for a resume, or "".

        Lookup order: an 11-13 digit number starting with "1" in
        *file_name*; the first token of *full_text* that contains "电话" or
        "手机" together with an 11-13 digit number; finally every 11-13
        digit number in the text that starts with "1" (after dropping a
        leading "86"), de-duplicated in order of appearance and joined
        with ",".
        """
        # Directly from the file name.
        in_name = cls._DIGITS.findall(file_name)
        if in_name and in_name[0].startswith("1"):
            return in_name[0]
        # By keyword; tokens with a keyword but no number are skipped so
        # that "手机: 138..." still reaches the fallback below.
        for token in full_text.split():
            if "电话" in token or "手机" in token:
                found = cls._DIGITS.search(cls._KEYWORD_NOISE.sub("", token))
                if found:
                    return cls._COUNTRY_CODE.sub("", found.group())
        # By digit-run length over the whole text.
        stripped = cls._TEXT_NOISE.sub("", full_text)
        candidates = (
            cls._COUNTRY_CODE.sub("", digits)
            for digits in cls._DIGITS.findall(stripped)
        )
        mobiles = [c for c in candidates if c.startswith("1")]
        return ",".join(dict.fromkeys(mobiles))

    def __search_name(self):
        """Return the list of candidate names for this file."""
        return self._find_names(self.__extract_text())

    def __search_email(self):
        """Return the first e-mail address in this file, or ""."""
        return self._find_email(self.__extract_words())

    def __search_phone(self):
        """Return the phone number(s) for this file, or ""."""
        file_name = self._PATH_SEP.split(self.file_dir)[-1]
        return self._find_phone(file_name, self.__extract_text())

    def search(self):
        """Entry point: return the extracted information as a dict.

        Returns:
            A dict with the keys ``directory`` (name of the parent folder,
            "" when the path has none), ``file_name``, ``phone``,
            ``user_name`` (comma-joined candidates) and ``email``.  A field
            whose extraction raises is reported with ``print`` and left
            empty.
        """
        parts = self._PATH_SEP.split(self.file_dir)
        file_name = parts[-1]
        directory = parts[-2] if len(parts) > 1 else ""
        info = {"directory": directory, "file_name": file_name, "phone": "", "user_name": "", "email": ""}

        # Name
        try:
            info["user_name"] = ",".join(self.__search_name())
        except Exception as e:
            print(e)

        # E-mail
        try:
            info["email"] = self.__search_email()
        except Exception as e:
            print(e)

        # Phone
        try:
            info["phone"] = self.__search_phone()
        except Exception as e:
            print(e)
        return info

最后,还需要处理批量文件,写一个迭代查找文件的函数:

def find_files(file_dir):
    """Recursively collect the resume files below *file_dir*.

    Args:
        file_dir: Directory to walk.

    Returns:
        A list with the path of every file whose extension is exactly
        ``.doc``, ``.docx`` or ``.pdf``.
    """
    wanted = (".doc", ".docx", ".pdf")
    return [
        candidate
        for parent, _, names in os.walk(file_dir)
        for candidate in (os.path.join(parent, name) for name in names)
        if os.path.splitext(candidate)[1] in wanted
    ]

以及,文件入口:

if __name__ == "__main__":
    # Usage: python extractor.py [FILE_DIR [OUT_DIR]]
    # FILE_DIR is the root folder whose sub-folders hold the resumes;
    # OUT_DIR is the Excel workbook to write (one sheet per sub-folder).
    FILE_DIR = r"data"
    OUT_DIR = r"resume-data.xlsx"
    args = sys.argv

    if len(args) > 1:
        FILE_DIR = args[1]
    if len(args) > 2:
        OUT_DIR = args[2]
    # Never overwrite an existing workbook: append a running number to the
    # original stem (name_0.xlsx, name_1.xlsx, ...) until the name is free.
    stem = os.path.splitext(OUT_DIR)[0]
    cnt = 0
    while os.path.isfile(os.path.abspath(OUT_DIR)):
        OUT_DIR = stem + "_" + str(cnt) + ".xlsx"
        cnt += 1
    root = os.path.abspath(FILE_DIR)
    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(OUT_DIR) as writer:
        for folder in sorted(os.listdir(root)):
            file_dir = os.path.join(root, folder)
            if not os.path.isdir(file_dir):
                # Only sub-folders become sheets; skip stray files.
                continue
            paths = find_files(file_dir)
            print("Total {} file(s) in directory {}:".format(len(paths), folder))
            rows = []
            for index, file_path in enumerate(paths):
                info = Extractor(file_dir=file_path).search()
                rows.append(info)
                print(index, info["file_name"], info["email"], info["phone"], info["user_name"])
            # Build the frame once from all rows instead of appending
            # row by row.
            pd.DataFrame(rows).to_excel(writer, sheet_name=folder)
        print("Save to file ", OUT_DIR)
    print("All done.")

使用说明

首先,你的简历文件结构应该如下:

data
 - 目录一
   - 一些pdf或者word文件
 - 目录...
   - ...

使用时可以直接通过以下方式调用:

python extractor.py data result.xlsx

其中 data代表简历存放的根目录,result.xlsx代表保存文件名,这两个参数都是可选的,不加则代表使用默认值data 和resume-data.xlsx

完整的代码下载地址:extractor.py


   转载规则


《PDF简历信息提取》 查永春 采用 知识共享署名 4.0 国际许可协议 进行许可。
  目录