Introduction to AI Resume Parsing
Resume parsing is a fundamental task in HR technology, powering applicant tracking systems (ATS), job boards, and recruitment platforms worldwide. The challenge lies in extracting structured information from unstructured documents that come in countless formats and styles.
In this tutorial, you’ll build a production-ready resume parser that uses Natural Language Processing (NLP) to extract key information such as contact details, work experience, education, and skills. It is the same class of technology that powers resume parsing at LinkedIn, Indeed, and enterprise HR systems.
What You’ll Build
By the end of this tutorial, you’ll have a complete resume parsing system that:
- Extracts text from PDF, DOCX, and image-based resumes
- Identifies and extracts contact information (email, phone, LinkedIn)
- Parses work experience with company names, titles, and dates
- Extracts education history with degrees and institutions
- Identifies technical and soft skills
- Uses Named Entity Recognition for accurate extraction
- Provides a REST API and web interface
Understanding Resume Parsing
The Parsing Pipeline
Resume parsing involves several stages:
1. Document Ingestion: Convert PDF, DOCX, or images to raw text
2. Text Preprocessing: Clean and normalize the extracted text
3. Section Detection: Identify different resume sections (Experience, Education, Skills)
4. Entity Extraction: Use NER to extract names, organizations, dates
5. Relationship Mapping: Connect extracted entities (job title + company + dates)
6. Structured Output: Generate clean JSON with all parsed information
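To make the target concrete, here is a rough sketch of the structured record that the final stage aims to produce. The values are invented for illustration; the field names simply mirror the dataclasses built later in this tutorial.
# Illustrative only: the shape of the structured output the pipeline targets.
# Values are invented; field names mirror the dataclasses defined in later steps.
parsed = {
    "contact": {"name": "Jane Doe", "email": "jane@example.com", "phone": "(555) 123-4567"},
    "summary": "Backend engineer focused on search infrastructure.",
    "experience": [{"company": "Acme Corp", "title": "Software Engineer",
                    "start_date": "Jan 2020", "end_date": None, "is_current": True}],
    "education": [{"institution": "State University", "degree": "B.S.",
                   "graduation_date": "2019"}],
    "skills": {"programming_languages": ["Python"], "frameworks": ["Flask"]},
}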
Common Challenges
Resume parsing is difficult because:
- No standard format – every resume is different
- Inconsistent date formats (Jan 2020, 01/2020, 2020-01) – a normalization sketch follows this list
- Abbreviations and acronyms vary widely
- Multi-column layouts confuse text extraction
- Skills can be listed or embedded in descriptions
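The date-format problem in particular is relatively easy to tame with the dateparser library (installed in the setup step below). A minimal sketch, using the variant strings listed above:
# Minimal sketch: normalizing the inconsistent date formats listed above.
# dateparser is installed in the "Required Libraries" step below.
import dateparser

for raw in ["Jan 2020", "01/2020", "2020-01"]:
    dt = dateparser.parse(raw)  # returns a datetime, or None if it cannot parse
    print(raw, "->", dt.strftime("%Y-%m") if dt else "unparsed")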
Prerequisites and Setup
Required Libraries
# Create virtual environment
python -m venv resume_parser_env
source resume_parser_env/bin/activate
# Core NLP dependencies
pip install spacy
python -m spacy download en_core_web_lg
# Document processing
pip install pypdf2 python-docx pdf2image pytesseract
pip install pillow
# Additional NLP tools
pip install dateparser phonenumbers
pip install transformers torch
# Web framework
pip install flask flask-cors
pip install python-dotenv
Project Structure
resume_parser/
├── app.py # Flask API
├── parser/
│ ├── __init__.py
│ ├── document_reader.py # PDF/DOCX extraction
│ ├── text_processor.py # Text cleaning
│ ├── section_detector.py # Section identification
│ ├── entity_extractor.py # NER-based extraction
│ ├── skill_extractor.py # Skills identification
│ └── resume_parser.py # Main parser class
├── models/
│ └── skills_database.json # Known skills list
├── templates/
│ └── index.html # Web interface
├── uploads/ # Uploaded resumes
└── requirements.txt
Step 1: Document Reader
First, let’s create a robust document reader that handles multiple formats:
# parser/document_reader.py
import os
import io
from typing import Optional
from PyPDF2 import PdfReader
from docx import Document
from PIL import Image
import pytesseract
class DocumentReader:
"""Extract text from various document formats."""
def __init__(self):
# Configure Tesseract path if needed
# pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
pass
def read_pdf(self, file_path: str) -> str:
"""Extract text from PDF file."""
text_parts = []
with open(file_path, 'rb') as file:
reader = PdfReader(file)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text_parts.append(page_text)
text = '\n'.join(text_parts)
# If PDF text extraction failed (scanned PDF), try OCR
if len(text.strip()) < 100:
text = self._ocr_pdf(file_path)
return text
def _ocr_pdf(self, file_path: str) -> str:
"""Use OCR for scanned PDFs."""
try:
from pdf2image import convert_from_path
images = convert_from_path(file_path)
text_parts = []
for image in images:
text = pytesseract.image_to_string(image)
text_parts.append(text)
return '\n'.join(text_parts)
except Exception as e:
return f"OCR failed: {str(e)}"
def read_docx(self, file_path: str) -> str:
"""Extract text from DOCX file."""
doc = Document(file_path)
text_parts = []
for paragraph in doc.paragraphs:
text_parts.append(paragraph.text)
# Also extract from tables
for table in doc.tables:
for row in table.rows:
row_text = ' | '.join(cell.text for cell in row.cells)
text_parts.append(row_text)
return '\n'.join(text_parts)
def read_image(self, file_path: str) -> str:
"""Extract text from image using OCR."""
image = Image.open(file_path)
text = pytesseract.image_to_string(image)
return text
def read(self, file_path: str) -> str:
"""Read document based on file extension."""
_, ext = os.path.splitext(file_path.lower())
readers = {
'.pdf': self.read_pdf,
'.docx': self.read_docx,
'.doc': self.read_docx,  # best effort: python-docx does not support the legacy binary .doc format; convert to .docx first
'.png': self.read_image,
'.jpg': self.read_image,
'.jpeg': self.read_image,
}
if ext not in readers:
raise ValueError(f"Unsupported file format: {ext}")
return readers[ext](file_path)
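A quick smoke test of the reader might look like the following; "sample_resume.pdf" is just a placeholder for any resume you have on hand.
# Quick check of DocumentReader; "sample_resume.pdf" is a placeholder path.
from parser.document_reader import DocumentReader

reader = DocumentReader()
text = reader.read("sample_resume.pdf")
print(text[:500])  # preview the first few hundred characters of extracted text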
Step 2: Text Processor
Clean and normalize the extracted text:
# parser/text_processor.py
import re
from typing import List, Tuple
class TextProcessor:
"""Clean and preprocess resume text."""
def __init__(self):
# Common resume section headers
self.section_patterns = [
r'(?i)^\s*(work\s*)?experience\s*:?\s*$',
r'(?i)^\s*education\s*:?\s*$',
r'(?i)^\s*skills?\s*:?\s*$',
r'(?i)^\s*technical\s+skills?\s*:?\s*$',
r'(?i)^\s*projects?\s*:?\s*$',
r'(?i)^\s*certifications?\s*:?\s*$',
r'(?i)^\s*summary\s*:?\s*$',
r'(?i)^\s*objective\s*:?\s*$',
r'(?i)^\s*profile\s*:?\s*$',
]
def clean_text(self, text: str) -> str:
"""Basic text cleaning."""
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
# Fix common OCR errors
text = text.replace('|', 'I')
# Remove special characters but keep punctuation
text = re.sub(r'[^\w\s\.\,\-\@\+\(\)\/\:\;\&]', '', text)
return text.strip()
def normalize_text(self, text: str) -> str:
"""Normalize text while preserving structure."""
lines = text.split('\n')
normalized_lines = []
for line in lines:
# Remove leading/trailing whitespace
line = line.strip()
# Skip empty lines
if not line:
normalized_lines.append('')
continue
# Normalize whitespace within line
line = ' '.join(line.split())
normalized_lines.append(line)
return '\n'.join(normalized_lines)
def extract_lines(self, text: str) -> List[str]:
"""Split text into lines and filter empty ones."""
lines = text.split('\n')
return [line.strip() for line in lines if line.strip()]
def identify_sections(self, text: str) -> List[Tuple[str, str]]:
"""Identify resume sections and their content."""
lines = self.extract_lines(text)
sections = []
current_section = "HEADER"
current_content = []
for line in lines:
is_section_header = False
for pattern in self.section_patterns:
if re.match(pattern, line):
# Save previous section
if current_content:
sections.append((current_section, '\n'.join(current_content)))
# Start new section
current_section = re.sub(r'[:\s]+$', '', line).upper()
current_content = []
is_section_header = True
break
if not is_section_header:
current_content.append(line)
# Don't forget the last section
if current_content:
sections.append((current_section, '\n'.join(current_content)))
return sections
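Before wiring this into the full parser, it helps to sanity-check section detection on a small snippet. A sketch, using an invented mini-resume:
# Sanity-check section detection on a tiny hand-written snippet (invented text).
from parser.text_processor import TextProcessor

sample = """Jane Doe
jane@example.com

Experience
Software Engineer at Acme Corp

Education
B.S. Computer Science, State University

Skills
Python, Flask"""

processor = TextProcessor()
for name, content in processor.identify_sections(sample):
    print(name, "->", content.replace("\n", " | "))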
Step 3: Entity Extractor
Use spaCy NER for intelligent extraction:
# parser/entity_extractor.py
import re
import spacy
import phonenumbers
import dateparser
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
@dataclass
class ContactInfo:
name: Optional[str] = None
email: Optional[str] = None
phone: Optional[str] = None
linkedin: Optional[str] = None
github: Optional[str] = None
location: Optional[str] = None
@dataclass
class Experience:
company: str = ""
title: str = ""
start_date: Optional[str] = None
end_date: Optional[str] = None
description: str = ""
is_current: bool = False
@dataclass
class Education:
institution: str = ""
degree: str = ""
field_of_study: str = ""
graduation_date: Optional[str] = None
gpa: Optional[str] = None
class EntityExtractor:
"""Extract entities from resume text using NLP."""
def __init__(self):
self.nlp = spacy.load("en_core_web_lg")
# Email regex
self.email_pattern = re.compile(
r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
)
# LinkedIn URL pattern
self.linkedin_pattern = re.compile(
r'(?:https?://)?(?:www\.)?linkedin\.com/in/[\w-]+'
)
# GitHub URL pattern
self.github_pattern = re.compile(
r'(?:https?://)?(?:www\.)?github\.com/[\w-]+'
)
# Date patterns
self.date_patterns = [
r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|'
r'Dec(?:ember)?)\s*\d{4}',
r'\d{1,2}/\d{4}',
r'\d{4}',
]
# Degree keywords
self.degree_keywords = [
'Bachelor', 'Master', 'PhD', 'Ph.D', 'Doctor', 'Associate',
'B.S.', 'B.A.', 'M.S.', 'M.A.', 'MBA', 'B.E.', 'M.E.',
'B.Tech', 'M.Tech', 'BBA', 'MCA', 'BCA'
]
def extract_contact_info(self, text: str) -> ContactInfo:
"""Extract contact information from resume."""
contact = ContactInfo()
# Extract email
email_match = self.email_pattern.search(text)
if email_match:
contact.email = email_match.group()
# Extract phone
contact.phone = self._extract_phone(text)
# Extract LinkedIn
linkedin_match = self.linkedin_pattern.search(text)
if linkedin_match:
contact.linkedin = linkedin_match.group()
# Extract GitHub
github_match = self.github_pattern.search(text)
if github_match:
contact.github = github_match.group()
# Extract name using NER
doc = self.nlp(text[:500]) # Name usually in first part
for ent in doc.ents:
if ent.label_ == "PERSON" and not contact.name:
contact.name = ent.text
break
# Extract location
for ent in doc.ents:
if ent.label_ in ["GPE", "LOC"] and not contact.location:
contact.location = ent.text
break
return contact
def _extract_phone(self, text: str) -> Optional[str]:
"""Extract phone number from text."""
# Try to parse phone numbers
for match in phonenumbers.PhoneNumberMatcher(text, "US"):
return phonenumbers.format_number(
match.number,
phonenumbers.PhoneNumberFormat.NATIONAL
)
# Fallback to regex
phone_pattern = re.compile(
r'(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}'
)
match = phone_pattern.search(text)
return match.group() if match else None
def extract_experience(self, text: str) -> List[Experience]:
"""Extract work experience from experience section."""
experiences = []
doc = self.nlp(text)
# Find organizations and dates
organizations = [ent for ent in doc.ents if ent.label_ == "ORG"]
dates = self._extract_dates(text)
# Split into experience blocks (simplified approach)
lines = text.split('\n')
current_exp = None
for line in lines:
line = line.strip()
if not line:
continue
line_doc = self.nlp(line)
line_orgs = [ent for ent in line_doc.ents if ent.label_ == "ORG"]
line_dates = self._extract_dates(line)
# Check if this is a new job entry
if line_orgs and (line_dates or self._looks_like_title(line)):
if current_exp and current_exp.company:
experiences.append(current_exp)
current_exp = Experience()
current_exp.company = line_orgs[0].text
current_exp.title = self._extract_title(line, line_orgs[0].text)
if line_dates:
dates_sorted = sorted(line_dates)
current_exp.start_date = dates_sorted[0]
if len(dates_sorted) > 1:
current_exp.end_date = dates_sorted[-1]
elif 'present' in line.lower() or 'current' in line.lower():
current_exp.is_current = True
elif current_exp:
current_exp.description += line + " "
if current_exp and current_exp.company:
experiences.append(current_exp)
return experiences
def _looks_like_title(self, line: str) -> bool:
"""Check if line looks like a job title."""
title_keywords = [
'engineer', 'developer', 'manager', 'analyst', 'designer',
'consultant', 'director', 'lead', 'senior', 'junior',
'intern', 'associate', 'specialist', 'coordinator'
]
return any(keyword in line.lower() for keyword in title_keywords)
def _extract_title(self, line: str, company: str) -> str:
"""Extract job title from line."""
# Remove company name and dates
title = line.replace(company, '').strip()
title = re.sub(r'\d{4}', '', title)
title = re.sub(r'[-|,]', '', title)
return title.strip()
def _extract_dates(self, text: str) -> List[str]:
"""Extract dates from text."""
dates = []
for pattern in self.date_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
dates.extend(matches)
return dates
def extract_education(self, text: str) -> List[Education]:
"""Extract education from education section."""
educations = []
doc = self.nlp(text)
# Find educational institutions
orgs = [ent for ent in doc.ents if ent.label_ == "ORG"]
lines = text.split('\n')
current_edu = None
for line in lines:
line = line.strip()
if not line:
continue
# Check for degree keywords
has_degree = any(deg in line for deg in self.degree_keywords)
line_doc = self.nlp(line)
line_orgs = [ent for ent in line_doc.ents if ent.label_ == "ORG"]
if has_degree or line_orgs:
if current_edu and (current_edu.institution or current_edu.degree):
educations.append(current_edu)
current_edu = Education()
if line_orgs:
current_edu.institution = line_orgs[0].text
# Extract degree
for deg in self.degree_keywords:
if deg in line:
current_edu.degree = deg
break
# Extract dates
dates = self._extract_dates(line)
if dates:
current_edu.graduation_date = dates[-1]
# Extract GPA
gpa_match = re.search(r'GPA[:\s]*(\d+\.?\d*)', line, re.IGNORECASE)
if gpa_match:
current_edu.gpa = gpa_match.group(1)
if current_edu and (current_edu.institution or current_edu.degree):
educations.append(current_edu)
return educations
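The extractor can also be exercised on its own; a minimal sketch on an invented header snippet (contact details are placeholders):
# Minimal sketch: running EntityExtractor on an invented header snippet.
from parser.entity_extractor import EntityExtractor

extractor = EntityExtractor()  # loads en_core_web_lg, so construction takes a moment
contact = extractor.extract_contact_info(
    "Jane Doe\njane.doe@example.com\n(555) 123-4567\nlinkedin.com/in/janedoe"
)
print(contact.name, contact.email, contact.phone, contact.linkedin)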
Step 4: Skill Extractor
Extract technical and soft skills:
# parser/skill_extractor.py
import json
import re
from typing import List, Set, Dict
from pathlib import Path
class SkillExtractor:
"""Extract skills from resume text."""
def __init__(self, skills_db_path: str = None):
self.skills_db = self._load_skills_database(skills_db_path)
# Common skill categories
self.skill_categories = {
"programming_languages": [
"Python", "Java", "JavaScript", "C++", "C#", "Ruby", "Go",
"Rust", "Swift", "Kotlin", "TypeScript", "PHP", "Scala",
"R", "MATLAB", "Perl", "Shell", "Bash"
],
"frameworks": [
"React", "Angular", "Vue", "Django", "Flask", "Spring",
"Node.js", "Express", "FastAPI", "Rails", "Laravel",
"TensorFlow", "PyTorch", "Keras", "Scikit-learn"
],
"databases": [
"MySQL", "PostgreSQL", "MongoDB", "Redis", "Elasticsearch",
"Oracle", "SQLite", "Cassandra", "DynamoDB", "Firebase"
],
"cloud": [
"AWS", "Azure", "GCP", "Google Cloud", "Heroku", "Docker",
"Kubernetes", "Terraform", "Jenkins", "CircleCI", "GitHub Actions"
],
"data_science": [
"Machine Learning", "Deep Learning", "NLP", "Computer Vision",
"Data Analysis", "Statistics", "Pandas", "NumPy", "Matplotlib",
"Tableau", "Power BI", "Spark", "Hadoop"
],
"soft_skills": [
"Leadership", "Communication", "Team Management", "Problem Solving",
"Critical Thinking", "Time Management", "Agile", "Scrum"
]
}
def _load_skills_database(self, path: str) -> Dict:
"""Load skills database from JSON file."""
if path and Path(path).exists():
with open(path, 'r') as f:
return json.load(f)
return {}
def extract_skills(self, text: str) -> Dict[str, List[str]]:
"""Extract skills organized by category."""
text_lower = text.lower()
found_skills = {category: [] for category in self.skill_categories}
for category, skills in self.skill_categories.items():
for skill in skills:
# Case-insensitive search with word boundaries
pattern = r'\b' + re.escape(skill.lower()) + r'\b'
if re.search(pattern, text_lower):
found_skills[category].append(skill)
# Remove empty categories
return {k: v for k, v in found_skills.items() if v}
def extract_all_skills(self, text: str) -> List[str]:
"""Extract all skills as a flat list."""
skills_by_category = self.extract_skills(text)
all_skills = []
for skills in skills_by_category.values():
all_skills.extend(skills)
return list(set(all_skills))
def extract_years_of_experience(self, text: str) -> Dict[str, int]:
"""Try to extract years of experience for skills."""
experience = {}
# Pattern: "X years of Y experience" or "X+ years Y"
patterns = [
r'(\d+)\+?\s*years?\s*(?:of\s+)?(\w+(?:\s+\w+)?)\s*experience',
r'(\d+)\+?\s*years?\s*(?:with|using|in)\s+(\w+(?:\s+\w+)?)',
]
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for years, skill in matches:
experience[skill.strip()] = int(years)
return experience
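A quick way to see the category-based matching and the years-of-experience heuristic in action, on an invented blurb:
# Demonstration of category-based skill matching on an invented blurb.
from parser.skill_extractor import SkillExtractor

extractor = SkillExtractor()
blurb = ("Built REST services in Python with Flask, deployed on AWS with Docker, "
         "5 years of Python experience.")
print(extractor.extract_skills(blurb))
# e.g. {'programming_languages': ['Python'], 'frameworks': ['Flask'], 'cloud': ['AWS', 'Docker']}
print(extractor.extract_years_of_experience(blurb))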
Step 5: Main Resume Parser
Combine all components:
# parser/resume_parser.py
from dataclasses import dataclass, asdict
from typing import Dict, List, Any
import json
from .document_reader import DocumentReader
from .text_processor import TextProcessor
from .entity_extractor import EntityExtractor, ContactInfo, Experience, Education
from .skill_extractor import SkillExtractor
@dataclass
class ParsedResume:
contact: ContactInfo
summary: str
experience: List[Experience]
education: List[Education]
skills: Dict[str, List[str]]
raw_text: str
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
"contact": asdict(self.contact),
"summary": self.summary,
"experience": [asdict(exp) for exp in self.experience],
"education": [asdict(edu) for edu in self.education],
"skills": self.skills,
}
def to_json(self) -> str:
"""Convert to JSON string."""
return json.dumps(self.to_dict(), indent=2)
class ResumeParser:
"""Main resume parsing class."""
def __init__(self):
self.document_reader = DocumentReader()
self.text_processor = TextProcessor()
self.entity_extractor = EntityExtractor()
self.skill_extractor = SkillExtractor()
def parse(self, file_path: str) -> ParsedResume:
"""Parse a resume file and extract structured information."""
# Step 1: Read document
raw_text = self.document_reader.read(file_path)
# Step 2: Process and identify sections
normalized_text = self.text_processor.normalize_text(raw_text)
sections = self.text_processor.identify_sections(normalized_text)
# Step 3: Extract contact info from header
header_text = ""
for section_name, content in sections:
if section_name == "HEADER":
header_text = content
break
contact = self.entity_extractor.extract_contact_info(
header_text if header_text else normalized_text[:1000]
)
# Step 4: Extract experience
experience_text = ""
for section_name, content in sections:
if "EXPERIENCE" in section_name or "WORK" in section_name:
experience_text = content
break
experience = self.entity_extractor.extract_experience(experience_text)
# Step 5: Extract education
education_text = ""
for section_name, content in sections:
if "EDUCATION" in section_name:
education_text = content
break
education = self.entity_extractor.extract_education(education_text)
# Step 6: Extract skills
skills_text = ""
for section_name, content in sections:
if "SKILL" in section_name:
skills_text = content
break
# Also search entire document for skills
skills = self.skill_extractor.extract_skills(
skills_text if skills_text else normalized_text
)
# Step 7: Extract summary
summary = ""
for section_name, content in sections:
if section_name in ["SUMMARY", "OBJECTIVE", "PROFILE"]:
summary = content
break
return ParsedResume(
contact=contact,
summary=summary,
experience=experience,
education=education,
skills=skills,
raw_text=raw_text
)
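In practice you will often parse a whole directory of resumes rather than a single file. A small batch-processing sketch; the "resumes" folder and "parsed_output" directory are placeholder names:
# Batch-parsing sketch: walk a folder of resumes and write one JSON file per resume.
# "resumes" and "parsed_output" are placeholder directory names.
import json
from pathlib import Path
from parser.resume_parser import ResumeParser

parser = ResumeParser()
out_dir = Path("parsed_output")
out_dir.mkdir(exist_ok=True)

for path in Path("resumes").glob("*"):
    if path.suffix.lower() not in {".pdf", ".docx", ".png", ".jpg", ".jpeg"}:
        continue
    try:
        result = parser.parse(str(path))
        (out_dir / f"{path.stem}.json").write_text(result.to_json())
    except Exception as exc:  # keep going if a single file fails
        print(f"Failed to parse {path.name}: {exc}")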
Step 6: Flask API
Create a web API for the parser:
# app.py
import os
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from werkzeug.utils import secure_filename
from parser.resume_parser import ResumeParser
app = Flask(__name__)
CORS(app)
app.config["UPLOAD_FOLDER"] = "uploads"
app.config["MAX_CONTENT_LENGTH"] = 10 * 1024 * 1024 # 10MB
os.makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)
parser = ResumeParser()
ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "png", "jpg", "jpeg"}
def allowed_file(filename):
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route("/")
def index():
return render_template("index.html")
@app.route("/api/parse", methods=["POST"])
def parse_resume():
"""Parse uploaded resume."""
if "file" not in request.files:
return jsonify({"error": "No file provided"}), 400
file = request.files["file"]
if file.filename == "":
return jsonify({"error": "No file selected"}), 400
if not allowed_file(file.filename):
return jsonify({"error": "File type not supported"}), 400
try:
filename = secure_filename(file.filename)
file_path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
file.save(file_path)
result = parser.parse(file_path)
# Clean up uploaded file
os.remove(file_path)
return jsonify(result.to_dict())
except Exception as e:
return jsonify({"error": str(e)}), 500
@app.route("/api/health")
def health():
return jsonify({"status": "healthy"})
if __name__ == "__main__":
app.run(debug=True, port=5000)
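You can exercise the endpoint without a browser. One option is Flask’s built-in test client, which needs no running server; "sample_resume.pdf" is again a placeholder path. With the server running, the same upload also works as curl -F "file=@sample_resume.pdf" http://localhost:5000/api/parse.
# Exercising /api/parse with Flask's test client (no running server required).
# "sample_resume.pdf" is a placeholder for any resume on disk.
from app import app

with app.test_client() as client:
    with open("sample_resume.pdf", "rb") as f:
        response = client.post(
            "/api/parse",
            data={"file": (f, "sample_resume.pdf")},
            content_type="multipart/form-data",
        )
print(response.status_code)
print(response.get_json())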
Step 7: Web Interface
Create a user-friendly interface:
<!-- templates/index.html -->
<!DOCTYPE html>
<html>
<head>
<title>Resume Parser</title>
<style>
body { font-family: system-ui; max-width: 1000px; margin: 0 auto; padding: 20px; }
.upload-zone { border: 2px dashed #ccc; padding: 40px; text-align: center; border-radius: 8px; cursor: pointer; }
.upload-zone:hover { border-color: #007bff; background: #f8f9ff; }
.result { margin-top: 20px; }
.section { background: #f5f5f5; padding: 20px; border-radius: 8px; margin: 10px 0; }
.section h3 { margin-top: 0; color: #333; }
.skill-tag { display: inline-block; background: #007bff; color: white; padding: 4px 12px; border-radius: 20px; margin: 2px; font-size: 14px; }
.experience-item, .education-item { background: white; padding: 15px; border-radius: 4px; margin: 10px 0; }
button { background: #007bff; color: white; padding: 12px 24px; border: none; border-radius: 4px; cursor: pointer; }
button:hover { background: #0056b3; }
.loading { opacity: 0.5; }
</style>
</head>
<body>
<h1>Resume Parser</h1>
<div class="upload-zone" id="upload-zone">
<p>Drag and drop your resume here or click to browse</p>
<p><small>Supported: PDF, DOCX, PNG, JPG</small></p>
<input type="file" id="file-input" hidden accept=".pdf,.docx,.doc,.png,.jpg,.jpeg">
</div>
<div id="result" class="result"></div>
<script>
const uploadZone = document.getElementById("upload-zone");
const fileInput = document.getElementById("file-input");
const resultDiv = document.getElementById("result");
uploadZone.onclick = () => fileInput.click();
uploadZone.ondragover = (e) => { e.preventDefault(); uploadZone.style.borderColor = "#007bff"; };
uploadZone.ondragleave = () => uploadZone.style.borderColor = "#ccc";
uploadZone.ondrop = (e) => { e.preventDefault(); handleFile(e.dataTransfer.files[0]); };
fileInput.onchange = () => handleFile(fileInput.files[0]);
async function handleFile(file) {
if (!file) return;
uploadZone.classList.add("loading");
uploadZone.innerHTML = "Processing...";
const formData = new FormData();
formData.append("file", file);
try {
const response = await fetch("/api/parse", { method: "POST", body: formData });
const data = await response.json();
if (data.error) {
resultDiv.innerHTML = "<div class='section'><h3>Error</h3><p>" + data.error + "</p></div>";
} else {
displayResult(data);
}
} catch (error) {
resultDiv.innerHTML = "<div class='section'><h3>Error</h3><p>" + error.message + "</p></div>";
}
uploadZone.classList.remove("loading");
uploadZone.innerHTML = "<p>Drop another resume to parse</p>";
}
function displayResult(data) {
let html = "";
// Contact Info
html += "<div class='section'><h3>Contact Information</h3>";
if (data.contact.name) html += "<p><strong>Name:</strong> " + data.contact.name + "</p>";
if (data.contact.email) html += "<p><strong>Email:</strong> " + data.contact.email + "</p>";
if (data.contact.phone) html += "<p><strong>Phone:</strong> " + data.contact.phone + "</p>";
if (data.contact.linkedin) html += "<p><strong>LinkedIn:</strong> " + data.contact.linkedin + "</p>";
html += "</div>";
// Skills
if (Object.keys(data.skills).length > 0) {
html += "<div class='section'><h3>Skills</h3>";
for (const [category, skills] of Object.entries(data.skills)) {
html += "<p><strong>" + category.replace("_", " ") + ":</strong> ";
skills.forEach(skill => { html += "<span class='skill-tag'>" + skill + "</span> "; });
html += "</p>";
}
html += "</div>";
}
// Experience
if (data.experience.length > 0) {
html += "<div class='section'><h3>Experience</h3>";
data.experience.forEach(exp => {
html += "<div class='experience-item'>";
html += "<strong>" + exp.title + "</strong> at " + exp.company + "<br>";
if (exp.start_date) html += exp.start_date + " - " + (exp.end_date || "Present") + "<br>";
if (exp.description) html += "<p>" + exp.description.substring(0, 200) + "...</p>";
html += "</div>";
});
html += "</div>";
}
// Education
if (data.education.length > 0) {
html += "<div class='section'><h3>Education</h3>";
data.education.forEach(edu => {
html += "<div class='education-item'>";
html += "<strong>" + edu.degree + "</strong>";
if (edu.institution) html += " from " + edu.institution;
if (edu.graduation_date) html += " (" + edu.graduation_date + ")";
html += "</div>";
});
html += "</div>";
}
resultDiv.innerHTML = html;
}
</script>
</body>
</html>
Testing and Usage
# test_parser.py
from parser.resume_parser import ResumeParser
parser = ResumeParser()
# Parse a resume
result = parser.parse("sample_resume.pdf")
# Access structured data
print(f"Name: {result.contact.name}")
print(f"Email: {result.contact.email}")
print(f"Skills: {result.skills}")
# Export to JSON
print(result.to_json())
Conclusion
You’ve built a complete resume parsing system using NLP techniques. This parser handles multiple document formats, extracts structured information, and provides both an API and web interface.
Key takeaways:
- spaCy NER provides robust entity extraction for names and organizations
- Section detection improves extraction accuracy
- Pattern matching complements NER for emails, phones, and dates
- A skills database enables comprehensive skill extraction
- Multiple document format support is essential for real-world use
This foundation can be extended with machine learning for improved section detection, job matching algorithms, or integration with applicant tracking systems.
