[refactor] Upgrade survey normalization to use fuzzy search

Replace regex pattern matching with Fuse.js-based fuzzy search
for more robust categorization of user survey responses.

Improvements:
- Category mapping system with keyword arrays
- Fuzzy matching handles typos and partial matches
- Configurable match quality (Fuse.js threshold 0.4, plus a 0.6 score cutoff on the top result)
- Expanded keyword coverage for better categorization
- Maintains existing 16 industry + 10 use case categories
- Preserves fallback to "Uncategorized:" prefix

Examples now handled:
- "animtion" → "Film / TV / Animation" (typo correction)
- "game dev" → "Gaming / Interactive Media" (partial match)
- "social content" → "Marketing / Advertising / Social Media" (similarity)
This commit is contained in:
bymyself
2025-10-29 20:10:52 -07:00
parent a3b7417384
commit 8c3116d6ff

View File

@@ -3,11 +3,454 @@
*
* Smart categorization system to normalize free-text survey responses
* into standardized categories for better analytics breakdowns.
* Uses Fuse.js for fuzzy matching against category keywords.
*/
import Fuse from 'fuse.js'
/**
 * One standardized category together with the keywords that are
 * fuzzy-matched against a free-text survey response.
 */
interface CategoryMapping {
/** Standardized category label; this is the value the normalizers return on a match. */
name: string
/** Terms searched by Fuse (the search options use `keys: ['keywords']`). */
keywords: string[]
/** Not read by the matching code visible here; kept as a reference figure. */
userCount?: number // For reference from analysis
}
/**
 * Industry categories and the keywords fuzzy-matched against free-text
 * industry responses. `userCount` is the approximate number of users per
 * category, taken from an analysis of ~9,000 existing responses.
 */
const INDUSTRY_CATEGORIES: CategoryMapping[] = [
  {
    name: 'Film / TV / Animation',
    userCount: 2885,
    keywords: [
      'film', 'tv', 'television', 'animation', 'story', 'anime', 'video',
      'cinematography', 'visual effects', 'vfx', 'movie', 'cinema',
      'documentary', 'broadcast', 'streaming', 'production', 'director',
      'filmmaker', 'post-production', 'editing'
    ]
  },
  {
    name: 'Marketing / Advertising / Social Media',
    userCount: 1340,
    keywords: [
      'marketing', 'advertising', 'youtube', 'tiktok', 'social media',
      'content creation', 'influencer', 'brand', 'promotion',
      'digital marketing', 'seo', 'campaigns', 'copywriting', 'growth',
      'engagement'
    ]
  },
  {
    name: 'Software / IT / AI',
    userCount: 1100,
    keywords: [
      'software', 'it', 'ai', 'developer', 'consulting', 'engineering',
      'tech', 'programmer', 'data science', 'machine learning', 'coding',
      'programming', 'web development', 'app development', 'saas', 'startup'
    ]
  },
  {
    name: 'Product & Industrial Design',
    userCount: 1050,
    keywords: [
      'product design', 'industrial', 'manufacturing', '3d rendering',
      'product visualization', 'mechanical', 'automotive', 'cad',
      'prototype', 'design engineering', 'invention'
    ]
  },
  {
    name: 'Fine Art / Contemporary Art',
    userCount: 780,
    keywords: [
      'fine art', 'art', 'illustration', 'contemporary', 'artist',
      'painting', 'drawing', 'sculpture', 'gallery', 'canvas',
      'digital art', 'mixed media', 'abstract', 'portrait'
    ]
  },
  {
    name: 'Education / Research',
    userCount: 640,
    keywords: [
      'education', 'student', 'teacher', 'research', 'learning',
      'university', 'school', 'academic', 'professor', 'curriculum',
      'training', 'instruction', 'pedagogy'
    ]
  },
  {
    name: 'Architecture / Engineering / Construction',
    userCount: 420,
    keywords: [
      'architecture', 'construction', 'engineering', 'civil', 'cad',
      'building', 'structural', 'landscape', 'interior design',
      'real estate', 'planning', 'blueprints'
    ]
  },
  {
    name: 'Gaming / Interactive Media',
    userCount: 410,
    keywords: [
      'gaming', 'game dev', 'game development', 'roblox', 'interactive',
      'virtual world', 'vr', 'ar', 'metaverse', 'simulation', 'unity',
      'unreal', 'indie games'
    ]
  },
  {
    name: 'Photography / Videography',
    userCount: 70,
    keywords: [
      'photography', 'photo', 'videography', 'camera', 'image', 'portrait',
      'wedding', 'commercial photo', 'stock photography', 'photojournalism',
      'event photography'
    ]
  },
  {
    name: 'Fashion / Beauty / Retail',
    userCount: 25,
    keywords: [
      'fashion', 'beauty', 'jewelry', 'retail', 'style', 'clothing',
      'cosmetics', 'makeup', 'accessories', 'boutique', 'ecommerce'
    ]
  },
  {
    name: 'Music / Performing Arts',
    userCount: 25,
    keywords: [
      'music', 'vj', 'dance', 'projection mapping', 'audio visual',
      'concert', 'performance', 'theater', 'stage', 'live events'
    ]
  },
  {
    name: 'Healthcare / Medical / Life Science',
    userCount: 30,
    keywords: [
      'healthcare', 'medical', 'doctor', 'biotech', 'life science',
      'pharmaceutical', 'clinical', 'hospital', 'medicine', 'health'
    ]
  },
  {
    name: 'E-commerce / Print-on-Demand / Business',
    userCount: 15,
    keywords: [
      'ecommerce', 'print on demand', 'shop', 'business', 'commercial',
      'startup', 'entrepreneur', 'sales', 'retail', 'online store'
    ]
  },
  {
    name: 'Nonprofit / Government / Public Sector',
    userCount: 15,
    keywords: [
      '501c3', 'ngo', 'government', 'public service', 'policy', 'nonprofit',
      'charity', 'civic', 'community', 'social impact'
    ]
  },
  {
    name: 'Adult / NSFW',
    userCount: 10,
    keywords: ['nsfw', 'adult', 'erotic', 'explicit', 'xxx', 'porn']
  }
]
/**
 * Use case categories and the keywords fuzzy-matched against free-text
 * use case responses. The keywords are based on common patterns in user
 * answers; no per-category user counts are recorded here.
 */
const USE_CASE_CATEGORIES: CategoryMapping[] = [
  {
    name: 'Content Creation & Marketing',
    keywords: [
      'content creation', 'social media', 'marketing', 'advertising',
      'youtube', 'tiktok', 'instagram', 'thumbnails', 'posts', 'campaigns',
      'brand content'
    ]
  },
  {
    name: 'Art & Illustration',
    keywords: [
      'art', 'illustration', 'drawing', 'painting', 'concept art',
      'character design', 'digital art', 'fantasy art', 'portraits'
    ]
  },
  {
    name: 'Product Visualization & Design',
    keywords: [
      'product', 'visualization', 'design', 'prototype', 'mockup',
      '3d rendering', 'industrial design', 'product photos'
    ]
  },
  {
    name: 'Film & Video Production',
    keywords: [
      'film', 'video', 'movie', 'animation', 'vfx', 'visual effects',
      'storyboard', 'cinematography', 'post production'
    ]
  },
  {
    name: 'Gaming & Interactive Media',
    keywords: [
      'game', 'gaming', 'interactive', 'vr', 'ar', 'virtual', 'simulation',
      'metaverse', 'game assets', 'textures'
    ]
  },
  {
    name: 'Architecture & Construction',
    keywords: [
      'architecture', 'building', 'construction', 'interior design',
      'landscape', 'real estate', 'floor plans', 'renderings'
    ]
  },
  {
    name: 'Education & Training',
    keywords: [
      'education', 'training', 'learning', 'teaching', 'tutorial', 'course',
      'academic', 'instructional', 'workshops'
    ]
  },
  {
    name: 'Research & Development',
    keywords: [
      'research', 'development', 'experiment', 'prototype', 'testing',
      'analysis', 'study', 'innovation', 'r&d'
    ]
  },
  {
    name: 'Personal & Hobby',
    keywords: [
      'personal', 'hobby', 'fun', 'experiment', 'learning', 'curiosity',
      'explore', 'creative', 'side project'
    ]
  },
  {
    name: 'Photography & Image Processing',
    keywords: [
      'photography', 'photo', 'image', 'portrait', 'editing', 'enhancement',
      'restoration', 'photo manipulation'
    ]
  }
]
/**
 * Fuse.js options shared by the industry and use case searches.
 */
const FUSE_OPTIONS = {
  // Search inside each category's `keywords` array.
  keys: ['keywords'],

  // Matching behaviour
  threshold: 0.4, // Lower = more strict matching
  ignoreLocation: true,
  findAllMatches: true,

  // Result shape: the normalizers read `score` off the top result.
  includeScore: true,
  includeMatches: true
}
/**
 * Fuse instances used for category matching: one over the industry
 * categories and one over the use case categories, both configured with
 * FUSE_OPTIONS.
 */
const buildCategoryIndex = (categories: CategoryMapping[]) =>
  new Fuse(categories, FUSE_OPTIONS)

const industryFuse = buildCategoryIndex(INDUSTRY_CATEGORIES)
const useCaseFuse = buildCategoryIndex(USE_CASE_CATEGORIES)
/**
* Normalize industry responses using fuzzy search
*/
export function normalizeIndustry(rawIndustry: string): string {
if (!rawIndustry || typeof rawIndustry !== 'string') {
@@ -16,153 +459,27 @@ export function normalizeIndustry(rawIndustry: string): string {
const industry = rawIndustry.toLowerCase().trim()
// Film / TV / Animation (~2,885 users)
// Handle common undefined responses
if (
industry.match(
/film|tv|animation|story|anime|video|cinematography|visual effects|vfx|movie|cinema/
)
) {
return 'Film / TV / Animation'
}
// Marketing / Advertising / Social Media (~1,340 users)
if (
industry.match(
/marketing|advertising|youtube|tiktok|social media|content creation|influencer|brand|promotion/
)
) {
return 'Marketing / Advertising / Social Media'
}
// Software / IT / AI (~1,100 users)
if (
industry.match(
/software|it|ai|developer|consulting|engineering|tech|programmer|data science|machine learning/
)
) {
return 'Software / IT / AI'
}
// Product & Industrial Design (~1,050 users)
if (
industry.match(
/product.?design|industrial|manufacturing|3d rendering|product visualization|mechanical|automotive/
)
) {
return 'Product & Industrial Design'
}
// Fine Art / Contemporary Art (~780 users)
if (
industry.match(
/fine.?art|art|illustration|contemporary|artist|painting|drawing|sculpture|gallery/
)
) {
return 'Fine Art / Contemporary Art'
}
// Education / Research (~640 users)
if (
industry.match(
/education|student|teacher|research|learning|university|school|academic|professor/
)
) {
return 'Education / Research'
}
// Architecture / Engineering / Construction (~420 users)
if (
industry.match(
/architecture|construction|engineering|civil|cad|building|structural|landscape/
)
) {
return 'Architecture / Engineering / Construction'
}
// Gaming / Interactive Media (~410 users)
if (
industry.match(
/gaming|game dev|roblox|interactive|virtual world|vr|ar|metaverse|simulation/
)
) {
return 'Gaming / Interactive Media'
}
// Photography / Videography (~70 users)
if (
industry.match(
/photography|photo|videography|camera|image|portrait|wedding|commercial photo/
)
) {
return 'Photography / Videography'
}
// Fashion / Beauty / Retail (~25 users)
if (
industry.match(
/fashion|beauty|jewelry|retail|style|clothing|cosmetics|makeup/
)
) {
return 'Fashion / Beauty / Retail'
}
// Music / Performing Arts (~25 users)
if (
industry.match(
/music|vj|dance|projection mapping|audio visual|concert|performance|theater/
)
) {
return 'Music / Performing Arts'
}
// Healthcare / Medical / Life Science (~30 users)
if (
industry.match(
/healthcare|medical|doctor|biotech|life science|pharmaceutical|clinical|hospital/
)
) {
return 'Healthcare / Medical / Life Science'
}
// E-commerce / Print-on-Demand / Business (~15 users)
if (
industry.match(
/ecommerce|print on demand|shop|business|commercial|startup|entrepreneur|sales/
)
) {
return 'E-commerce / Print-on-Demand / Business'
}
// Nonprofit / Government / Public Sector (~15 users)
if (
industry.match(
/501c3|ngo|government|public service|policy|nonprofit|charity|civic/
)
) {
return 'Nonprofit / Government / Public Sector'
}
// Adult / NSFW (~10 users)
if (industry.match(/nsfw|adult|erotic|explicit|xxx|porn/)) {
return 'Adult / NSFW'
}
// Other / Undefined - catch common undefined responses
if (
industry.match(
/other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
)
industry.match(/^(other|none|undefined|unknown|n\/a|not applicable|-|)$/)
) {
return 'Other / Undefined'
}
// Uncategorized - preserve original but prefix for analysis
// Fuzzy search for best category match
const results = industryFuse.search(industry)
if (results.length > 0 && results[0].score! < 0.6) {
// Good match found
return results[0].item.name
}
// No good match found - preserve original with prefix
return `Uncategorized: ${rawIndustry}`
}
/**
* Normalize use case responses into standardized categories
* Based on common patterns in user responses
* Normalize use case responses using fuzzy search
*/
export function normalizeUseCase(rawUseCase: string): string {
if (!rawUseCase || typeof rawUseCase !== 'string') {
@@ -171,102 +488,22 @@ export function normalizeUseCase(rawUseCase: string): string {
const useCase = rawUseCase.toLowerCase().trim()
// Content Creation & Marketing
// Handle common undefined responses
if (
useCase.match(
/content creation|social media|marketing|advertising|youtube|tiktok|instagram|thumbnails/
)
) {
return 'Content Creation & Marketing'
}
// Art & Illustration
if (
useCase.match(
/art|illustration|drawing|painting|concept art|character design|digital art/
)
) {
return 'Art & Illustration'
}
// Product Visualization & Design
if (
useCase.match(
/product|visualization|design|prototype|mockup|3d rendering|industrial design/
)
) {
return 'Product Visualization & Design'
}
// Film & Video Production
if (
useCase.match(
/film|video|movie|animation|vfx|visual effects|storyboard|cinematography/
)
) {
return 'Film & Video Production'
}
// Gaming & Interactive Media
if (
useCase.match(/game|gaming|interactive|vr|ar|virtual|simulation|metaverse/)
) {
return 'Gaming & Interactive Media'
}
// Architecture & Construction
if (
useCase.match(
/architecture|building|construction|interior design|landscape|real estate/
)
) {
return 'Architecture & Construction'
}
// Education & Training
if (
useCase.match(
/education|training|learning|teaching|tutorial|course|academic/
)
) {
return 'Education & Training'
}
// Research & Development
if (
useCase.match(
/research|development|experiment|prototype|testing|analysis|study/
)
) {
return 'Research & Development'
}
// Personal & Hobby
if (
useCase.match(/personal|hobby|fun|experiment|learning|curiosity|explore/)
) {
return 'Personal & Hobby'
}
// Photography & Image Processing
if (
useCase.match(
/photography|photo|image|portrait|editing|enhancement|restoration/
)
) {
return 'Photography & Image Processing'
}
// Other / Undefined
if (
useCase.match(
/other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
)
useCase.match(/^(other|none|undefined|unknown|n\/a|not applicable|-|)$/)
) {
return 'Other / Undefined'
}
// Uncategorized - preserve original but prefix for analysis
// Fuzzy search for best category match
const results = useCaseFuse.search(useCase)
if (results.length > 0 && results[0].score! < 0.6) {
// Good match found
return results[0].item.name
}
// No good match found - preserve original with prefix
return `Uncategorized: ${rawUseCase}`
}