From 8c3116d6ff9f320b1a3304ff7a694c127da8cd46 Mon Sep 17 00:00:00 2001
From: bymyself
Date: Wed, 29 Oct 2025 20:10:52 -0700
Subject: [PATCH] [refactor] Upgrade survey normalization to use fuzzy search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace regex pattern matching with Fuse.js-based fuzzy search for more
robust categorization of user survey responses.

Improvements:
- Category mapping system with keyword arrays
- Fuzzy matching handles typos and partial matches
- Fuse threshold (0.4) plus a named score cutoff (MAX_MATCH_SCORE = 0.6)
  for match quality
- Shared findCategory helper and UNDEFINED_RESPONSE filter used by both
  normalizers; "misc" / "miscellaneous" still map to "Other / Undefined"
- Expanded keyword coverage for better categorization
- Maintains existing 16 industry + 10 use case categories
- Preserves fallback to "Uncategorized:" prefix

Examples now handled:
- "animtion" → "Film / TV / Animation" (typo correction)
- "game dev" → "Gaming / Interactive Media" (partial match)
- "social content" → "Marketing / Advertising / Social Media" (similarity)
---
 .../telemetry/utils/surveyNormalization.ts    | 747 ++++++++++++------
 1 file changed, 512 insertions(+), 235 deletions(-)

diff --git a/src/platform/telemetry/utils/surveyNormalization.ts b/src/platform/telemetry/utils/surveyNormalization.ts
index 3c2cb6f69..2a8312dbe 100644
--- a/src/platform/telemetry/utils/surveyNormalization.ts
+++ b/src/platform/telemetry/utils/surveyNormalization.ts
@@ -3,11 +3,498 @@
  *
  * Smart categorization system to normalize free-text survey responses
  * into standardized categories for better analytics breakdowns.
+ * Uses Fuse.js for fuzzy matching against category keywords.
  */
 
+import Fuse from 'fuse.js'
+
+interface CategoryMapping {
+  name: string
+  keywords: string[]
+  userCount?: number // For reference from analysis
+}
+
 /**
- * Normalize industry responses into standardized categories
- * Based on analysis of ~9,000 existing user responses
+ * Industry category mappings based on ~9,000 user analysis
+ */
+const INDUSTRY_CATEGORIES: CategoryMapping[] = [
+  {
+    name: 'Film / TV / Animation',
+    userCount: 2885,
+    keywords: [
+      'film',
+      'tv',
+      'television',
+      'animation',
+      'story',
+      'anime',
+      'video',
+      'cinematography',
+      'visual effects',
+      'vfx',
+      'movie',
+      'cinema',
+      'documentary',
+      'broadcast',
+      'streaming',
+      'production',
+      'director',
+      'filmmaker',
+      'post-production',
+      'editing'
+    ]
+  },
+  {
+    name: 'Marketing / Advertising / Social Media',
+    userCount: 1340,
+    keywords: [
+      'marketing',
+      'advertising',
+      'youtube',
+      'tiktok',
+      'social media',
+      'content creation',
+      'influencer',
+      'brand',
+      'promotion',
+      'digital marketing',
+      'seo',
+      'campaigns',
+      'copywriting',
+      'growth',
+      'engagement'
+    ]
+  },
+  {
+    name: 'Software / IT / AI',
+    userCount: 1100,
+    keywords: [
+      'software',
+      'it',
+      'ai',
+      'developer',
+      'consulting',
+      'engineering',
+      'tech',
+      'programmer',
+      'data science',
+      'machine learning',
+      'coding',
+      'programming',
+      'web development',
+      'app development',
+      'saas',
+      'startup'
+    ]
+  },
+  {
+    name: 'Product & Industrial Design',
+    userCount: 1050,
+    keywords: [
+      'product design',
+      'industrial',
+      'manufacturing',
+      '3d rendering',
+      'product visualization',
+      'mechanical',
+      'automotive',
+      'cad',
+      'prototype',
+      'design engineering',
+      'invention'
+    ]
+  },
+  {
+    name: 'Fine Art / Contemporary Art',
+    userCount: 780,
+    keywords: [
+      'fine art',
+      'art',
+      'illustration',
+      'contemporary',
+      'artist',
+      'painting',
+      'drawing',
+      'sculpture',
+      'gallery',
+      'canvas',
+      'digital art',
+      'mixed media',
+      'abstract',
+      'portrait'
+    ]
+  },
+  {
+    name: 'Education / Research',
+    userCount: 640,
+    keywords: [
+      'education',
+      'student',
+      'teacher',
+      'research',
+      'learning',
+      'university',
+      'school',
+      'academic',
+      'professor',
+      'curriculum',
+      'training',
+      'instruction',
+      'pedagogy'
+    ]
+  },
+  {
+    name: 'Architecture / Engineering / Construction',
+    userCount: 420,
+    keywords: [
+      'architecture',
+      'construction',
+      'engineering',
+      'civil',
+      'cad',
+      'building',
+      'structural',
+      'landscape',
+      'interior design',
+      'real estate',
+      'planning',
+      'blueprints'
+    ]
+  },
+  {
+    name: 'Gaming / Interactive Media',
+    userCount: 410,
+    keywords: [
+      'gaming',
+      'game dev',
+      'game development',
+      'roblox',
+      'interactive',
+      'virtual world',
+      'vr',
+      'ar',
+      'metaverse',
+      'simulation',
+      'unity',
+      'unreal',
+      'indie games'
+    ]
+  },
+  {
+    name: 'Photography / Videography',
+    userCount: 70,
+    keywords: [
+      'photography',
+      'photo',
+      'videography',
+      'camera',
+      'image',
+      'portrait',
+      'wedding',
+      'commercial photo',
+      'stock photography',
+      'photojournalism',
+      'event photography'
+    ]
+  },
+  {
+    name: 'Fashion / Beauty / Retail',
+    userCount: 25,
+    keywords: [
+      'fashion',
+      'beauty',
+      'jewelry',
+      'retail',
+      'style',
+      'clothing',
+      'cosmetics',
+      'makeup',
+      'accessories',
+      'boutique',
+      'ecommerce'
+    ]
+  },
+  {
+    name: 'Music / Performing Arts',
+    userCount: 25,
+    keywords: [
+      'music',
+      'vj',
+      'dance',
+      'projection mapping',
+      'audio visual',
+      'concert',
+      'performance',
+      'theater',
+      'stage',
+      'live events'
+    ]
+  },
+  {
+    name: 'Healthcare / Medical / Life Science',
+    userCount: 30,
+    keywords: [
+      'healthcare',
+      'medical',
+      'doctor',
+      'biotech',
+      'life science',
+      'pharmaceutical',
+      'clinical',
+      'hospital',
+      'medicine',
+      'health'
+    ]
+  },
+  {
+    name: 'E-commerce / Print-on-Demand / Business',
+    userCount: 15,
+    keywords: [
+      'ecommerce',
+      'print on demand',
+      'shop',
+      'business',
+      'commercial',
+      'startup',
+      'entrepreneur',
+      'sales',
+      'retail',
+      'online store'
+    ]
+  },
+  {
+    name: 'Nonprofit / Government / Public Sector',
+    userCount: 15,
+    keywords: [
+      '501c3',
+      'ngo',
+      'government',
+      'public service',
+      'policy',
+      'nonprofit',
+      'charity',
+      'civic',
+      'community',
+      'social impact'
+    ]
+  },
+  {
+    name: 'Adult / NSFW',
+    userCount: 10,
+    keywords: ['nsfw', 'adult', 'erotic', 'explicit', 'xxx', 'porn']
+  }
+]
+
+/**
+ * Use case category mappings based on common patterns
+ */
+const USE_CASE_CATEGORIES: CategoryMapping[] = [
+  {
+    name: 'Content Creation & Marketing',
+    keywords: [
+      'content creation',
+      'social media',
+      'marketing',
+      'advertising',
+      'youtube',
+      'tiktok',
+      'instagram',
+      'thumbnails',
+      'posts',
+      'campaigns',
+      'brand content'
+    ]
+  },
+  {
+    name: 'Art & Illustration',
+    keywords: [
+      'art',
+      'illustration',
+      'drawing',
+      'painting',
+      'concept art',
+      'character design',
+      'digital art',
+      'fantasy art',
+      'portraits'
+    ]
+  },
+  {
+    name: 'Product Visualization & Design',
+    keywords: [
+      'product',
+      'visualization',
+      'design',
+      'prototype',
+      'mockup',
+      '3d rendering',
+      'industrial design',
+      'product photos'
+    ]
+  },
+  {
+    name: 'Film & Video Production',
+    keywords: [
+      'film',
+      'video',
+      'movie',
+      'animation',
+      'vfx',
+      'visual effects',
+      'storyboard',
+      'cinematography',
+      'post production'
+    ]
+  },
+  {
+    name: 'Gaming & Interactive Media',
+    keywords: [
+      'game',
+      'gaming',
+      'interactive',
+      'vr',
+      'ar',
+      'virtual',
+      'simulation',
+      'metaverse',
+      'game assets',
+      'textures'
+    ]
+  },
+  {
+    name: 'Architecture & Construction',
+    keywords: [
+      'architecture',
+      'building',
+      'construction',
+      'interior design',
+      'landscape',
+      'real estate',
+      'floor plans',
+      'renderings'
+    ]
+  },
+  {
+    name: 'Education & Training',
+    keywords: [
+      'education',
+      'training',
+      'learning',
+      'teaching',
+      'tutorial',
+      'course',
+      'academic',
+      'instructional',
+      'workshops'
+    ]
+  },
+  {
+    name: 'Research & Development',
+    keywords: [
+      'research',
+      'development',
+      'experiment',
+      'prototype',
+      'testing',
+      'analysis',
+      'study',
+      'innovation',
+      'r&d'
+    ]
+  },
+  {
+    name: 'Personal & Hobby',
+    keywords: [
+      'personal',
+      'hobby',
+      'fun',
+      'experiment',
+      'learning',
+      'curiosity',
+      'explore',
+      'creative',
+      'side project'
+    ]
+  },
+  {
+    name: 'Photography & Image Processing',
+    keywords: [
+      'photography',
+      'photo',
+      'image',
+      'portrait',
+      'editing',
+      'enhancement',
+      'restoration',
+      'photo manipulation'
+    ]
+  }
+]
+
+/**
+ * Fuzzy search configuration for category matching
+ */
+const FUSE_OPTIONS = {
+  keys: ['keywords'],
+  threshold: 0.4, // Lower = more strict matching
+  includeScore: true,
+  includeMatches: true,
+  ignoreLocation: true,
+  findAllMatches: true
+}
+
+/**
+ * Highest Fuse result score still accepted as a category match
+ * (0 = perfect match, 1 = complete mismatch). Applied to the ranked
+ * results in addition to FUSE_OPTIONS.threshold.
+ */
+const MAX_MATCH_SCORE = 0.6
+
+/**
+ * Whole-response answers that mean "no answer given". Tested against the
+ * lowercased, trimmed response, so the empty alternative catches
+ * whitespace-only input.
+ */
+const UNDEFINED_RESPONSE =
+  /^(other|none|undefined|unknown|n\/a|not applicable|misc(ellaneous)?|-|)$/
+
+/**
+ * Create Fuse instances for category matching
+ */
+const industryFuse = new Fuse(INDUSTRY_CATEGORIES, FUSE_OPTIONS)
+const useCaseFuse = new Fuse(USE_CASE_CATEGORIES, FUSE_OPTIONS)
+
+/**
+ * Look up the best-scoring category for a normalized response.
+ *
+ * NOTE(review): the whole response is used as the search pattern against
+ * each keyword, so sentence-length answers may rarely match - verify with
+ * real survey data.
+ *
+ * @param fuse - Fuse index built over the category list to search
+ * @param response - Lowercased, trimmed survey response
+ * @returns The category name, or undefined when no result scores below
+ *   MAX_MATCH_SCORE
+ */
+function findCategory(
+  fuse: Fuse<CategoryMapping>,
+  response: string
+): string | undefined {
+  // Fuse returns results sorted best-first, so only the head matters
+  const [best] = fuse.search(response)
+  if (best?.score !== undefined && best.score < MAX_MATCH_SCORE) {
+    return best.item.name
+  }
+  return undefined
+}
+
+/**
+ * Normalize a free-text industry response into a standard category.
+ *
+ * @param rawIndustry - Survey answer exactly as the user typed it
+ * @returns A category name, 'Other / Undefined' for empty or non-answers,
+ *   or `Uncategorized: <original>` when no category matches well enough
  */
 export function normalizeIndustry(rawIndustry: string): string {
   if (!rawIndustry || typeof rawIndustry !== 'string') {
@@ -16,153 +503,27 @@ export function normalizeIndustry(rawIndustry: string): string {
 
   const industry = rawIndustry.toLowerCase().trim()
 
-  // Film / TV / Animation (~2,885 users)
-  if (
-    industry.match(
-      /film|tv|animation|story|anime|video|cinematography|visual effects|vfx|movie|cinema/
-    )
-  ) {
-    return 'Film / TV / Animation'
-  }
-
-  // Marketing / Advertising / Social Media (~1,340 users)
-  if (
-    industry.match(
-      /marketing|advertising|youtube|tiktok|social media|content creation|influencer|brand|promotion/
-    )
-  ) {
-    return 'Marketing / Advertising / Social Media'
-  }
-
-  // Software / IT / AI (~1,100 users)
-  if (
-    industry.match(
-      /software|it|ai|developer|consulting|engineering|tech|programmer|data science|machine learning/
-    )
-  ) {
-    return 'Software / IT / AI'
-  }
-
-  // Product & Industrial Design (~1,050 users)
-  if (
-    industry.match(
-      /product.?design|industrial|manufacturing|3d rendering|product visualization|mechanical|automotive/
-    )
-  ) {
-    return 'Product & Industrial Design'
-  }
-
-  // Fine Art / Contemporary Art (~780 users)
-  if (
-    industry.match(
-      /fine.?art|art|illustration|contemporary|artist|painting|drawing|sculpture|gallery/
-    )
-  ) {
-    return 'Fine Art / Contemporary Art'
-  }
-
-  // Education / Research (~640 users)
-  if (
-    industry.match(
-      /education|student|teacher|research|learning|university|school|academic|professor/
-    )
-  ) {
-    return 'Education / Research'
-  }
-
-  // Architecture / Engineering / Construction (~420 users)
-  if (
-    industry.match(
-      /architecture|construction|engineering|civil|cad|building|structural|landscape/
-    )
-  ) {
-    return 'Architecture / Engineering / Construction'
-  }
-
-  // Gaming / Interactive Media (~410 users)
-  if (
-    industry.match(
-      /gaming|game dev|roblox|interactive|virtual world|vr|ar|metaverse|simulation/
-    )
-  ) {
-    return 'Gaming / Interactive Media'
-  }
-
-  // Photography / Videography (~70 users)
-  if (
-    industry.match(
-      /photography|photo|videography|camera|image|portrait|wedding|commercial photo/
-    )
-  ) {
-    return 'Photography / Videography'
-  }
-
-  // Fashion / Beauty / Retail (~25 users)
-  if (
-    industry.match(
-      /fashion|beauty|jewelry|retail|style|clothing|cosmetics|makeup/
-    )
-  ) {
-    return 'Fashion / Beauty / Retail'
-  }
-
-  // Music / Performing Arts (~25 users)
-  if (
-    industry.match(
-      /music|vj|dance|projection mapping|audio visual|concert|performance|theater/
-    )
-  ) {
-    return 'Music / Performing Arts'
-  }
-
-  // Healthcare / Medical / Life Science (~30 users)
-  if (
-    industry.match(
-      /healthcare|medical|doctor|biotech|life science|pharmaceutical|clinical|hospital/
-    )
-  ) {
-    return 'Healthcare / Medical / Life Science'
-  }
-
-  // E-commerce / Print-on-Demand / Business (~15 users)
-  if (
-    industry.match(
-      /ecommerce|print on demand|shop|business|commercial|startup|entrepreneur|sales/
-    )
-  ) {
-    return 'E-commerce / Print-on-Demand / Business'
-  }
-
-  // Nonprofit / Government / Public Sector (~15 users)
-  if (
-    industry.match(
-      /501c3|ngo|government|public service|policy|nonprofit|charity|civic/
-    )
-  ) {
-    return 'Nonprofit / Government / Public Sector'
-  }
-
-  // Adult / NSFW (~10 users)
-  if (industry.match(/nsfw|adult|erotic|explicit|xxx|porn/)) {
-    return 'Adult / NSFW'
-  }
-
-  // Other / Undefined - catch common undefined responses
-  if (
-    industry.match(
-      /other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
-    )
-  ) {
+  // Handle common undefined responses
+  if (UNDEFINED_RESPONSE.test(industry)) {
     return 'Other / Undefined'
   }
 
-  // Uncategorized - preserve original but prefix for analysis
+  // Fuzzy search for best category match
+  const category = findCategory(industryFuse, industry)
+  if (category !== undefined) {
+    return category
+  }
+
+  // No good match found - preserve original with prefix
   return `Uncategorized: ${rawIndustry}`
 }
 
 /**
- * Normalize use case responses into standardized categories
- * Based on common patterns in user responses
+ * Normalize a free-text use case response into a standard category.
+ *
+ * @param rawUseCase - Survey answer exactly as the user typed it
+ * @returns A category name, 'Other / Undefined' for empty or non-answers,
+ *   or `Uncategorized: <original>` when no category matches well enough
  */
 export function normalizeUseCase(rawUseCase: string): string {
   if (!rawUseCase || typeof rawUseCase !== 'string') {
@@ -171,102 +532,18 @@ export function normalizeUseCase(rawUseCase: string): string {
 
   const useCase = rawUseCase.toLowerCase().trim()
 
-  // Content Creation & Marketing
-  if (
-    useCase.match(
-      /content creation|social media|marketing|advertising|youtube|tiktok|instagram|thumbnails/
-    )
-  ) {
-    return 'Content Creation & Marketing'
-  }
-
-  // Art & Illustration
-  if (
-    useCase.match(
-      /art|illustration|drawing|painting|concept art|character design|digital art/
-    )
-  ) {
-    return 'Art & Illustration'
-  }
-
-  // Product Visualization & Design
-  if (
-    useCase.match(
-      /product|visualization|design|prototype|mockup|3d rendering|industrial design/
-    )
-  ) {
-    return 'Product Visualization & Design'
-  }
-
-  // Film & Video Production
-  if (
-    useCase.match(
-      /film|video|movie|animation|vfx|visual effects|storyboard|cinematography/
-    )
-  ) {
-    return 'Film & Video Production'
-  }
-
-  // Gaming & Interactive Media
-  if (
-    useCase.match(/game|gaming|interactive|vr|ar|virtual|simulation|metaverse/)
-  ) {
-    return 'Gaming & Interactive Media'
-  }
-
-  // Architecture & Construction
-  if (
-    useCase.match(
-      /architecture|building|construction|interior design|landscape|real estate/
-    )
-  ) {
-    return 'Architecture & Construction'
-  }
-
-  // Education & Training
-  if (
-    useCase.match(
-      /education|training|learning|teaching|tutorial|course|academic/
-    )
-  ) {
-    return 'Education & Training'
-  }
-
-  // Research & Development
-  if (
-    useCase.match(
-      /research|development|experiment|prototype|testing|analysis|study/
-    )
-  ) {
-    return 'Research & Development'
-  }
-
-  // Personal & Hobby
-  if (
-    useCase.match(/personal|hobby|fun|experiment|learning|curiosity|explore/)
-  ) {
-    return 'Personal & Hobby'
-  }
-
-  // Photography & Image Processing
-  if (
-    useCase.match(
-      /photography|photo|image|portrait|editing|enhancement|restoration/
-    )
-  ) {
-    return 'Photography & Image Processing'
-  }
-
-  // Other / Undefined
-  if (
-    useCase.match(
-      /other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
-    )
-  ) {
+  // Handle common undefined responses
+  if (UNDEFINED_RESPONSE.test(useCase)) {
     return 'Other / Undefined'
   }
 
-  // Uncategorized - preserve original but prefix for analysis
+  // Fuzzy search for best category match
+  const category = findCategory(useCaseFuse, useCase)
+  if (category !== undefined) {
+    return category
+  }
+
+  // No good match found - preserve original with prefix
   return `Uncategorized: ${rawUseCase}`
 }
 