[feat] Add survey response normalization system

Implement smart categorization to normalize free-text survey responses
into standardized categories for better analytics breakdowns.

Key features:
- Industry normalization: 16 major categories based on analysis of ~9,000 user responses
- Use case normalization: 10 common patterns for workflow purposes
- Dual storage: normalized + raw values preserved
- Migration utility: script for cleaning existing Mixpanel data
- Pattern matching: regex-based categorization with fallback handling

Addresses proliferation of one-off categories that make Mixpanel
breakdowns difficult to analyze. Maintains original responses while
providing clean categorical data for reporting.
This commit is contained in:
bymyself
2025-10-29 20:01:18 -07:00
parent 0c04f00da0
commit a3b7417384
3 changed files with 547 additions and 1 deletions

View File

@@ -0,0 +1,234 @@
#!/usr/bin/env node
/**
* Survey Data Migration Script
*
* One-time utility to normalize existing Mixpanel user properties
* for industry and use case fields. This addresses the proliferation
* of one-off categories that make analytics difficult.
*
* Usage: pnpm ts-node scripts/survey-data-migration.ts
*
* IMPORTANT: This script requires Mixpanel Data Management API access
* and should be run with appropriate credentials in production.
*/
/* eslint-disable no-console */
import {
normalizeIndustry,
normalizeUseCase
} from '../src/platform/telemetry/utils/surveyNormalization'
/**
 * Minimal shape of a Mixpanel user profile as consumed by this script.
 *
 * Only the two survey fields are typed explicitly; every other profile
 * property is carried through the index signature.
 */
interface MixpanelUser {
  /** Identifier printed when reporting which profiles would be updated. */
  $distinct_id: string
  $properties: {
    /** Free-text industry survey answer, when present. */
    industry?: string
    /** Free-text use-case survey answer, when present. */
    useCase?: string
    // `unknown` instead of `any`: properties this script does not know about
    // must be narrowed before use rather than silently bypassing the checker.
    [key: string]: unknown
  }
}
/**
 * Summary produced by one run of simulateMigration.
 */
interface MigrationStats {
// Number of user profiles passed to the run.
totalUsers: number
// Profiles whose industry value differed from its normalized form.
industryNormalized: number
// Profiles whose use-case value differed from its normalized form.
useCaseNormalized: number
// Distinct raw industry answers that normalized to an "Uncategorized:" value.
uncategorizedIndustries: Set<string>
// Distinct raw use-case answers that normalized to an "Uncategorized:" value.
uncategorizedUseCases: Set<string>
}
/**
 * Dry-run the survey migration over a list of user profiles.
 *
 * Nothing is written anywhere: each pending change is printed with
 * console.log and summarised in the returned stats. A field is counted as
 * normalized only when the normalizer's output differs from the stored value.
 *
 * @param users - Profiles to inspect.
 * @returns Totals plus the distinct raw answers whose normalized form starts
 *   with "Uncategorized:".
 */
function simulateMigration(users: MixpanelUser[]): MigrationStats {
  const uncategorizedIndustries = new Set<string>()
  const uncategorizedUseCases = new Set<string>()
  let industryNormalized = 0
  let useCaseNormalized = 0

  for (const { $distinct_id, $properties } of users) {
    // Property changes that a real migration would send for this profile.
    const pending: Record<string, string> = {}

    const rawIndustry = $properties.industry
    if (rawIndustry) {
      const cleanIndustry = normalizeIndustry(rawIndustry)
      if (cleanIndustry !== rawIndustry) {
        pending.industry_normalized = cleanIndustry
        pending.industry_raw = rawIndustry
        industryNormalized += 1
        if (cleanIndustry.startsWith('Uncategorized:')) {
          uncategorizedIndustries.add(rawIndustry)
        }
      }
    }

    const rawUseCase = $properties.useCase
    if (rawUseCase) {
      const cleanUseCase = normalizeUseCase(rawUseCase)
      if (cleanUseCase !== rawUseCase) {
        pending.useCase_normalized = cleanUseCase
        pending.useCase_raw = rawUseCase
        useCaseNormalized += 1
        if (cleanUseCase.startsWith('Uncategorized:')) {
          uncategorizedUseCases.add(rawUseCase)
        }
      }
    }

    // In production, this is where the profile update call would go.
    if (Object.keys(pending).length > 0) {
      console.log(`Would update user ${$distinct_id}:`, pending)
    }
  }

  return {
    totalUsers: users.length,
    industryNormalized,
    useCaseNormalized,
    uncategorizedIndustries,
    uncategorizedUseCases
  }
}
/**
 * Build the fixed set of eight sample profiles used to exercise the
 * normalization rules.
 *
 * @returns Profiles with ids `user1` through `user8`, each carrying one
 *   industry answer and one use-case answer.
 */
function generateSampleData(): MixpanelUser[] {
  // Each row is [industry answer, use-case answer]; ids follow row order.
  const answers: ReadonlyArray<readonly [string, string]> = [
    ['Film and television production', 'Creating concept art for movies'],
    ['Marketing & Social Media', 'YouTube thumbnail generation'],
    ['Software Development', 'Product mockup creation'],
    ['Indie Game Studio', 'Game asset generation'],
    ['Architecture firm', 'Building visualization'],
    ['Custom Jewelry Design', 'Product photography'],
    ['Medical Research', 'Scientific visualization'],
    ['Unknown Creative Field', 'Personal art projects']
  ]

  return answers.map(([industry, useCase], position) => ({
    $distinct_id: `user${position + 1}`,
    $properties: { industry, useCase }
  }))
}
/**
 * Print the outline of the real migration procedure.
 *
 * This function performs no migration; it only logs the intended steps.
 * The commented sketch in the body is illustrative, not a working
 * implementation.
 */
async function productionMigration() {
  const outline = [
    '🔧 Production Migration Process:',
    '1. Export user profiles via Mixpanel Data Management API',
    '2. Apply normalization to industry and useCase fields',
    '3. Create new properties: industry_normalized, useCase_normalized',
    '4. Preserve original values as: industry_raw, useCase_raw',
    '5. Batch update user profiles',
    '6. Generate uncategorized response report for review'
  ]
  for (const step of outline) {
    console.log(step)
  }
  /*
  Example API calls:
  // 1. Export users
  const users = await mixpanel.people.query({
  where: 'properties["industry"] != null or properties["useCase"] != null'
  })
  // 2. Process and update
  for (const user of users) {
  const normalizedData = normalizeSurveyResponses(user.properties)
  await mixpanel.people.set(user.distinct_id, normalizedData)
  }
  */
}
/**
 * Script entry point: run the simulation against the built-in sample
 * profiles, print the resulting counts and review lists, then print the
 * production procedure outline.
 */
function main() {
  // Prints a heading followed by one entry per line; silent when empty.
  const printReviewList = (heading: string, entries: Set<string>) => {
    if (entries.size === 0) return
    console.log(heading)
    for (const entry of entries) {
      console.log(`${entry}`)
    }
  }

  console.log('📊 Survey Data Migration Utility')
  console.log('================================\n')

  console.log('🧪 Running simulation with sample data...\n')
  const stats = simulateMigration(generateSampleData())

  console.log('📈 Migration Results:')
  console.log(`Total users processed: ${stats.totalUsers}`)
  console.log(`Industry fields normalized: ${stats.industryNormalized}`)
  console.log(`Use case fields normalized: ${stats.useCaseNormalized}`)

  printReviewList(
    '\n❓ Uncategorized Industries (need review):',
    stats.uncategorizedIndustries
  )
  printReviewList(
    '\n❓ Uncategorized Use Cases (need review):',
    stats.uncategorizedUseCases
  )

  console.log(`\n${'='.repeat(50)}`)
  // The returned promise is deliberately not awaited.
  void productionMigration()
}
// Run main() only when this file is the process entry point
// (require.main === module); importing the module does not trigger a run.
if (require.main === module) {
main()
}
// Names made available to importers of this module.
export { simulateMigration, generateSampleData, MigrationStats }

View File

@@ -7,6 +7,7 @@ import { app } from '@/scripts/app'
import { useNodeDefStore } from '@/stores/nodeDefStore'
import { NodeSourceType } from '@/types/nodeSource'
import { reduceAllNodes } from '@/utils/graphTraversalUtil'
import { normalizeSurveyResponses } from '../../utils/surveyNormalization'
import type {
AuthMetadata,
@@ -178,7 +179,21 @@ export class MixpanelTelemetryProvider implements TelemetryProvider {
? TelemetryEvents.USER_SURVEY_OPENED
: TelemetryEvents.USER_SURVEY_SUBMITTED
this.trackEvent(eventName, responses)
// Apply normalization to survey responses
const normalizedResponses = responses
? normalizeSurveyResponses(responses)
: undefined
this.trackEvent(eventName, normalizedResponses)
// If this is a survey submission, also set user properties with normalized data
if (stage === 'submitted' && normalizedResponses && this.mixpanel) {
try {
this.mixpanel.people.set(normalizedResponses)
} catch (error) {
console.error('Failed to set survey user properties:', error)
}
}
}
trackEmailVerification(stage: 'opened' | 'requested' | 'completed'): void {

View File

@@ -0,0 +1,297 @@
/**
* Survey Response Normalization Utilities
*
* Smart categorization system to normalize free-text survey responses
* into standardized categories for better analytics breakdowns.
*/
/**
 * Ordered industry rules, applied to the lower-cased, trimmed answer.
 * The first pattern that matches decides the category, so order matters.
 *
 * Two- and three-letter tokens (it, ai, art, ar, vr, tv, cad, vj, ngo) are
 * wrapped in word boundaries. As bare substrings they matched inside
 * unrelated words ("it" in "architecture"/"hospital", "ai" in
 * "retail"/"painting", "ar" in "marketing") and hijacked the category.
 * User counts come from the analysis of ~9,000 existing responses.
 */
const INDUSTRY_RULES: ReadonlyArray<readonly [string, RegExp]> = [
  // ~2,885 users
  [
    'Film / TV / Animation',
    /film|\btv\b|animation|story|anime|video|cinematography|visual effects|vfx|movie|cinema/
  ],
  // ~1,340 users
  [
    'Marketing / Advertising / Social Media',
    /marketing|advertising|youtube|tiktok|social media|content creation|influencer|brand|promotion/
  ],
  // ~1,100 users
  [
    'Software / IT / AI',
    /software|\bit\b|\bai\b|developer|consulting|engineering|tech|programmer|data science|machine learning/
  ],
  // ~1,050 users
  [
    'Product & Industrial Design',
    /product.?design|industrial|manufacturing|3d rendering|product visualization|mechanical|automotive/
  ],
  // ~780 users
  [
    'Fine Art / Contemporary Art',
    /fine.?art|\bart(?:s|work|works)?\b|illustration|contemporary|artist|painting|drawing|sculpture|gallery/
  ],
  // ~640 users
  [
    'Education / Research',
    /education|student|teacher|research|learning|university|school|academic|professor/
  ],
  // ~420 users
  [
    'Architecture / Engineering / Construction',
    /architecture|construction|engineering|civil|\bcad\b|building|structural|landscape/
  ],
  // ~410 users
  [
    'Gaming / Interactive Media',
    /gaming|game dev|roblox|interactive|virtual world|\bvr\b|\bar\b|metaverse|simulation/
  ],
  // ~70 users
  [
    'Photography / Videography',
    /photography|photo|videography|camera|image|portrait|wedding|commercial photo/
  ],
  // ~25 users
  [
    'Fashion / Beauty / Retail',
    /fashion|beauty|jewelry|retail|style|clothing|cosmetics|makeup/
  ],
  // ~25 users
  [
    'Music / Performing Arts',
    /music|\bvj\b|dance|projection mapping|audio visual|concert|performance|theater/
  ],
  // ~30 users
  [
    'Healthcare / Medical / Life Science',
    /healthcare|medical|doctor|biotech|life science|pharmaceutical|clinical|hospital/
  ],
  // ~15 users
  [
    'E-commerce / Print-on-Demand / Business',
    /ecommerce|print on demand|shop|business|commercial|startup|entrepreneur|sales/
  ],
  // ~15 users
  [
    'Nonprofit / Government / Public Sector',
    /501c3|\bngo\b|government|public service|policy|nonprofit|charity|civic/
  ],
  // ~10 users
  ['Adult / NSFW', /nsfw|adult|erotic|explicit|xxx|porn/],
  // Common "no answer" responses; `^$` catches whitespace-only input,
  // which is empty after trimming.
  [
    'Other / Undefined',
    /other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
  ]
]

/**
 * Normalize a free-text industry answer into a standardized category.
 *
 * @param rawIndustry - The answer exactly as the user entered it.
 * @returns The first matching category name; `'Other / Undefined'` for
 *   empty or non-string input; otherwise `Uncategorized: <raw answer>` so
 *   unmatched answers stay visible for later review.
 */
export function normalizeIndustry(rawIndustry: string): string {
  if (!rawIndustry || typeof rawIndustry !== 'string') {
    return 'Other / Undefined'
  }
  const industry = rawIndustry.toLowerCase().trim()
  for (const [category, pattern] of INDUSTRY_RULES) {
    if (pattern.test(industry)) {
      return category
    }
  }
  // Keep the original text, prefixed so it can be filtered in analytics.
  return `Uncategorized: ${rawIndustry}`
}
/**
 * Ordered use-case rules, applied to the lower-cased, trimmed answer.
 * The first pattern that matches decides the category, so order matters.
 *
 * Short tokens (art, ar, vr, fun) are wrapped in word boundaries. As bare
 * substrings, "ar" matched "research" and "learning" and sent them to
 * Gaming, and "art" matched words such as "startup". The "art" rule still
 * accepts arts, artist(s), artistic and artwork(s).
 */
const USE_CASE_RULES: ReadonlyArray<readonly [string, RegExp]> = [
  [
    'Content Creation & Marketing',
    /content creation|social media|marketing|advertising|youtube|tiktok|instagram|thumbnails/
  ],
  [
    'Art & Illustration',
    /\bart(?:s|ist|ists|istic|work|works)?\b|illustration|drawing|painting|concept art|character design|digital art/
  ],
  [
    'Product Visualization & Design',
    /product|visualization|design|prototype|mockup|3d rendering|industrial design/
  ],
  [
    'Film & Video Production',
    /film|video|movie|animation|vfx|visual effects|storyboard|cinematography/
  ],
  [
    'Gaming & Interactive Media',
    /game|gaming|interactive|\bvr\b|\bar\b|virtual|simulation|metaverse/
  ],
  [
    'Architecture & Construction',
    /architecture|building|construction|interior design|landscape|real estate/
  ],
  [
    'Education & Training',
    /education|training|learning|teaching|tutorial|course|academic/
  ],
  [
    'Research & Development',
    /research|development|experiment|prototype|testing|analysis|study/
  ],
  [
    'Personal & Hobby',
    /personal|hobby|\bfun\b|experiment|learning|curiosity|explore/
  ],
  [
    'Photography & Image Processing',
    /photography|photo|image|portrait|editing|enhancement|restoration/
  ],
  // Common "no answer" responses; `^$` catches whitespace-only input,
  // which is empty after trimming.
  [
    'Other / Undefined',
    /other|none|undefined|unknown|n\/a|not applicable|^-$|^$|misc/
  ]
]

/**
 * Normalize a free-text use-case answer into a standardized category.
 *
 * @param rawUseCase - The answer exactly as the user entered it.
 * @returns The first matching category name; `'Other / Undefined'` for
 *   empty or non-string input; otherwise `Uncategorized: <raw answer>` so
 *   unmatched answers stay visible for later review.
 */
export function normalizeUseCase(rawUseCase: string): string {
  if (!rawUseCase || typeof rawUseCase !== 'string') {
    return 'Other / Undefined'
  }
  const useCase = rawUseCase.toLowerCase().trim()
  for (const [category, pattern] of USE_CASE_RULES) {
    if (pattern.test(useCase)) {
      return category
    }
  }
  // Keep the original text, prefixed so it can be filtered in analytics.
  return `Uncategorized: ${rawUseCase}`
}
/**
 * Add normalized survey fields to a response object.
 *
 * Returns a shallow copy of `responses`; the input is not modified. For
 * each of `industry` and `useCase` that holds a truthy value, the copy
 * gains `<field>_normalized` (the category) and `<field>_raw` (the
 * original answer). All other keys are carried over unchanged.
 */
export function normalizeSurveyResponses(responses: {
  industry?: string
  useCase?: string
  [key: string]: any
}) {
  const { industry, useCase } = responses
  const enriched = { ...responses }

  if (industry) {
    Object.assign(enriched, {
      industry_normalized: normalizeIndustry(industry),
      industry_raw: industry
    })
  }

  if (useCase) {
    Object.assign(enriched, {
      useCase_normalized: normalizeUseCase(useCase),
      useCase_raw: useCase
    })
  }

  return enriched
}