Rust - String Processing

Overview

Estimated time: 50–70 minutes

Master advanced string processing in Rust including parsing, regular expressions, formatting, and text manipulation. Learn to work effectively with different string types and processing libraries.

Learning Objectives

Prerequisites

String Parsing

Convert strings to other data types using the parse method:

/// Demonstrates converting string slices into typed values with `str::parse`.
///
/// Every input is a hard-coded, well-formed literal, so a failed parse would be
/// a bug in this example; that is why `expect` is acceptable here.
fn main() {
    let number_str = "42";
    let float_str = "3.14159";
    let bool_str = "true";

    // Parse with explicit type annotation: the target type is taken from the
    // declared type of the binding.
    let number: i32 = number_str.parse().expect("Not a valid number");
    let pi: f64 = float_str.parse().expect("Not a valid float");
    let flag: bool = bool_str.parse().expect("Not a valid boolean");

    println!("Number: {}", number);
    println!("Pi: {}", pi);
    println!("Flag: {}", flag);

    // Parse with turbofish syntax: the target type is named on the call itself,
    // so the binding needs no annotation.
    let number2 = number_str.parse::<i32>().expect("Not a valid number");
    println!("Number2: {}", number2);
}

Expected output:

Number: 42
Pi: 3.14159
Flag: true
Number2: 42

Safe Parsing with Result

Handle parsing errors gracefully:

/// Demonstrates handling parse failures through the `Result` returned by
/// `str::parse` instead of panicking.
fn main() {
    // A fixed-size array is enough here; no heap-allocated Vec is needed.
    let inputs = ["42", "3.14", "not_a_number", "100"];

    for input in inputs {
        // The turbofish names the target type; "3.14" and "not_a_number" are
        // not valid i32 text, so they take the Err arm.
        match input.parse::<i32>() {
            Ok(number) => println!("'{}' parsed as: {}", input, number),
            Err(e) => println!("Failed to parse '{}': {}", input, e),
        }
    }

    // Using if let for concise error handling: the Err case is simply ignored.
    let maybe_number = "123".parse::<i32>();
    if let Ok(num) = maybe_number {
        println!("Successfully parsed: {}", num);
    }
}

Expected output:

'42' parsed as: 42
Failed to parse '3.14': invalid digit found in string
Failed to parse 'not_a_number': invalid digit found in string
'100' parsed as: 100
Successfully parsed: 123

String Formatting

Advanced formatting techniques with format! macro:

/// Walks through the formatting mini-language used by `println!` and `format!`.
fn main() {
    let (person, years, points) = ("Alice", 30, 95.7);

    // Plain `{}` placeholders are filled left to right.
    println!("Name: {}, Age: {}", person, years);

    // Indexed placeholders select arguments by position, in any order.
    println!("Score: {1}, Name: {0}", person, points);

    // Named placeholders are bound explicitly after the format string.
    println!("Hello {name}, you scored {score:.1}%", name = person, score = points);

    // A width plus `<`, `>` or `^` pads to the left, right or both sides.
    println!("Left:  '{:<10}'", person);
    println!("Right: '{:>10}'", person);
    println!("Center: '{:^10}'", person);

    // Integer radix and padding options.
    println!("Decimal: {:05}", years); // zero-padded to a width of 5
    println!("Hex: 0x{:X}", years); // uppercase hexadecimal digits
    println!("Binary: 0b{:08b}", years); // binary, zero-padded to 8 digits

    // Floating-point precision and exponent notation.
    println!("Float: {:.2}", points); // two digits after the decimal point
    let sample = 1234.567;
    println!("Scientific: {:.2e}", sample);
}

Expected output:

Name: Alice, Age: 30
Score: 95.7, Name: Alice
Hello Alice, you scored 95.7%
Left:  'Alice     '
Right: '     Alice'
Center: '  Alice   '
Decimal: 00030
Hex: 0x1E
Binary: 0b00011110
Float: 95.70
Scientific: 1.23e3

Text Processing

Common text processing operations:

/// Shows everyday text operations: trimming, case conversion, word splitting,
/// substring replacement and length measurement.
fn main() {
    let raw = "  Hello, World! This is Rust programming.  ";

    // Strip leading and trailing whitespace; the result borrows from `raw`.
    let clean = raw.trim();
    println!("Trimmed: '{}'", clean);

    // Case conversion produces new owned strings.
    let (upper, lower) = (clean.to_uppercase(), clean.to_lowercase());
    println!("Uppercase: {}", upper);
    println!("Lowercase: {}", lower);

    // Split on runs of whitespace; punctuation stays attached to its word.
    let tokens = clean.split_whitespace().collect::<Vec<_>>();
    println!("Words: {:?}", tokens);
    println!("Word count: {}", tokens.len());

    // Replace every occurrence of a substring.
    println!("Replaced: {}", clean.replace("World", "Rust"));

    // `chars().count()` counts characters, `len()` counts bytes.
    println!(
        "Characters: {}, Bytes: {}",
        clean.chars().count(),
        clean.len()
    );
}

Expected output:

Trimmed: 'Hello, World! This is Rust programming.'
Uppercase: HELLO, WORLD! THIS IS RUST PROGRAMMING.
Lowercase: hello, world! this is rust programming.
Words: ["Hello,", "World!", "This", "is", "Rust", "programming."]
Word count: 6
Replaced: Hello, Rust! This is Rust programming.
Characters: 39, Bytes: 39

Line Processing

Process text line by line:

/// Processes a multi-line string line by line: numbering every line, dropping
/// the empty ones, and re-joining the rest with a custom separator.
fn main() {
    let text = "Line 1\nLine 2\nLine 3\n\nLine 5";

    // Process each line. An empty line just leaves nothing after the "N: "
    // prefix, so it needs no separate branch.
    println!("Lines:");
    for (i, line) in text.lines().enumerate() {
        println!("{}: {}", i + 1, line);
    }

    // Filter and collect non-empty lines; the slices borrow from `text`.
    let non_empty_lines: Vec<&str> = text
        .lines()
        .filter(|line| !line.is_empty())
        .collect();

    println!("\nNon-empty lines: {:?}", non_empty_lines);

    // Join lines with a different separator
    let joined = non_empty_lines.join(" | ");
    println!("Joined: {}", joined);
}

Expected output:

Lines:
1: Line 1
2: Line 2
3: Line 3
4: 
5: Line 5

Non-empty lines: ["Line 1", "Line 2", "Line 3", "Line 5"]
Joined: Line 1 | Line 2 | Line 3 | Line 5

CSV Processing

Basic CSV parsing without external crates:

/// Parses a small in-memory CSV document without external crates: first
/// printing each row's raw fields, then converting the rows into `Person` values.
///
/// This is deliberately basic: it splits on every comma, so quoted fields that
/// contain commas are not supported, and it indexes the first three fields
/// directly, which panics on a row with fewer than three columns.
fn main() {
    /// One data row of the CSV document in typed form.
    #[derive(Debug)]
    struct Person {
        name: String,
        age: u32,
        city: String,
    }

    let csv_data = "name,age,city\nAlice,30,New York\nBob,25,Los Angeles\nCharlie,35,Chicago";

    // The first line is the header; everything after it is data.
    let lines: Vec<&str> = csv_data.lines().collect();
    let header = lines[0];
    let data_lines = &lines[1..];

    println!("Header: {}", header);
    println!("Data rows:");

    for (i, line) in data_lines.iter().enumerate() {
        let fields: Vec<&str> = line.split(',').collect();
        println!("  Row {}: name={}, age={}, city={}",
                i + 1, fields[0], fields[1], fields[2]);
    }

    // Parse into structured data. The element type must be spelled out so that
    // `collect` knows which collection to build.
    let people: Vec<Person> = data_lines
        .iter()
        .map(|line| {
            let fields: Vec<&str> = line.split(',').collect();
            Person {
                name: fields[0].to_string(),
                // The target type u32 is inferred from the `age` field.
                age: fields[1]
                    .parse()
                    .expect("age column must be an unsigned integer"),
                city: fields[2].to_string(),
            }
        })
        .collect();

    println!("\nParsed people: {:#?}", people);
}

Expected output:

Header: name,age,city
Data rows:
  Row 1: name=Alice, age=30, city=New York
  Row 2: name=Bob, age=25, city=Los Angeles
  Row 3: name=Charlie, age=35, city=Chicago

Parsed people: [
    Person {
        name: "Alice",
        age: 30,
        city: "New York",
    },
    Person {
        name: "Bob",
        age: 25,
        city: "Los Angeles",
    },
    Person {
        name: "Charlie",
        age: 35,
        city: "Chicago",
    },
]

Pattern Matching with Contains and Starts/Ends

Basic pattern matching without regex:

/// Classifies a list of candidate email addresses using only substring checks
/// (`contains`, `ends_with`), with no regular expressions.
///
/// The "valid" test is intentionally naive: it only requires an '@' and a '.'
/// somewhere in the string, so it is a teaching aid rather than a real validator.
fn main() {
    // Sample addresses: two plain .com, one Gmail, one .org and one malformed.
    let emails = [
        "user@example.com",
        "jane.doe@gmail.com",
        "admin@example.org",
        "invalid-email",
        "contact@example.com",
    ];

    println!("Email analysis:");
    for email in emails {
        println!("Email: {}", email);

        if email.contains('@') && email.contains('.') {
            println!("  ✓ Looks like a valid email");

            // Report the top-level domain when it is one we recognise.
            if email.ends_with(".com") {
                println!("  ✓ .com domain");
            } else if email.ends_with(".org") {
                println!("  ✓ .org domain");
            }

            if email.contains("gmail") {
                println!("  ✓ Gmail account");
            }
        } else {
            println!("  ✗ Invalid email format");
        }
        // Blank line between entries.
        println!();
    }
}

Expected output:

Email analysis:
Email: user@example.com
  ✓ Looks like a valid email
  ✓ .com domain

Email: jane.doe@gmail.com
  ✓ Looks like a valid email
  ✓ .com domain
  ✓ Gmail account

Email: admin@example.org
  ✓ Looks like a valid email
  ✓ .org domain

Email: invalid-email
  ✗ Invalid email format

Email: contact@example.com
  ✓ Looks like a valid email
  ✓ .com domain

Unicode and Character Processing

Working with Unicode characters:

/// Inspects a mixed-script string: byte and character lengths, the code point of
/// every character, an ASCII-only projection, and a coarse classification of
/// each character.
fn main() {
    let sample = "Hello 🦀 Rust! 你好 Мир";

    println!("Text: {}", sample);
    println!("Length in bytes: {}", sample.len());
    println!("Length in chars: {}", sample.chars().count());

    // List every character with its index and code point.
    println!("\nCharacters:");
    sample.chars().enumerate().for_each(|(index, symbol)| {
        println!("  {}: '{}' (U+{:04X})", index, symbol, symbol as u32);
    });

    // Keep only the ASCII characters, preserving their order.
    let mut ascii_only = String::new();
    for symbol in sample.chars() {
        if symbol.is_ascii() {
            ascii_only.push(symbol);
        }
    }
    println!("\nASCII only: '{}'", ascii_only);

    // Classify each character; the first matching category wins.
    println!("\nCharacter analysis:");
    for symbol in sample.chars() {
        if symbol.is_alphabetic() {
            println!("'{}' is alphabetic", symbol);
            continue;
        }
        if symbol.is_numeric() {
            println!("'{}' is numeric", symbol);
            continue;
        }
        if symbol.is_whitespace() {
            println!("'{}' is whitespace", symbol);
            continue;
        }
        println!("'{}' is other", symbol);
    }
}

Expected output:

Text: Hello 🦀 Rust! 你好 Мир
Length in bytes: 30
Length in chars: 20

Characters:
  0: 'H' (U+0048)
  1: 'e' (U+0065)
  2: 'l' (U+006C)
  3: 'l' (U+006C)
  4: 'o' (U+006F)
  5: ' ' (U+0020)
  6: '🦀' (U+1F980)
  7: ' ' (U+0020)
  8: 'R' (U+0052)
  9: 'u' (U+0075)
  10: 's' (U+0073)
  11: 't' (U+0074)
  12: '!' (U+0021)
  13: ' ' (U+0020)
  14: '你' (U+4F60)
  15: '好' (U+597D)
  16: ' ' (U+0020)
  17: 'М' (U+041C)
  18: 'и' (U+0438)
  19: 'р' (U+0440)

ASCII only: 'Hello  Rust!  '

Character analysis:
'H' is alphabetic
'e' is alphabetic
'l' is alphabetic
'l' is alphabetic
'o' is alphabetic
' ' is whitespace
'🦀' is other
' ' is whitespace
'R' is alphabetic
'u' is alphabetic
's' is alphabetic
't' is alphabetic
'!' is other
' ' is whitespace
'你' is alphabetic
'好' is alphabetic
' ' is whitespace
'М' is alphabetic
'и' is alphabetic
'р' is alphabetic

Performance Tips

Efficient string processing techniques:

/// Demonstrates a few allocation-conscious habits for string processing.
fn main() {
    // Use String::with_capacity for known size: the ten 7-byte items appended
    // below fit in the reserved 100 bytes, so the buffer never has to grow.
    let mut result = String::with_capacity(100);
    for i in 0..10 {
        // Note: `format!` still builds a temporary String on each iteration;
        // `write!` via `std::fmt::Write` can append without that temporary.
        result.push_str(&format!("Item {} ", i));
    }
    println!("Result: {}", result);

    // Use Vec<&str> instead of Vec<String> when possible
    let text = "one two three four five";
    // Borrows slices of `text`: only the Vec itself allocates, not each word.
    let words: Vec<&str> = text.split_whitespace().collect();
    // Allocates a fresh String for every word in addition to the Vec.
    let words_owned: Vec<String> = text
        .split_whitespace()
        .map(String::from)
        .collect();

    println!("Borrowed words: {:?}", words);
    println!("Owned words: {:?}", words_owned);

    // Use chars().nth() instead of indexing for Unicode safety: each crab is
    // several bytes long, so positions must be counted in characters.
    let unicode_text = "🦀🦀🦀";
    if let Some(ch) = unicode_text.chars().nth(1) {
        println!("Second character: {}", ch); // Safe
    }

    // Avoid: unicode_text.chars().collect::<Vec<char>>()[1]
    // It builds a whole Vec just to read one element.
}

Expected output:

Result: Item 0 Item 1 Item 2 Item 3 Item 4 Item 5 Item 6 Item 7 Item 8 Item 9 
Borrowed words: ["one", "two", "three", "four", "five"]
Owned words: ["one", "two", "three", "four", "five"]
Second character: 🦀

Common Pitfalls

1. Byte vs Character Indexing

/// Contrasts the byte length of a string with its character count, and shows
/// how to fetch a character by position.
fn main() {
    let word = "café"; // 'é' is 2 bytes in UTF-8

    println!("Text: {}", word);

    let byte_len = word.len(); // 5 bytes
    let char_len = word.chars().count(); // 4 characters
    println!("Byte length: {}", byte_len);
    println!("Char length: {}", char_len);

    // Don't collect every character into a Vec just to read one of them:
    // let ch = word.chars().collect::<Vec<char>>()[2]; // Inefficient

    // Walk the character iterator to the wanted position instead.
    let third = word.chars().nth(2);
    if let Some(letter) = third {
        println!("Third character: {}", letter);
    }
}

2. String vs &str Conversions

/// Prints `s` to standard output after a `Processing: ` label.
///
/// Taking `&str` lets callers pass a string literal or a borrowed `String`.
fn process_string(s: &str) {
    let shown = s;
    println!("Processing: {}", shown);
}

/// Shows that one `&str` parameter accepts both owned and borrowed strings, and
/// how to turn a borrowed string into an owned one.
fn main() {
    let literal = "world";
    let heap = String::from("hello");

    // Either kind of string can be handed to a function that takes &str.
    process_string(heap.as_str()); // view the String's contents as &str
    process_string(literal); // already a &str

    // Make an owned copy of the borrowed text when ownership is required.
    let copied = literal.to_owned();
    println!("Owned: {}", copied);
}

Checks for Understanding

Question 1: What's the difference between len() and chars().count()?

Answer: len() returns the number of bytes in the string, while chars().count() returns the number of Unicode characters. For ASCII text they're the same, but for Unicode text with multi-byte characters, they differ.

Question 2: How do you safely parse a string to a number?

Answer: Use the parse() method with error handling: let num = "42".parse::<i32>()?; or match "42".parse::<i32>() { Ok(n) => ..., Err(e) => ... }

Question 3: What's the most efficient way to build a string from multiple parts?

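Answer: Pre-allocate with String::with_capacity() and append with push_str() when you know the approximate size; use parts.concat() or parts.join(separator) when the parts are already in a slice or Vec; and use the format! macro for one-off complex formatting.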