Strings and Binaries

Need efficient text processing in Elixir? This guide teaches you to work with UTF-8 strings and binaries using pattern matching, the String module, and binary syntax for high-performance text manipulation in production applications.

Prerequisites

  • Basic Elixir syntax
  • Understanding of pattern matching
  • Completed Quick Start Tutorial
  • Familiarity with Unicode concepts (helpful)

Problem

Text processing appears simple but hides complexity: character encoding (UTF-8 vs ASCII), grapheme clusters vs bytes, performance when processing large text, and cross-platform newline handling. Elixir strings are UTF-8 encoded binaries, requiring understanding of both string and binary operations for efficient text manipulation.

Challenges:

  • Distinguishing between characters, graphemes, and bytes
  • Efficient string concatenation and manipulation
  • Binary pattern matching for parsing protocols
  • Unicode normalization and case conversion
  • Handling non-UTF-8 data (legacy encodings, binary protocols)
  • Performance optimization for large text processing

Solution Overview

Elixir strings are UTF-8 binaries with powerful pattern matching and a comprehensive String module. Use the String module for high-level, Unicode-aware operations and binary pattern matching for performance-critical parsing.

Key Concepts:

  • String: UTF-8 encoded binary ("hello")
  • Charlists: List of Unicode codepoints (~c"hello")
  • Binaries: Raw byte sequences (<<1, 2, 3>>)
  • Graphemes: User-perceived characters (may be multiple codepoints)
  • Codepoints: Unicode character codes

Detailed Implementation

1. String Basics

String Creation and Concatenation

# A double-quoted literal is a string (a UTF-8 encoded binary).
greeting = "Hello, World!"

# `<>` concatenates binaries and returns a new binary.
full_name = "John" <> " " <> "Doe"

# `#{...}` interpolates the value of an expression into the string.
name = "Alice"
message = "Hello, #{name}!"  # "Hello, Alice!"

# Heredoc: a multi-line string literal. The value ends with a trailing newline.
text = """
This is a
multi-line
string.
"""

# `~s` sigil in heredoc form: behaves like a double-quoted string
# (interpolation and escape sequences are supported).
sql = ~s"""
SELECT * FROM users
WHERE age > 18
ORDER BY name
"""

String Module Operations

# Length in graphemes vs. size in bytes
String.length("café")  # 4 (not 5!)
byte_size("café")      # 5 (é is 2 bytes in UTF-8)

# Case conversion
String.upcase("hello")     # "HELLO"
String.downcase("WORLD")   # "world"
String.capitalize("alice") # "Alice"

# Trimming whitespace from both ends, the start only, or the end only
String.trim("  hello  ")        # "hello"
String.trim_leading("  hello") # "hello"
String.trim_trailing("hello  ")# "hello"

# Splitting; `parts: n` limits the result to at most n pieces
String.split("a,b,c", ",")           # ["a", "b", "c"]
String.split("hello world", " ")      # ["hello", "world"]
String.split("a|b|c", "|", parts: 2)  # ["a", "b|c"]

# Replacing; every occurrence is replaced unless `global: false` is given
String.replace("hello", "l", "L")          # "heLLo"
String.replace("hello", "l", "L", global: false) # "heLlo" (first only)

# Substring predicates
String.contains?("hello world", "world")  # true
String.starts_with?("hello", "he")        # true
String.ends_with?("hello", "lo")          # true

# Joining lives in Enum, not String
Enum.join(["a", "b", "c"], ",")  # "a,b,c"

2. Binary Pattern Matching

Binary pattern matching enables efficient parsing: matching a trailing `rest::binary` segment yields a sub-binary that references the original data instead of copying it.

Basic Binary Patterns

# Match the literal "GET " prefix and bind the remaining bytes to `path`.
# Raises MatchError when the right-hand side does not start with "GET ".
<<"GET ", path::binary>> = "GET /users"
path  # "/users"

# Dispatches on the HTTP method prefix of the request line.
#
# Returns `{:get | :post | :put, parse_path(rest)}` where `rest` is everything
# after the method and the single space that follows it, or
# `{:error, :unknown_method}` when no known prefix matches.
def parse_request(<<"GET ", rest::binary>>), do: {:get, parse_path(rest)}
def parse_request(<<"POST ", rest::binary>>), do: {:post, parse_path(rest)}
def parse_request(<<"PUT ", rest::binary>>), do: {:put, parse_path(rest)}
def parse_request(_request), do: {:error, :unknown_method}

# `name::8` matches one 8-bit unsigned integer; `rest::binary` takes all remaining bytes.
<<version::8, flags::8, rest::binary>> = <<1, 0, 100, 200>>
version  # 1
flags    # 0
rest     # <<100, 200>>

# Without a trailing `::binary` segment the input must be exactly three bytes long.
<<r::8, g::8, b::8>> = <<255, 128, 0>>  # RGB color
r  # 255 (red)
g  # 128 (green)
b  # 0   (blue)

Parsing Text Protocols

defmodule CSV do
  @moduledoc """
  Two ways of splitting one comma-separated line into trimmed fields.

  Neither function understands quoted fields or escaped commas.
  """

  @doc """
  Trims `line`, splits it on commas with `String.split/2`, and trims each field.
  """
  def parse_line(line) do
    trimmed = String.trim(line)

    for field <- String.split(trimmed, ","), do: String.trim(field)
  end

  @doc """
  Splits `line` on commas by walking the binary one byte at a time and trims
  each field.
  """
  def parse_line_fast(line), do: scan(line, [], [])

  # A comma closes the field collected so far and starts a new, empty one.
  defp scan(<<",", rest::binary>>, bytes, fields) do
    scan(rest, [], [finish(bytes) | fields])
  end

  # Any other byte is prepended to the current field (stored in reverse order).
  defp scan(<<byte, rest::binary>>, bytes, fields) do
    scan(rest, [byte | bytes], fields)
  end

  # End of input: close the last field and restore the original field order.
  defp scan(<<>>, bytes, fields) do
    Enum.reverse([finish(bytes) | fields])
  end

  # Turns the reversed byte list of one field into a trimmed binary.
  defp finish(bytes) do
    bytes
    |> Enum.reverse()
    |> IO.iodata_to_binary()
    |> String.trim()
  end
end

# Parses a URL query string such as "a=1&b=2" into a map of string keys to
# string values.
#
#   * Empty segments (from "", a leading/trailing "&", or "&&") are skipped,
#     so an empty query yields %{} rather than %{"" => ""}.
#   * A key without "=" maps to "".
#   * Only the first "=" separates key from value, so values may contain "=".
#   * When a key repeats, the last occurrence wins.
#   * Keys and values are NOT percent-decoded.
def parse_query_string(query) do
  query
  |> String.split("&", trim: true)
  |> Map.new(fn pair ->
    case String.split(pair, "=", parts: 2) do
      [key, value] -> {key, value}
      [key] -> {key, ""}
    end
  end)
end

parse_query_string("name=Alice&age=30")
# => %{"age" => "30", "name" => "Alice"}

3. Unicode and Graphemes

Understanding Unicode is crucial for correct string handling.

Graphemes vs Codepoints vs Bytes

text = "café"

# Graphemes: user-perceived characters. String.length/1 counts these.
String.length(text)           # 4
String.graphemes(text)        # ["c", "a", "f", "é"]

# Codepoints: here each grapheme is a single codepoint, so the lists coincide.
String.codepoints(text)       # ["c", "a", "f", "é"]

# Bytes: the raw UTF-8 encoding.
byte_size(text)               # 5 (é is 2 bytes)
:binary.bin_to_list(text)     # [99, 97, 102, 195, 169]

Unicode Normalization

# The same user-perceived character "é" in two different encodings.
# Escapes are used so the forms stay distinct regardless of editor normalization.
nfc = "\u00E9"   # Single precomposed codepoint (NFC form)
nfd = "e\u0301"  # "e" + combining acute accent (NFD form)

# String.length/1 counts graphemes, so both forms have length 1...
String.length(nfc)  # 1
String.length(nfd)  # 1

# ...but the number of codepoints (and bytes) differs.
length(String.codepoints(nfc))  # 1
length(String.codepoints(nfd))  # 2

# The binaries are not equal until both are normalized to the same form.
nfc == nfd  # false
String.normalize(nfc, :nfc) == String.normalize(nfd, :nfc)  # true

Working with Emojis

text = "Hello 👋🏽"  # Wave with skin tone (multi-codepoint grapheme)

# The emoji is two codepoints but a single grapheme.
String.length(text)      # 7 (6 chars + 1 emoji grapheme)
String.codepoints(text)  # [..., "👋", "🏽"] (wave + modifier)
String.graphemes(text)   # [..., "👋🏽"] (perceived as single unit)

# split_at/2 positions are counted in graphemes, so the emoji is kept intact.
String.split_at(text, 6)  # {"Hello ", "👋🏽"}

4. Advanced String Operations

String Slicing

text = "Hello, World!"

# slice/3 takes a start index and a length, both counted in graphemes.
String.slice(text, 0, 5)    # "Hello"
String.slice(text, 7, 5)    # "World"

# slice/2 takes a range. A negative end counts from the end of the string.
# The explicit `//1` step is required: a bare `7..-1` has an implicit step of
# -1, and negative steps are deprecated in String.slice/2.
String.slice(text, 0..4)       # "Hello"
String.slice(text, 7..-1//1)   # "World!"

# A negative start index counts from the end of the string.
String.slice(text, -6, 6)   # "World!"

# at/2 returns a single grapheme; negative indexes count from the end.
String.at(text, 0)   # "H"
String.at(text, -1)  # "!"

Pattern-Based Operations

# Replace every match of the pattern.
text = "The year is 2024"
Regex.replace(~r/\d+/, text, "XXXX")  # "The year is XXXX"

# Collect all matches; each match is returned as its own list.
Regex.scan(~r/\d+/, "Port: 8080, Timeout: 30")
# [["8080"], ["30"]]

# Named groups are returned as a map keyed by group name.
regex = ~r/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/
Regex.named_captures(regex, "Date: 2024-12-21")
# %{"day" => "21", "month" => "12", "year" => "2024"}

# Boolean test: does the pattern match anywhere in the string?
Regex.match?(~r/@/, "alice@example.com")  # true

String Validation

defmodule Validator do
  @moduledoc """
  Simple shape checks for strings.

  Patterns are anchored with `\\A` and `\\z` rather than `^` and `$`, because
  `$` also matches just before a trailing newline and would let values such as
  `"abc\\n"` through.
  """

  @doc """
  Returns `true` when `string` looks like `local@domain.tld`: no whitespace,
  exactly one `@`, and at least one `.` in the domain part.

  This is a shape check only, not full address validation.
  """
  def email?(string) do
    String.match?(string, ~r/\A[^\s@]+@[^\s@]+\.[^\s@]+\z/)
  end

  @doc """
  Returns `true` when `string` starts with `http://` or `https://`, has at
  least one more character, and contains no newline.
  """
  def url?(string) do
    String.match?(string, ~r{\Ahttps?://.+\z})
  end

  @doc """
  Returns `true` when `string` is non-empty and consists only of ASCII letters
  and digits.
  """
  def alphanumeric?(string) do
    String.match?(string, ~r/\A[a-zA-Z0-9]+\z/)
  end

  @doc """
  Returns `true` when `string` is empty or contains only whitespace.
  """
  def empty?(string) do
    String.trim(string) == ""
  end
end

5. Performance Optimization

IO Lists for Concatenation

# Variant 1: chain the fragments with `<>`.
html =
  "<html>" <>
  "<head>" <>
  "<title>Page</title>" <>
  "</head>" <>
  "<body>Content</body>" <>
  "</html>"

# Variant 2: collect the fragments in a list (iodata) and flatten it once
# with IO.iodata_to_binary/1. The resulting binary is the same as variant 1.
html =
  ["<html>",
   "<head>",
   "<title>Page</title>",
   "</head>",
   "<body>Content</body>",
   "</html>"]
  |> IO.iodata_to_binary()

title = "Page"
content = "Content"

# Variant 3: heredoc with interpolation. Unlike the two variants above, the
# result contains the line breaks written in the template.
html = """
<html>
<head><title>#{title}</title></head>
<body>#{content}</body>
</html>
"""

Efficient String Building

# Builds the result by appending to the accumulator with `<>` on every step.
# Note that the result also ends with a trailing ", " after the last item.
def join_bad(items) do
  Enum.reduce(items, "", fn item, acc ->
    acc <> item <> ", "
  end)
end

# Places ", " between the items and flattens the list into one binary in a
# single pass. `items` must be iodata (e.g. a list of binaries).
def join_good(items) do
  items
  |> Enum.intersperse(", ")
  |> IO.iodata_to_binary()
end

# The standard-library equivalent (assumes `items` is already bound).
Enum.join(items, ", ")

How It Works

UTF-8 Encoding

Elixir strings use UTF-8 encoding where characters take 1-4 bytes:

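  • ASCII (codepoints 0–127): 1 byte
  • Codepoints 128–2047 (accented Latin, Greek, Cyrillic, Hebrew, Arabic, …): 2 bytes
  • Codepoints 2048–65535 (rest of the Basic Multilingual Plane, including most CJK): 3 bytes
  • Codepoints 65536 and above (supplementary planes, including most emoji): 4 bytes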

Example: “café”

  • c: 99 (1 byte)
  • a: 97 (1 byte)
  • f: 102 (1 byte)
  • é: 195, 169 (2 bytes)

Total: 5 bytes for 4 characters

Binary Pattern Matching Efficiency

Pattern matching on binaries is highly optimized:

# Checks that `request` starts with the bytes "GET " and binds everything
# after that prefix to `path`; raises MatchError otherwise.
<<"GET ", path::binary>> = request

String Immutability

Strings are immutable - operations return new strings:

original = "hello"
# String.upcase/1 returns a new string; `original` is left untouched.
upper = String.upcase(original)

original  # Still "hello"
upper     # "HELLO"

Variations

1. Custom String Sigils

defmodule MySigils do
  @moduledoc """
  Custom string sigils: `~u` upcases its content and `~t` trims it.
  """

  @doc "Implements `~u`: returns the sigil content upcased. Modifiers are ignored."
  def sigil_u(string, _opts), do: String.upcase(string)

  @doc "Implements `~t`: returns the sigil content with surrounding whitespace removed. Modifiers are ignored."
  def sigil_t(string, _opts), do: String.trim(string)
end

# Importing the module brings sigil_u/2 and sigil_t/2 into scope, which is
# what makes the `~u` and `~t` syntax available.
import MySigils

~u/hello/  # "HELLO"
~t/  hi  / # "hi"

2. String Protocols

defprotocol Stringify do
  @doc "Converts `data` to a string representation chosen by its implementation."
  def to_string(data)
end

defimpl Stringify, for: List do
  # Lists are rendered as their elements separated by commas.
  def to_string(list) do
    Enum.join(list, ",")
  end
end

defimpl Stringify, for: Map do
  # Maps are rendered in their inspected (literal) form.
  def to_string(map) do
    inspect(map)
  end
end

Stringify.to_string([1, 2, 3])        # "1,2,3"
Stringify.to_string(%{a: 1})          # "%{a: 1}"

3. Custom Parsers

defmodule JSONParser do
  @moduledoc """
  Skeleton of a JSON parser that dispatches on the first byte of the input
  using binary prefix matching.

  Only the empty object and the empty array are handled; parsing of members
  and elements is intentionally left out. Production code should use the
  Jason library.
  """

  @doc """
  Parses `json`, which must start with `{` or `[`.

  Returns `%{}` for an empty object, `[]` for an empty array, and `nil` for
  content this skeleton does not parse. Raises `FunctionClauseError` when the
  input starts with any other character.
  """
  def parse("{" <> rest), do: parse_object(rest, %{})
  def parse("[" <> rest), do: parse_array(rest, [])

  # Closing brace: the object is complete.
  defp parse_object("}" <> _rest, acc), do: acc

  defp parse_object(_rest, _acc) do
    # Simplified JSON object parser: member parsing is not implemented.
    # Production code should use Jason library
    nil
  end

  # Closing bracket: the array is complete. Elements would be accumulated in
  # reverse, so the order is restored here.
  defp parse_array("]" <> _rest, acc), do: Enum.reverse(acc)

  defp parse_array(_rest, _acc) do
    # Simplified JSON array parser: element parsing is not implemented.
    nil
  end
end

Pitfalls and Best Practices

Common Mistakes

1. Confusing Strings and Charlists

Bad:

list = 'hello'  # Single quotes create a charlist (a list of integers), not a string!
String.upcase(list)  # ERROR: FunctionClauseError, String functions only accept binaries

Good:

string = "hello"  # String
String.upcase(string)  # "HELLO"

# When a charlist is really needed (e.g. for Erlang interop), write it with
# the ~c sigil and convert it explicitly before calling String functions.
list = ~c"hello"
List.to_string(list)  # "hello"

2. Inefficient Concatenation

Bad:

# Appends to the accumulator with `<>` on each of the 1000 iterations.
# The resulting string also ends with a trailing comma.
result = Enum.reduce(1..1000, "", fn i, acc ->
  acc <> Integer.to_string(i) <> ","
end)

Good:

# Convert and join in a single pass; no intermediate list of strings is built
# and no trailing separator is produced.
result = Enum.map_join(1..1000, ",", &Integer.to_string/1)

3. Ignoring Unicode

Bad:

# Takes the first BYTE of `name`, not the first character. For any name that
# starts with a multi-byte character the result is an invalid UTF-8 fragment.
# (String.slice/3 would not show this problem: it is grapheme-aware.)
def initials(name) do
  binary_part(name, 0, 1)  # Fails for "🔥Fire" → <<240>> (invalid UTF-8, renders as "�")
end

Good:

# Returns the first grapheme (user-perceived character) of `name`, or `nil`
# when `name` is empty. String.first/1 stops after the first grapheme instead
# of splitting the whole string into a list.
def initials(name) do
  String.first(name)
end

4. Incorrect Byte Operations

Bad:

# "café" is 5 bytes: c, a, f, and a 2-byte é. Taking 4 bytes cuts é in half.
<<head::binary-size(4), _::binary>> = "café"
head  # <<99, 97, 102, 195>> = "caf" + first byte of é (invalid UTF-8!)

Good:

# String.slice/3 counts graphemes, so it never cuts through a multi-byte character.
String.slice("café", 0, 3)  # "caf"

Related Resources

Last updated