# Snippet 1: round-trip a string through UTF-8 bytes and back.
text = "Hello, World!"
encoded_text = bytes(text, "utf-8")
print("Encoded text: {}".format(encoded_text))

# Reverse the transformation: UTF-8 bytes back into a str.
decoded_text = str(encoded_text, "utf-8")
print("Decoded text: {}".format(decoded_text))
# Snippet 2: Writing Unicode Data to a File
# Persist a mixed ASCII/CJK string to disk, encoded as UTF-8.
text = "Hello, 世界" # "Hello, World" in Chinese
with open("unicode_file.txt", mode="w", encoding="utf-8") as outfile:
    outfile.write(text)
print("Text written to file with UTF-8 encoding.")
# Snippet 3: Reading Unicode Data from a File
# Snippet 4: Handling Encoding Errors with the errors Parameter
# Snippet 5: Detecting File Encoding with chardet
# Snippet 6: Converting Between Encodings
# Snippet 7: Unicode Normalization
# Snippet 8: Handling Unicode with Regular Expressions
# Snippet 9: Writing a File with a Non-UTF-8 Encoding
# Snippet 10: Reading a File with Different Encodings
# Read back the UTF-8 file written by the earlier snippet, decoding
# with the same codec it was written with.
with open("unicode_file.txt", mode="r", encoding="utf-8") as infile:
    text = infile.read()
print("Read text from file: {}".format(text))
# Attempting to decode a byte sequence with a different encoding.
# NOTE: the original demo decoded these bytes as UTF-8, but they ARE
# valid UTF-8 (the Euro sign), so the except branch was dead code and
# errors="ignore" was a no-op. Decoding as ASCII actually raises.
byte_sequence = b'\xe2\x82\xac' # Euro sign in UTF-8
try:
    # Raises UnicodeDecodeError: 0xE2 is outside the 7-bit ASCII range.
    text = byte_sequence.decode("ascii")
    print(text)
except UnicodeDecodeError:
    print("Unicode decoding error occurred!")
# Using 'ignore' to skip invalid bytes — every byte here is invalid
# ASCII, so the decoded result is the empty string.
text = byte_sequence.decode("ascii", errors="ignore")
print(f"Decoded text with errors ignored: {text}")
# Sniff the file's encoding from its raw bytes using the third-party
# chardet detector.
import chardet

with open("unicode_file.txt", "rb") as binary_file:
    payload = binary_file.read()
result = chardet.detect(payload)
print(f"File encoding: {result['encoding']}")
# Re-encode text from UTF-8 bytes into Latin-1 (ISO-8859-1) bytes by
# round-tripping through str.
text = "Hello, World!"
# Convert string to bytes using UTF-8
utf_bytes = text.encode("utf-8")
# Decode back to str, then encode with the Latin-1 codec
intermediate = utf_bytes.decode("utf-8")
latin_bytes = intermediate.encode("latin-1")
print("Text in Latin-1 encoding: {}".format(latin_bytes))
import unicodedata

# Unicode string in decomposed form: base letter plus combining mark.
text = "e\u0301" # 'e' + acute accent
# Compute both canonical normalization forms up front.
forms = {name: unicodedata.normalize(name, text) for name in ("NFC", "NFD")}
# NFC composes the pair into the single code point U+00E9.
nfc_text = forms["NFC"]
print(f"NFC normalization: {nfc_text}")
# NFD keeps (or restores) the decomposed base + combining-mark form.
nfd_text = forms["NFD"]
print(f"NFD normalization: {nfd_text}")
import re

text = "This is a test: Hello, 世界"
# Pre-compile a pattern matching runs of characters outside 7-bit ASCII.
non_ascii = re.compile(r"[^\x00-\x7F]+")
matches = non_ascii.findall(text)
print(f"Unicode matches found: {matches}")
# Write ASCII-only text using the single-byte ISO-8859-1 codec.
text = "Hello, World!"
# Writing text with ISO-8859-1 encoding
with open("iso_file.txt", mode="w", encoding="iso-8859-1") as target:
    target.write(text)
print("Text written to file with ISO-8859-1 encoding.")
# Read the Latin-1 file back using the matching codec.
with open("iso_file.txt", mode="r", encoding="iso-8859-1") as source:
    text = source.read()
print("Read text from ISO file: {}".format(text))