# Snippet 1: round-trip a string through UTF-8 bytes and back.
text = "Hello, World!"
encoded_text = bytes(text, "utf-8")
print("Encoded text: {}".format(encoded_text))

# Reverse the transformation: UTF-8 bytes back into a str.
decoded_text = str(encoded_text, "utf-8")
print("Decoded text: {}".format(decoded_text))
# Snippet 2: Writing Unicode Data to a File
# Persist a mixed ASCII/CJK string to disk, encoded as UTF-8.
text = "Hello, 世界" # "Hello, World" in Chinese
with open("unicode_file.txt", mode="w", encoding="utf-8") as outfile:
    outfile.write(text)
print("Text written to file with UTF-8 encoding.")
# Snippet 3: Reading Unicode Data from a File
# Snippet 4: Handling Encoding Errors with the errors Parameter
# Snippet 5: Detecting File Encoding with chardet
# Snippet 6: Converting Between Encodings
# Snippet 7: Unicode Normalization
# Snippet 8: Handling Unicode with Regular Expressions
# Snippet 9: Writing a File with a Non-UTF-8 Encoding
# Snippet 10: Reading a File with Different Encodings
# Read back the UTF-8 file written by the earlier snippet, decoding
# with the same codec it was written with.
with open("unicode_file.txt", mode="r", encoding="utf-8") as infile:
    text = infile.read()
print("Read text from file: {}".format(text))
# Attempting to decode a byte sequence with a different encoding.
# NOTE: the original demo decoded these bytes as UTF-8, but they ARE
# valid UTF-8 (the Euro sign), so the except branch was dead code and
# errors="ignore" was a no-op. Decoding as ASCII actually raises.
byte_sequence = b'\xe2\x82\xac' # Euro sign in UTF-8
try:
    # Raises UnicodeDecodeError: 0xE2 is outside the 7-bit ASCII range.
    text = byte_sequence.decode("ascii")
    print(text)
except UnicodeDecodeError:
    print("Unicode decoding error occurred!")
# Using 'ignore' to skip invalid bytes — every byte here is invalid
# ASCII, so the decoded result is the empty string.
text = byte_sequence.decode("ascii", errors="ignore")
print(f"Decoded text with errors ignored: {text}")
# Sniff the file's encoding from its raw bytes using the third-party
# chardet detector.
import chardet

with open("unicode_file.txt", "rb") as binary_file:
    payload = binary_file.read()
result = chardet.detect(payload)
print(f"File encoding: {result['encoding']}")
# Re-encode text from UTF-8 bytes into Latin-1 (ISO-8859-1) bytes by
# round-tripping through str.
text = "Hello, World!"
# Convert string to bytes using UTF-8
utf_bytes = text.encode("utf-8")
# Decode back to str, then encode with the Latin-1 codec
intermediate = utf_bytes.decode("utf-8")
latin_bytes = intermediate.encode("latin-1")
print("Text in Latin-1 encoding: {}".format(latin_bytes))
import unicodedata

# Unicode string in decomposed form: base letter plus combining mark.
text = "e\u0301" # 'e' + acute accent
# Compute both canonical normalization forms up front.
forms = {name: unicodedata.normalize(name, text) for name in ("NFC", "NFD")}
# NFC composes the pair into the single code point U+00E9.
nfc_text = forms["NFC"]
print(f"NFC normalization: {nfc_text}")
# NFD keeps (or restores) the decomposed base + combining-mark form.
nfd_text = forms["NFD"]
print(f"NFD normalization: {nfd_text}")
import re

text = "This is a test: Hello, 世界"
# Pre-compile a pattern matching runs of characters outside 7-bit ASCII.
non_ascii = re.compile(r"[^\x00-\x7F]+")
matches = non_ascii.findall(text)
print(f"Unicode matches found: {matches}")
# Write ASCII-only text using the single-byte ISO-8859-1 codec.
text = "Hello, World!"
# Writing text with ISO-8859-1 encoding
with open("iso_file.txt", mode="w", encoding="iso-8859-1") as target:
    target.write(text)
print("Text written to file with ISO-8859-1 encoding.")
# Read the Latin-1 file back using the matching codec.
with open("iso_file.txt", mode="r", encoding="iso-8859-1") as source:
    text = source.read()
print("Read text from ISO file: {}".format(text))