I need to remove accents from characters in Spanish and other languages across different datasets.
I already wrote a function based on the code provided in this post.
Another way to do it is to use Python's Unicode database (the `unicodedata` module):
import unicodedata
import sys
from pyspark.sql.functions import translate, regexp_replace
def make_trans():
    """Build parallel strings mapping "X WITH ..." characters to their base "X".

    Walks the code points from U+0020 up to (but not including)
    ``sys.maxunicode``. For every character whose Unicode name contains
    ``"WITH"``, the text before the first ``" WITH"`` is looked up as a
    character name; e.g. ``LATIN SMALL LETTER O WITH ACUTE`` is paired with
    ``LATIN SMALL LETTER O``.

    Returns:
        A ``(matching_string, replace_string)`` tuple of equal-length strings:
        the character at index ``i`` of the first is meant to be replaced by
        the character at index ``i`` of the second.
    """
    matching_chars = []
    replace_chars = []
    for code_point in range(ord(" "), sys.maxunicode):
        char = chr(code_point)
        # Unnamed / unassigned code points yield "" and are skipped below.
        name = unicodedata.name(char, "")
        if "WITH" not in name:
            continue
        try:
            base = unicodedata.lookup(name.split(" WITH")[0])
        except KeyError:
            # The text before " WITH" is not itself a known character name.
            continue
        if len(base) != 1:
            # lookup() may also resolve named sequences (several code points).
            # The two strings are matched position by position, so a
            # multi-character replacement would misalign every later pair.
            continue
        matching_chars.append(char)
        replace_chars.append(base)
    # Join once at the end instead of growing two strings with "+=".
    return "".join(matching_chars), "".join(replace_chars)
def clean_text(c):
    """Return a column expression for column *c* with accents removed.

    Two steps are applied: characters matching the regex ``\\p{M}`` are
    deleted with ``regexp_replace``, then the characters collected by
    ``make_trans()`` are swapped for their base characters with ``translate``.

    Args:
        c: Name of the column to clean; it is also used as the alias of the
            resulting expression, so the output keeps the same column name.

    Returns:
        The ``translate(regexp_replace(...))`` expression aliased to *c*.
    """
    # NOTE: make_trans() rescans the Unicode range on every call, so calling
    # clean_text for several columns rebuilds the same table each time.
    matching_string, replace_string = make_trans()
    # Raw string: "\p" is not a valid Python escape sequence, and newer
    # interpreters warn about it in a normal literal. The pattern passed on
    # is unchanged (backslash, "p{M}").
    without_marks = regexp_replace(c, r"\p{M}", "")
    return translate(without_marks, matching_string, replace_string).alias(c)
So now let's test it:
# Sample data: accented names, a plain ASCII name, a value padded with
# spaces, and a value starting with "~" (which the expected output below
# shows is left as-is).
df = sc.parallelize(
    [
        (1, "Maracaibó"),
        (2, "New York"),
        (3, " São Paulo "),
        (4, "~Madrid"),
        (5, "São Paulo"),
        (6, "Maracaibó"),
    ]
).toDF(["id", "text"])

# Show the cleaned "text" column.
df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
Credit to @zero323.