Commit 29a2d4d7 authored by Cossin Sebastien's avatar Cossin Sebastien
Browse files

normalize forms and packaging

parent b4672b78
library(SPARQL)
# Read the settings and open the PostgreSQL connection through the
# drugsmapping helpers.
# NOTE(review): what getENVvariables() returns is not visible from this
# script; it is only passed straight to getPostGresConnection().
ENV_VARIABLES <- drugsmapping::getENVvariables()
conn <- drugsmapping::getPostGresConnection(ENV_VARIABLES)

### Forms:
# Distinct (cis, drug, drug form, component dosage form, packaging,
# reference strength) rows. cis is inner-joined with cip and compo on the
# cis code, and every cis listed in cis_homeopathy is excluded.
query <- "SELECT distinct
cis.cis,
drug,
cis.dosageform as drugform,
compo.dosageform,
packaging,
refstrength
FROM cis
INNER JOIN cip
on cis.cis = cip.cis
INNER JOIN compo
on cis.cis = compo.cis
where cis.cis not in (SELECT CIS FROM cis_homeopathy)"

# The connection is not used anywhere else in this script, so release it as
# soon as the query has run; `finally` also closes it when the query fails.
forms <- tryCatch(
  DBI::dbGetQuery(conn, query),
  finally = DBI::dbDisconnect(conn)
)
# Drop any exact duplicate rows that remain in the result.
forms <- unique(forms)
# writeTable(x = forms, filename = "forms.tsv")
###################### Extract the packages ######################
########################################################################################
#### Normalizing
# Pass the raw packaging text through RomediETL::packaging_normalize(), then
# through RomediETL::remove_txt_inside_parentheses().
# NOTE(review): the exact rewriting done by both helpers is defined in
# RomediETL and is not visible here.
forms$packaging_norm <- RomediETL::remove_txt_inside_parentheses(
  RomediETL::packaging_normalize(forms$packaging)
)

#### Extracting
# First match of the RomediETL packaging regex in the normalised text.
regex_packaging <- RomediETL::packaging_get_regex_to_extract_in_txt()
forms$packages <- stringr::str_extract(forms$packaging_norm, regex_packaging)
# Sanity check, printed: TRUE when every non-empty packaging gave a match.
all(forms$packaging == "" | !is.na(forms$packages))

### Normalize quantity and names
# The leading digits of the match are the package count; a match without
# leading digits counts as 1 package.
forms$packages_quantity <- as.numeric(
  stringr::str_extract(forms$packages, "^[0-9]+")
)
no_quantity <- is.na(forms$packages_quantity)
forms$packages_quantity <- replace(forms$packages_quantity, no_quantity, 1)
# The first run of non-digit characters, trimmed, is the package name.
forms$packages_name <- trimws(
  stringr::str_extract(forms$packages, "[^0-9]+")
)
# Frequency of each package name, least frequent first (printed).
sort(table(forms$packages_name))
###################### Extract the forms ######################
########################################################################################
### Remove what was extracted
# Row by row, give the normalised packaging text and the package match to
# RomediETL::remove_txt_begin_by().
# NOTE(review): presumably this strips the leading package text; confirm
# against the helper. USE.NAMES = FALSE stops mapply() from naming every
# element after its (long) packaging string.
forms$packaging_norm2 <- mapply(
  FUN = RomediETL::remove_txt_begin_by,
  char_vector = forms$packaging_norm,
  txt_to_remove = forms$packages,
  USE.NAMES = FALSE
)
forms$dosageform_norm <- RomediETL::remove_accents_and_lower(forms$dosageform)
known_forms <- RomediETL::forms_get_list()
regex_forms <- RomediETL::forms_get_regex_to_extract_in_txt()
# First attempt: match the forms regex in what is left of the packaging text.
forms$galenique <- stringr::str_extract(
  string = forms$packaging_norm2,
  pattern = regex_forms
)

### special scenario for "delivrance à l'unité"
# Rows packaged as "plaquette" for which no form was found yet. `%in%` never
# returns NA, so rows without a package name are simply FALSE.
plaquette_failed <- forms$packages_name %in% "plaquette" &
  is.na(forms$galenique)
# The raw packaging mentions unit dispensing and the normalised dosage form is
# a known form: record one unit of that dosage form.
rule_2 <- plaquette_failed &
  grepl(x = forms$packaging, pattern = "délivrance à l'unité") &
  forms$dosageform_norm %in% known_forms
forms$galenique[rule_2] <- paste0("1 ", forms$dosageform_norm[rule_2])

## special scenario for "par 28" (meaning par 28 comprimés)
# Recomputed because rule_2 may have filled some rows.
plaquette_failed <- forms$packages_name %in% "plaquette" &
  is.na(forms$galenique)
ends_by_number <- "[0-9]+$"
# Test and extract on the SAME text. Testing the raw packaging while
# extracting from packaging_norm2 could give an NA extraction, which paste0()
# would turn into the literal string "NA <form>" and which would also keep
# the dosage fallback below from running on that row.
trailing_number <- stringr::str_extract(
  string = forms$packaging_norm2,
  pattern = ends_by_number
)
rule_3 <- plaquette_failed &
  !is.na(trailing_number) &
  forms$dosageform_norm %in% known_forms
forms$galenique[rule_3] <- paste0(
  trailing_number[rule_3], " ", forms$dosageform_norm[rule_3]
)

#### Extract the quantity
# Fallback for rows still without a form: match the RomediETL quantity regex.
regex_dosage <- RomediETL::get_regex_quantity()
failed_1 <- is.na(forms$galenique)
forms$galenique[failed_1] <- stringr::str_extract(
  string = forms$packaging_norm2[failed_1],
  pattern = regex_dosage
)
# for debugging: rows where nothing was extracted although digits remain
failed_2 <- is.na(forms$galenique)
any_quantity_remain <- failed_2 & grepl("[0-9]", forms$packaging_norm2)
voir <- subset(forms, any_quantity_remain)

##### Normalizing the quantity:
# First run of digits, commas and dots; a decimal comma becomes a dot before
# the conversion to numeric.
quantity_regex <- "[0-9,.]+"
forms$galenique_quantity <- stringr::str_extract(
  string = forms$galenique,
  pattern = quantity_regex
)
forms$galenique_quantity <- gsub(
  ",", ".", forms$galenique_quantity,
  fixed = TRUE
)
forms$galenique_quantity <- as.numeric(forms$galenique_quantity)
# First run of characters that are not digits, commas or dots, trimmed.
forms$galenique_name <- stringr::str_extract(
  string = forms$galenique,
  pattern = "[^0-9,.]+"
)
forms$galenique_name <- trimws(forms$galenique_name)
sort(table(forms$galenique_name))

## debugging purpose:
# Rows whose extracted name is neither the normalised dosage form nor one of
# the units l / ml / g / mg are kept in `voir` for manual inspection.
is_same_as_dose_form <-
  ((forms$galenique_name == forms$dosageform_norm) &
    !is.na(forms$galenique_name)) |
  forms$galenique_name %in% c("l", "ml", "g", "mg")
voir <- subset(forms, !is_same_as_dose_form)
###################### Extract the reference ######################
########################################################################################
# NOTE(review): the rewriting done by normalize_ref_strength() is defined in
# RomediETL and is not visible here.
forms$refstrength_norm <- RomediETL::normalize_ref_strength(forms$refstrength)

# step1: extract dosage (same quantity regex as for the packaging text):
forms$ref_dosage <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = regex_dosage
)
table(forms$ref_dosage)

# step2: packaging or form:
packaging_and_forms <- c(
  RomediETL::forms_get_list(),
  RomediETL::packaging_get_list()
)
packaging_and_forms <- unique(packaging_and_forms)
regex_packaging_and_forms <-
  RomediETL::create_regex_from_word_vector(packaging_and_forms)
undetected <- is.na(forms$ref_dosage)
sum(undetected)

### TODO: why split here?
# Create the column explicitly before the partial assignment instead of
# relying on subassignment into a column that does not exist yet.
forms$ref_pack_forms <- rep(NA_character_, nrow(forms))
forms$ref_pack_forms[undetected] <- stringr::str_extract(
  string = forms$refstrength_norm[undetected],
  pattern = regex_packaging_and_forms
)

# Still undetected: neither a dosage nor a packaging/form was found although a
# reference strength text exists. The !is.na() guard keeps the mask free of
# NA, which the multi-value subscripted assignment further down requires.
undetected <- is.na(forms$ref_dosage) &
  is.na(forms$ref_pack_forms) &
  !is.na(forms$refstrength) &
  forms$refstrength != ""
sum(undetected)
voir <- subset(forms, undetected)
voir$refstrength_norm[1]

##### Normalizing the quantity:
# First run of digits, commas and dots; a decimal comma becomes a dot before
# the conversion to numeric.
quantity_regex <- "[0-9,.]+"
forms$refstrength_quantity <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = quantity_regex
)
forms$refstrength_quantity <- gsub(
  ",", ".", forms$refstrength_quantity,
  fixed = TRUE
)
forms$refstrength_quantity <- as.numeric(forms$refstrength_quantity)
# First run of characters that are not digits, commas or dots, trimmed.
forms$refstrength_name <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = "[^0-9,.]+"
)
forms$refstrength_name <- trimws(forms$refstrength_name)
sort(table(forms$refstrength_name))

## debugging purpose:
# Undetected rows fall back to one unit of the normalised dosage form.
forms$refstrength_name[undetected] <- forms$dosageform_norm[undetected]
forms$refstrength_quantity[undetected] <- 1
sort(table(forms$refstrength_name))
# Number of rows whose reference name equals the extracted form name or the
# package name (comparisons involving NA are ignored in the count).
same_ref <- forms$refstrength_name == forms$galenique_name |
  forms$refstrength_name == forms$packages_name
sum(same_ref, na.rm = TRUE)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment