Commit 11b547fd authored by Cossin Sebastien
Browse files

WIP building CDC

parent 29a2d4d7
...@@ -56,6 +56,7 @@ forms$galenique <- stringr::str_extract(string = forms$packaging_norm2, pattern ...@@ -56,6 +56,7 @@ forms$galenique <- stringr::str_extract(string = forms$packaging_norm2, pattern
### special scenario for "delivrance à l'unité" ### special scenario for "delivrance à l'unité"
plaquette_failed <- forms$packages_name == "plaquette" & is.na(forms$galenique) & !is.na(forms$packages_name) plaquette_failed <- forms$packages_name == "plaquette" & is.na(forms$galenique) & !is.na(forms$packages_name)
rule_2 <- plaquette_failed & grepl(x = forms$packaging, pattern = "délivrance à l'unité") & forms$dosageform_norm %in% known_forms rule_2 <- plaquette_failed & grepl(x = forms$packaging, pattern = "délivrance à l'unité") & forms$dosageform_norm %in% known_forms
sum(rule_2)
forms$galenique[rule_2] <- paste0("1 ",forms$dosageform_norm[rule_2]) forms$galenique[rule_2] <- paste0("1 ",forms$dosageform_norm[rule_2])
## special scenario for "par 28" (meaning par 28 comprimés) ## special scenario for "par 28" (meaning par 28 comprimés)
...@@ -82,7 +83,8 @@ forms$galenique_quantity <- gsub(",",".",forms$galenique_quantity, fixed = T) ...@@ -82,7 +83,8 @@ forms$galenique_quantity <- gsub(",",".",forms$galenique_quantity, fixed = T)
forms$galenique_quantity <- as.numeric(forms$galenique_quantity) forms$galenique_quantity <- as.numeric(forms$galenique_quantity)
forms$galenique_name <- stringr::str_extract(string= forms$galenique, pattern = "[^0-9,.]+") forms$galenique_name <- stringr::str_extract(string= forms$galenique, pattern = "[^0-9,.]+")
forms$galenique_name <- trimws(forms$galenique_name) forms$galenique_name <- trimws(forms$galenique_name)
sort(table(forms$galenique_name)) forms$galenique_name <- gsub("[[:punct:]]","",forms$galenique_name)
sort(table(forms$galenique_name)) # either a form or a quantity
## debugging purpose: ## debugging purpose:
is_same_as_dose_form <- ((forms$galenique_name == forms$dosageform_norm) & !is.na(forms$galenique_name)) | is_same_as_dose_form <- ((forms$galenique_name == forms$dosageform_norm) & !is.na(forms$galenique_name)) |
forms$galenique_name %in% c("l","ml","g","mg") forms$galenique_name %in% c("l","ml","g","mg")
...@@ -93,35 +95,140 @@ voir <- subset(forms, !is_same_as_dose_form) ...@@ -93,35 +95,140 @@ voir <- subset(forms, !is_same_as_dose_form)
forms$refstrength_norm <- RomediETL::normalize_ref_strength(forms$refstrength) forms$refstrength_norm <- RomediETL::normalize_ref_strength(forms$refstrength)
# step1: extract dosage: # step1: extract dosage:
forms$ref_dosage <- stringr::str_extract(string= forms$refstrength_norm, pattern = regex_dosage) forms$ref_dosage <- stringr::str_extract(string= forms$refstrength_norm, pattern = regex_dosage)
table(forms$ref_dosage) undetected <- is.na(forms$ref_dosage)
sum(undetected)
# step2/ packaging or form: # step2/ packaging or form:
packaging_and_forms <- c(RomediETL::forms_get_list(), RomediETL::packaging_get_list()) packaging_and_forms <- c(RomediETL::forms_get_list(), RomediETL::packaging_get_list())
packaging_and_forms <- unique(packaging_and_forms) packaging_and_forms <- unique(packaging_and_forms)
regex_packaging_and_forms <- RomediETL::create_regex_from_word_vector(packaging_and_forms) regex_packaging_and_forms <- RomediETL::create_regex_from_word_vector(packaging_and_forms)
undetected <- is.na(forms$ref_dosage) forms$ref_pack_forms <- stringr::str_extract(string = forms$refstrength_norm,
sum(undetected)
### TODO: pourquoi separer ici ?
forms$ref_pack_forms[undetected] <- stringr::str_extract(string = forms$refstrength_norm[undetected],
pattern = regex_packaging_and_forms) pattern = regex_packaging_and_forms)
undetected <- is.na(forms$ref_dosage) & is.na(forms$ref_pack_forms) & forms$refstrength != "" undetected <- is.na(forms$ref_dosage) & is.na(forms$ref_pack_forms) & forms$refstrength != ""
sum(undetected) sum(undetected)
voir <- subset(forms, undetected) voir <- subset(forms, undetected)
voir$refstrength_norm[1] forms$ref_pack_forms[undetected] <- forms$dosageform_norm[undetected]
##### Normalizing the quantity: ##### Normalizing the quantity:
quantity_regex <- "[0-9,.]+" quantity_regex <- "[0-9,.]+"
forms$refstrength_quantity <- stringr::str_extract(string= forms$refstrength_norm, pattern = quantity_regex) forms$ref_dosage_quantity <- stringr::str_extract(string= forms$ref_dosage, pattern = quantity_regex)
forms$refstrength_quantity <- gsub(",",".",forms$refstrength_quantity, fixed = T) forms$ref_dosage_quantity <- gsub(",",".",forms$ref_dosage_quantity, fixed = T)
forms$refstrength_quantity <- as.numeric(forms$refstrength_quantity) forms$ref_dosage_quantity <- as.numeric(forms$ref_dosage_quantity)
forms$refstrength_name <- stringr::str_extract(string= forms$refstrength_norm, pattern = "[^0-9,.]+") forms$ref_dosage_name <- stringr::str_extract(string = forms$ref_dosage, pattern = "[^0-9,.]+")
forms$refstrength_name <- trimws(forms$refstrength_name) forms$ref_dosage_name <- trimws(forms$ref_dosage_name)
sort(table(forms$refstrength_name)) forms$ref_dosage_name <- gsub("[[:punct:]]","",forms$ref_dosage_name)
## debugging purpose: ## debugging purpose:
forms$refstrength_name[undetected] <- forms$dosageform_norm[undetected] bool <- !is.na(forms$ref_dosage_name) & !is.na(forms$ref_pack_forms)
forms$refstrength_quantity[undetected] <- 1 voir <- subset(forms, bool)
bool <- !is.na(forms$ref_pack_forms) & forms$ref_pack_forms %in% forms$packages_name |
forms$ref_pack_forms %in% forms$galenique_name
sum(bool)
sum(!bool)
voir <- subset(forms, !bool)
### export
## Normalizing the reference dosage.
## Debugging aid: count of every forms$ref_dosage_name value, in increasing
## order of frequency.
sort(table(forms[["ref_dosage_name"]]), decreasing = FALSE)
## Unit conversion table, one row per recognised unit label:
##   label          - unit label as it appears in the data
##   label_standard - unit the label is converted to
##   conversion     - factor a quantity is multiplied by to go from `label`
##                    to `label_standard`
units <- data.frame(
  label = c("mole", "microlitres", "gramme", "g", "litre", "mg", "ml", "ug"),
  label_standard = c("mole", "ml", "mg", "mg", "ml", "mg", "ml", "mg"),
  conversion = c(1, 0.001, 1000, 1000, 1000, 1, 1, 0.001),
  stringsAsFactors = FALSE
)
## Debugging aid: TRUE when every label of the `units` table occurs among the
## dosage names present in `forms`.
all(is.element(units$label, forms$ref_dosage_name))
## Rows whose reference dosage name is NA (not detected) borrow the galenique
## name and quantity, provided the galenique name is a known unit label.
bool <- forms$galenique_name %in% units$label & is.na(forms$ref_dosage_name)
sum(bool)
forms$ref_dosage_name <- replace(forms$ref_dosage_name, bool,
                                 forms$galenique_name[bool])
forms$ref_dosage_quantity <- replace(forms$ref_dosage_quantity, bool,
                                     forms$galenique_quantity[bool])
## Left join on the unit label: every row of `forms` is kept; rows without a
## matching label get NA in the columns coming from `units`.
forms2 <- merge(
  x = forms, y = units,
  by.x = "ref_dosage_name", by.y = "label",
  all.x = TRUE
)
## Quantity expressed in the standard unit.
forms2$ref_dosage_quantity_norm <-
  forms2[["ref_dosage_quantity"]] * forms2[["conversion"]]
## The standard unit label becomes the normalized dosage name.
col_label_dosage_form_norm <- which(names(forms2) == "label_standard")
names(forms2)[col_label_dosage_form_norm] <- "ref_dosage_name_norm"
## Discard the rows whose refstrength is the empty string (rows where the
## comparison yields NA are discarded as well).
is_empty_ref_strength <- forms2[["refstrength"]] == ""
sum(is_empty_ref_strength)
forms3 <- forms2[which(!is_empty_ref_strength), , drop = FALSE]
## Number of rows of forms2 whose cis no longer appears in forms3
## (original author's note: only 14 have no refstrength at all).
sum(!(forms2$cis %in% forms3$cis))
## Keep the export columns and drop duplicated rows.
packaging_normalized <- unique(
  forms3[, c("cis", "drug", "drugform", "dosageform",
             "packages_quantity",
             "packages_name", "galenique_quantity", "galenique_name",
             "ref_dosage_quantity_norm", "ref_dosage_name_norm"),
         drop = FALSE]
)
save(list = "packaging_normalized", file = "packaging_normalized.rdata")
# rm(list=ls())
# load("packaging_normalized.rdata")
## Distinct normalized reference per cis / dosageform.
ref_normalized <- unique(
  packaging_normalized[, c("cis", "dosageform",
                           "ref_dosage_quantity_norm",
                           "ref_dosage_name_norm"),
                       drop = FALSE]
)
#### Retrieve the quantities.
## Load the two composition subsets through RomediETL and combine them.
sa_with_ft_subset <- RomediETL::get_sa_with_ft_subset(conn)
sa_without_ft_subset <- RomediETL::get_sa_without_ft_subset(conn)
compo_new <- RomediETL::rbind_with_without_ft(sa_with_ft_subset,
                                              sa_without_ft_subset)
colnames(compo_new)
compo_new$id <- NULL
library(dplyr)
colnames(ref_normalized)
## Number the rows inside each (cis, dosageform) group. `multiple_norm` is a
## debugging view of the extra rows of groups holding more than one reference.
ref_normalized2 <- ref_normalized %>%
  group_by(cis, dosageform) %>%
  mutate(n = row_number()) %>%
  ungroup()
multiple_norm <- subset(ref_normalized2, n != 1)
## Keep exactly one row per (cis, dosageform).
## Inside a group, rows with an NA normalized quantity are dropped whenever the
## group also holds a non-NA quantity, so the surviving row carries a usable
## value when one exists. Groups made only of NA rows are kept (first row).
## The previous version computed this NA filter and then restarted from the
## unfiltered table, so the filter never took effect.
ref_normalized3 <- ref_normalized2 %>%
  group_by(cis, dosageform) %>%
  filter(!is.na(ref_dosage_quantity_norm) |
           all(is.na(ref_dosage_quantity_norm))) %>%
  filter(row_number() == 1L) %>% ## TODO: find a better way
  ungroup()
ref_normalized3$n <- NULL
compo_new_2 <- merge(compo_new, ref_normalized3, by = c("cis", "dosageform"))
## Row duplication is expected here.
## The strength needs to be converted to the standard units:
sort(table(compo_new_2$saucumunit), decreasing = TRUE)
colnames(units)
compo_new_3 <- merge(compo_new_2, units,
                     by.x = "ftucumunit", by.y = "label", all.x = TRUE)
## merge() puts the non-key columns of `units` (label_standard, conversion)
## after all columns of compo_new_2, so they are always the last two columns;
## rename them by that position rather than by a hard-coded index.
unit_cols <- ncol(compo_new_3) - c(1L, 0L)
colnames(compo_new_3)[unit_cols] <- c("ftucumunit_standard", "ft_conversion")
compo_new_3$ftamount_standard <- compo_new_3$ftamount * compo_new_3$ft_conversion
## A row can be quantified when both the normalized reference quantity and the
## standardized amount are known.
is_quantify <- !is.na(compo_new_3$ref_dosage_quantity_norm) &
  !is.na(compo_new_3$ftamount_standard)
sum(is_quantify)
## Create the result columns explicitly (NA by default) so they exist with the
## right type and length even when no row can be quantified.
compo_new_3$ftamount_quantified <- rep(NA_real_, nrow(compo_new_3))
compo_new_3$ftamount_quantified[is_quantify] <-
  compo_new_3$ftamount_standard[is_quantify] /
  compo_new_3$ref_dosage_quantity_norm[is_quantify]
compo_new_3$ftamount_quantified_unit <- rep(NA_character_, nrow(compo_new_3))
compo_new_3$ftamount_quantified_unit[is_quantify] <-
  paste(compo_new_3$ftucumunit_standard[is_quantify],
        compo_new_3$ref_dosage_name_norm[is_quantify], sep = "/")
sort(table(forms$refstrength_name)) ## besoin d'ajouter le libellé:
same_ref <- forms$refstrength_name == forms$galenique_name | prefLabels <- RomediR::loadPrefLabels()
forms$refstrength_name == forms$packages_name bool <- !compo_new_3$ftwikidata %in% prefLabels$code
sum(same_ref,na.rm = T) voir <- subset(compo_new_3, bool)
## Attach the lower-cased Wikidata label (language "en") to every row and build
## the `cdc` string: "<label> <amount> <UNIT>".
wikidataMappings <- RomediR::loadWikidataMappings()
language <- "en"
wikidata_pref_label <- RomediETL::getWikidataLabels(wikidataMappings,
                                                    language = language)
wikidata_pref_label$itemLabel <- tolower(wikidata_pref_label$itemLabel)
## Left join: rows without a Wikidata label are kept with an NA label.
compo_new_4 <- merge(compo_new_3,
                     wikidata_pref_label,
                     by.x = "ftwikidata",
                     by.y = "item",
                     all.x = TRUE)
## Rename the label column by name instead of by a hard-coded position, which
## would silently hit another column if the column count of the inputs changed.
label_col <- match("itemLabel", colnames(compo_new_4))
stopifnot(!is.na(label_col))
colnames(compo_new_4)[label_col] <- "ftenlabel"
## Recomputed on compo_new_4 because merge() does not preserve the row order.
is_quantify <- !is.na(compo_new_4$ref_dosage_quantity_norm) &
  !is.na(compo_new_4$ftamount_standard)
sum(is_quantify)
## Quantified rows use the amount per reference quantity and its compound unit;
## the other rows fall back on the standardized amount and unit.
compo_new_4$cdc <- ifelse(
  is_quantify,
  paste0(compo_new_4$ftenlabel, " ", compo_new_4$ftamount_quantified,
         " ", toupper(compo_new_4$ftamount_quantified_unit)),
  paste0(compo_new_4$ftenlabel, " ", compo_new_4$ftamount_standard,
         " ", toupper(compo_new_4$ftucumunit_standard))
)
## No label means no cdc (paste0 would otherwise have produced "NA ...").
bool <- is.na(compo_new_4$ftenlabel)
compo_new_4$cdc[bool] <- NA
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment