Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Cossin Sebastien
ehden-bordeaux-mappings
Commits
29a2d4d7
Commit
29a2d4d7
authored
Nov 07, 2021
by
Cossin Sebastien
Browse files
normalize forms and packaging
parent
b4672b78
Changes
2
Hide whitespace changes
Inline
Side-by-side
drugs/HDH/packaging.R
0 → 100644
View file @
29a2d4d7
library
(
SPARQL
)
# Load the raw material for the packaging/form parsing: one row per distinct
# combination of drug (cis), drug-level dosage form, composition-level dosage
# form, packaging label and reference strength.
ENV_VARIABLES <- drugsmapping::getENVvariables()
conn <- drugsmapping::getPostGresConnection(ENV_VARIABLES)

### Forms:
# Drugs listed in cis_homeopathy are excluded.
# NOTE(review): `NOT IN` returns no row at all if the sub-select yields a
# NULL -- presumably cis_homeopathy.cis is never NULL; confirm.
query <- "SELECT distinct
cis.cis,
drug,
cis.dosageform as drugform,
compo.dosageform,
packaging,
refstrength
FROM cis
INNER JOIN cip
on cis.cis = cip.cis
INNER JOIN compo
on cis.cis = compo.cis
where cis.cis not in (SELECT CIS FROM cis_homeopathy)"

# The connection is only needed for this single query: release it as soon as
# the query returns, and also when the query fails.
forms <- tryCatch(
  DBI::dbGetQuery(conn, query),
  finally = DBI::dbDisconnect(conn)
)
forms <- unique(forms)
# writeTable(x = forms, filename = "forms.tsv")
###################### Extract the packages ######################
########################################################################################
# Isolates the package description found in each packaging label and splits
# it into a numeric quantity and a name.

#### normalizing
# Normalise the raw label, then drop the text enclosed in parentheses.
forms[["packaging_norm"]] <- RomediETL::remove_txt_inside_parentheses(
  RomediETL::packaging_normalize(forms[["packaging"]])
)

#### Extracting
regex_packaging <- RomediETL::packaging_get_regex_to_extract_in_txt()
forms[["packages"]] <- stringr::str_extract(
  forms[["packaging_norm"]],
  regex_packaging
)

# Sanity check, printed when run interactively:
# true if everything was extracted
all(!is.na(forms[["packages"]]) | forms[["packaging"]] == "")

### Normalize quantity and names
# Leading digits of the extracted package text, as a number.
forms[["packages_quantity"]] <- as.numeric(
  stringr::str_extract(forms[["packages"]], "^[0-9]+")
)
# A package without an explicit leading count is counted as one.
no_quantity <- is.na(forms[["packages_quantity"]])
forms[["packages_quantity"]][no_quantity] <- 1

# First run of non-digit characters, without surrounding whitespace.
forms[["packages_name"]] <- trimws(
  stringr::str_extract(forms[["packages"]], "[^0-9]+")
)

# Frequency of each package name, rarest first.
sort(table(forms[["packages_name"]]))
###################### Extract the forms ######################
########################################################################################
# Extracts into `forms$galenique` the "quantity + form" text that follows the
# package description in each packaging label. Three fallbacks are tried in
# order when the main regex finds nothing.

### Remove what was extracted
# Row by row, strip the already extracted package text from the beginning of
# the normalised label.
forms$packaging_norm2 <- mapply(
  FUN = RomediETL::remove_txt_begin_by,
  char_vector = forms$packaging_norm,
  txt_to_remove = forms$packages
)
forms$dosageform_norm <- RomediETL::remove_accents_and_lower(forms$dosageform)
known_forms <- RomediETL::forms_get_list()
regex_forms <- RomediETL::forms_get_regex_to_extract_in_txt()
forms$galenique <- stringr::str_extract(
  string = forms$packaging_norm2,
  pattern = regex_forms
)

### special scenario for "delivrance à l'unité"
# Rows packaged as "plaquette" for which no form was found. The trailing
# !is.na() turns the NA produced by a missing package name into FALSE.
plaquette_failed <- forms$packages_name == "plaquette" &
  is.na(forms$galenique) &
  !is.na(forms$packages_name)
rule_2 <- plaquette_failed &
  grepl(x = forms$packaging, pattern = "délivrance à l'unité") &
  forms$dosageform_norm %in% known_forms
# Unit dispensing: one unit of the (known) dosage form.
forms$galenique[rule_2] <- paste0("1 ", forms$dosageform_norm[rule_2])

## special scenario for "par 28" (meaning par 28 comprimés)
# Recomputed because rule_2 has just filled some of the missing values.
plaquette_failed <- forms$packages_name == "plaquette" &
  is.na(forms$galenique) &
  !is.na(forms$packages_name)
ends_by_number <- "[0-9]+$"
# The number is read from the normalised remainder while the trigger looks at
# the raw label. Require the extraction itself to succeed, otherwise paste0()
# would store the literal string "NA <form>" and hide the row from the
# fallback below.
trailing_number <- stringr::str_extract(
  string = forms$packaging_norm2,
  pattern = ends_by_number
)
rule_3 <- plaquette_failed &
  grepl(x = forms$packaging, pattern = ends_by_number) &
  !is.na(trailing_number) &
  forms$dosageform_norm %in% known_forms
forms$galenique[rule_3] <- paste0(
  trailing_number[rule_3],
  " ",
  forms$dosageform_norm[rule_3]
)

#### extract the quantity
# Last resort: take whatever the generic quantity regex finds.
regex_dosage <- RomediETL::get_regex_quantity()
failed_1 <- is.na(forms$galenique)
forms$galenique[failed_1] <- stringr::str_extract(
  string = forms$packaging_norm2[failed_1],
  pattern = regex_dosage
)

# for debugging:
# Rows still without a form, and among them those whose remaining text
# contains a digit, i.e. probably a quantity that was missed.
failed_2 <- is.na(forms$galenique)
any_quantity_remain <- is.na(forms$galenique) &
  grepl("[0-9]", forms$packaging_norm2)
voir <- subset(forms, any_quantity_remain)
##### Normalizing the quantity:
# Splits `forms$galenique` into a numeric quantity and a form name.
quantity_regex <- "[0-9,.]+"
forms$galenique_quantity <- stringr::str_extract(
  string = forms$galenique,
  pattern = quantity_regex
)
# Decimal comma -> decimal point so that as.numeric() can parse the value.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
forms$galenique_quantity <- gsub(
  ",", ".", forms$galenique_quantity,
  fixed = TRUE
)
forms$galenique_quantity <- as.numeric(forms$galenique_quantity)

# First run of characters that are neither digits nor decimal separators.
forms$galenique_name <- stringr::str_extract(
  string = forms$galenique,
  pattern = "[^0-9,.]+"
)
forms$galenique_name <- trimws(forms$galenique_name)
sort(table(forms$galenique_name))

## debugging purpose:
# A row is considered consistent when the extracted name equals the
# normalised dosage form, or is one of the listed units. `voir` keeps the
# other rows; subset() drops rows where the condition is NA.
is_same_as_dose_form <- (
  (forms$galenique_name == forms$dosageform_norm) &
    !is.na(forms$galenique_name)
) |
  forms$galenique_name %in% c("l", "ml", "g", "mg")
voir <- subset(forms, !is_same_as_dose_form)
###################### Extract the reference ######################
########################################################################################
# Parses the reference strength: first as a dosage, then, for the rows where
# that fails, as a packaging or form word.
forms$refstrength_norm <- RomediETL::normalize_ref_strength(forms$refstrength)

# step1: extract dosage:
forms$ref_dosage <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = regex_dosage
)
table(forms$ref_dosage)

# step2/ packaging or form:
packaging_and_forms <- c(
  RomediETL::forms_get_list(),
  RomediETL::packaging_get_list()
)
packaging_and_forms <- unique(packaging_and_forms)
regex_packaging_and_forms <- RomediETL::create_regex_from_word_vector(
  packaging_and_forms
)
undetected <- is.na(forms$ref_dosage)
sum(undetected)

### TODO: why separate here?
# Create the column explicitly before filling part of it: sub-assigning into
# a column that does not exist relies on implicit vector extension, and on a
# second run in the same session would keep stale values in the other rows.
forms$ref_pack_forms <- rep(NA_character_, nrow(forms))
forms$ref_pack_forms[undetected] <- stringr::str_extract(
  string = forms$refstrength_norm[undetected],
  pattern = regex_packaging_and_forms
)

# Rows with a non-empty reference strength that neither step could parse.
# A missing refstrength counts as "nothing to detect": without the !is.na()
# guard the comparison yields NA, which makes sum() return NA and breaks any
# later sub-assignment indexed by this mask.
undetected <- is.na(forms$ref_dosage) &
  is.na(forms$ref_pack_forms) &
  !is.na(forms$refstrength) &
  forms$refstrength != ""
sum(undetected)
voir <- subset(forms, undetected)
voir$refstrength_norm[1]
##### Normalizing the quantity:
# Splits the normalised reference strength into a numeric quantity and a
# name, then counts how often that name matches the form or package name.
quantity_regex <- "[0-9,.]+"
forms$refstrength_quantity <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = quantity_regex
)
# Decimal comma -> decimal point so that as.numeric() can parse the value.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
forms$refstrength_quantity <- gsub(
  ",", ".", forms$refstrength_quantity,
  fixed = TRUE
)
forms$refstrength_quantity <- as.numeric(forms$refstrength_quantity)

# First run of characters that are neither digits nor decimal separators.
forms$refstrength_name <- stringr::str_extract(
  string = forms$refstrength_norm,
  pattern = "[^0-9,.]+"
)
forms$refstrength_name <- trimws(forms$refstrength_name)
sort(table(forms$refstrength_name))

## debugging purpose:
# For the rows flagged as undetected, fall back to "1 <dosage form>".
# which() drops NA entries of the mask: a logical subscript containing NA is
# an error in a sub-assignment whose replacement has more than one value.
undetected_rows <- which(undetected)
forms$refstrength_name[undetected_rows] <-
  forms$dosageform_norm[undetected_rows]
forms$refstrength_quantity[undetected_rows] <- 1
sort(table(forms$refstrength_name))

# Number of rows whose reference name equals the form name or the package
# name; comparisons involving a missing name are ignored.
same_ref <- forms$refstrength_name == forms$galenique_name |
  forms$refstrength_name == forms$packages_name
sum(same_ref, na.rm = TRUE)
drugs/java/src/main/java/fr/erias/frenchdrugs/form
s
/IFormDetector.java
→
drugs/java/src/main/java/fr/erias/frenchdrugs/form
group
/IFormDetector.java
View file @
29a2d4d7
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment